drbh HF Staff commited on
Commit
d87c146
·
verified ·
1 Parent(s): d8c3a70

Upload folder using huggingface_hub

Browse files
Files changed (46) hide show
  1. activation/impls/artifacts/benchmark/activation.jsonl +9 -9
  2. activation/impls/cells/benchmark.py +13 -7
  3. activation/impls/hf_kernels_swiglu.html +99 -99
  4. activation/impls/torch_swiglu.html +130 -124
  5. activation/results/artifacts/combine/latency.svg +1 -1
  6. activation/results/combined_results.html +79 -79
  7. causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl +24 -24
  8. causal_conv1d/impls/hf_kernels_causal_conv1d.html +0 -0
  9. causal_conv1d/impls/torch_causal_conv1d.html +0 -0
  10. causal_conv1d/results/artifacts/combine/latency.svg +2 -2
  11. causal_conv1d/results/combined_results.html +134 -134
  12. deformable_detr/impls/artifacts/benchmark/deformable_detr.jsonl +4 -4
  13. deformable_detr/impls/cells/benchmark.py +18 -94
  14. deformable_detr/impls/hf_kernels_deformable_detr.html +81 -79
  15. deformable_detr/impls/torch_deformable_detr.html +105 -99
  16. deformable_detr/results/artifacts/combine/latency.svg +2 -2
  17. deformable_detr/results/combined_results.html +232 -128
  18. flash_attn/impls/artifacts/benchmark/attention.jsonl +6 -6
  19. flash_attn/impls/cells/benchmark.py +9 -8
  20. flash_attn/impls/flash_attention.html +144 -144
  21. flash_attn/impls/hf_kernels_flash_attn.html +98 -101
  22. flash_attn/impls/hf_kernels_flash_attn3.html +89 -85
  23. flash_attn/impls/mem_efficient_attention.html +134 -134
  24. flash_attn/impls/sage_attention.html +11 -11
  25. flash_attn/impls/xformers.html +94 -94
  26. flash_attn/results/artifacts/combine/latency.svg +2 -2
  27. flash_attn/results/cells/combine.py +1 -0
  28. flash_attn/results/combined_results.html +154 -152
  29. index.html +1 -1
  30. layer_norm/impls/artifacts/benchmark/layer_norm.jsonl +4 -4
  31. layer_norm/impls/hf_kernels_layer_norm.html +54 -54
  32. layer_norm/impls/torch_layer_norm.html +56 -56
  33. layer_norm/results/artifacts/combine/latency.svg +2 -2
  34. layer_norm/results/combined_results.html +55 -55
  35. openai_moe/impls/artifacts/benchmark/openai_moe.jsonl +8 -8
  36. openai_moe/impls/binned_torch.html +186 -186
  37. openai_moe/impls/gpt_oss_moe.html +199 -197
  38. openai_moe/results/artifacts/combine/latency.svg +2 -2
  39. openai_moe/results/combined_results.html +191 -243
  40. rotary/impls/artifacts/benchmark/rotary.jsonl +24 -24
  41. rotary/impls/cells/benchmark.py +21 -12
  42. rotary/impls/hf_kernels_rotary.html +0 -0
  43. rotary/impls/torch_rotary.html +0 -0
  44. rotary/index.html +1 -1
  45. rotary/results/artifacts/combine/latency.svg +2 -2
  46. rotary/results/combined_results.html +167 -167
activation/impls/artifacts/benchmark/activation.jsonl CHANGED
@@ -1,9 +1,9 @@
1
- {"ts": "2025-11-10T22:11:36Z", "run": "b81f5729b90144f29ef4b2b3f014bb6b", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T128_D768", "num_tokens": 128, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04274000002624234, "p50": 0.043191000031583826, "p90": 0.04467100006877445, "mean": 0.04373860001578578, "iqr": 0.0017300001218245598, "raw_times": [0.04467100006877445, 0.04515000000537839, 0.043191000031583826, 0.04274000002624234, 0.04294099994694989], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04910000006930204, "peak_bytes": 1966080, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
2
- {"ts": "2025-11-10T22:11:36Z", "run": "b81f5729b90144f29ef4b2b3f014bb6b", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T128_D1024", "num_tokens": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.048549999974056846, "p50": 0.049830999842015444, "p90": 0.05033100001128332, "mean": 0.04977279995728168, "iqr": 0.0006400000529538374, "raw_times": [0.048549999974056846, 0.049690999958329485, 0.05033100001128332, 0.05046100000072329, 0.049830999842015444], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05312100006449327, "peak_bytes": 2621440, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
3
- {"ts": "2025-11-10T22:11:36Z", "run": "b81f5729b90144f29ef4b2b3f014bb6b", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T128_D2048", "num_tokens": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.049481000132800546, "p50": 0.04955999997946492, "p90": 0.04985100008525478, "mean": 0.049792600020737154, "iqr": 0.000360000058208243, "raw_times": [0.04955999997946492, 0.050579999879118986, 0.049481000132800546, 0.04985100008525478, 0.04949100002704654], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.052620999895225395, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
4
- {"ts": "2025-11-10T22:11:36Z", "run": "b81f5729b90144f29ef4b2b3f014bb6b", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T256_D768", "num_tokens": 256, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04747100001623039, "p50": 0.049561000196263194, "p90": 0.04995100016458309, "mean": 0.04936700006510364, "iqr": 0.0008900001375877764, "raw_times": [0.04747100001623039, 0.049561000196263194, 0.0507909999214462, 0.049061000026995316, 0.04995100016458309], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0509510000483715, "peak_bytes": 3932160, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
5
- {"ts": "2025-11-10T22:11:36Z", "run": "b81f5729b90144f29ef4b2b3f014bb6b", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T256_D1024", "num_tokens": 256, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04721999994217185, "p50": 0.04802100011147559, "p90": 0.048511000159123796, "mean": 0.0482608000311302, "iqr": 0.0008600002274761209, "raw_times": [0.04802100011147559, 0.04721999994217185, 0.048511000159123796, 0.0499010000112321, 0.047650999931647675], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.051911000127802254, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
6
- {"ts": "2025-11-10T22:11:37Z", "run": "b81f5729b90144f29ef4b2b3f014bb6b", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T256_D2048", "num_tokens": 256, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04553000007945229, "p50": 0.047661000053267344, "p90": 0.04845100011152681, "mean": 0.049852800020744326, "iqr": 0.0010610001481836662, "raw_times": [0.04553000007945229, 0.04738999996334314, 0.047661000053267344, 0.04845100011152681, 0.06023199989613204], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04891099979431601, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
7
- {"ts": "2025-11-10T22:11:37Z", "run": "b81f5729b90144f29ef4b2b3f014bb6b", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T512_D768", "num_tokens": 512, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04606099992088275, "p50": 0.04722100015897013, "p90": 0.047730999995110324, "mean": 0.04745279998132901, "iqr": 0.0006210000265127746, "raw_times": [0.04606099992088275, 0.04914099986308429, 0.04722100015897013, 0.047730999995110324, 0.04710999996859755], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05060099988440925, "peak_bytes": 7864320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
8
- {"ts": "2025-11-10T22:11:37Z", "run": "b81f5729b90144f29ef4b2b3f014bb6b", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T512_D1024", "num_tokens": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.047480999910476385, "p50": 0.04807099981007923, "p90": 0.04905100013274932, "mean": 0.049742999999580206, "iqr": 0.0014700001429446274, "raw_times": [0.047480999910476385, 0.047580999989804695, 0.04905100013274932, 0.0565310001547914, 0.04807099981007923], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04896100017504068, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
9
- {"ts": "2025-11-10T22:11:37Z", "run": "b81f5729b90144f29ef4b2b3f014bb6b", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T512_D2048", "num_tokens": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.046829999973851955, "p50": 0.04784099996868463, "p90": 0.0479610000638786, "mean": 0.047636799990868894, "iqr": 0.001030000021273736, "raw_times": [0.046829999973851955, 0.048620999905324425, 0.046931000042604865, 0.0479610000638786, 0.04784099996868463], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05104100000608014, "peak_bytes": 20971520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
 
1
+ {"ts": "2025-12-19T19:09:36Z", "run": "8f147eb9175a4bb285586e9acc883123", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D768", "num_tokens": 128, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.024160000066331122, "p50": 0.024919999987105257, "p90": 0.025289999939559493, "mean": 0.025252000023101573, "iqr": 0.0006499999471998308, "raw_times": [0.025289999939559493, 0.02725000013015233, 0.024639999992359662, 0.024919999987105257, 0.024160000066331122], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.030839999908494065, "peak_bytes": 1966080, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
2
+ {"ts": "2025-12-19T19:09:36Z", "run": "8f147eb9175a4bb285586e9acc883123", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D1024", "num_tokens": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.027569999929255573, "p50": 0.029069999982311856, "p90": 0.029229999881863478, "mean": 0.029034399949523504, "iqr": 0.0008489998890581774, "raw_times": [0.027569999929255573, 0.030920999961381312, 0.0283809999928053, 0.029229999881863478, 0.029069999982311856], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03184999991390214, "peak_bytes": 2621440, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
3
+ {"ts": "2025-12-19T19:09:36Z", "run": "8f147eb9175a4bb285586e9acc883123", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D2048", "num_tokens": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0278800000614865, "p50": 0.02896099999816215, "p90": 0.029151000035199104, "mean": 0.02896060000239231, "iqr": 0.0004910000370728085, "raw_times": [0.028659999998126295, 0.03015099991898751, 0.029151000035199104, 0.02896099999816215, 0.0278800000614865], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03205000007255876, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
4
+ {"ts": "2025-12-19T19:09:36Z", "run": "8f147eb9175a4bb285586e9acc883123", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D768", "num_tokens": 256, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.027180999950360274, "p50": 0.028851000024587847, "p90": 0.029309999945326126, "mean": 0.02889839993258647, "iqr": 0.000470000031782547, "raw_times": [0.027180999950360274, 0.028851000024587847, 0.02883999991354358, 0.03030999982911453, 0.029309999945326126], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.030620000188719132, "peak_bytes": 3932160, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
5
+ {"ts": "2025-12-19T19:09:36Z", "run": "8f147eb9175a4bb285586e9acc883123", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D1024", "num_tokens": 256, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.027590999934545835, "p50": 0.028819999897677917, "p90": 0.02953100010927301, "mean": 0.02878659997804789, "iqr": 0.0017000002117129043, "raw_times": [0.027590999934545835, 0.02953100010927301, 0.027830999897560105, 0.03016000005118258, 0.028819999897677917], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.031159999934970983, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
6
+ {"ts": "2025-12-19T19:09:36Z", "run": "8f147eb9175a4bb285586e9acc883123", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D2048", "num_tokens": 256, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.026730000172392465, "p50": 0.028800999871236854, "p90": 0.02885999992940924, "mean": 0.028368599987516063, "iqr": 0.0005089998467155965, "raw_times": [0.026730000172392465, 0.02885999992940924, 0.02910099988184811, 0.028800999871236854, 0.028351000082693645], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.030940999977246975, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
7
+ {"ts": "2025-12-19T19:09:36Z", "run": "8f147eb9175a4bb285586e9acc883123", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D768", "num_tokens": 512, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02737999989221862, "p50": 0.0283800000033807, "p90": 0.02853099999811093, "mean": 0.028162599983261316, "iqr": 0.0007899998308857903, "raw_times": [0.02737999989221862, 0.0283800000033807, 0.028780999855371192, 0.02774100016722514, 0.02853099999811093], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.034010999797828845, "peak_bytes": 7864320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
8
+ {"ts": "2025-12-19T19:09:36Z", "run": "8f147eb9175a4bb285586e9acc883123", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D1024", "num_tokens": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02824100010911934, "p50": 0.028820000125051592, "p90": 0.02886099991883384, "mean": 0.029222400007711258, "iqr": 0.00022099993657320738, "raw_times": [0.02824100010911934, 0.02886099991883384, 0.028639999982260633, 0.03154999990329088, 0.028820000125051592], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.029901000061727245, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
9
+ {"ts": "2025-12-19T19:09:36Z", "run": "8f147eb9175a4bb285586e9acc883123", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D2048", "num_tokens": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02627100002428051, "p50": 0.02855000002455199, "p90": 0.02863100007743924, "mean": 0.028174600083730184, "iqr": 0.0002599999788799323, "raw_times": [0.02627100002428051, 0.028371000098559307, 0.02863100007743924, 0.02905000019381987, 0.02855000002455199], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.02980999988722033, "peak_bytes": 20971520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
activation/impls/cells/benchmark.py CHANGED
@@ -4,6 +4,7 @@
4
  # "numpy",
5
  # "torch==2.8.0",
6
  # "kernels-benchmark-tools",
 
7
  # ]
8
  #
9
  # [tool.uv.sources]
@@ -12,17 +13,22 @@
12
  import torch
13
  import sys
14
  from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
15
- import torch, torch.nn.functional as F
16
 
 
 
17
 
18
- def swiglu_eager(x):
19
- d = x.shape[-1] // 2
20
- return F.silu(x[..., :d]) * x[..., d:]
 
 
 
21
 
22
 
23
  run_benchmark(
24
  kernel_type=KernelTypeEnum.ACTIVATION,
25
- impl_name="torch_eager",
26
- impl_tags={"family":"hf-kernels", "backend":"eager"},
27
- impl_func=swiglu_eager,
28
  )
 
4
  # "numpy",
5
  # "torch==2.8.0",
6
  # "kernels-benchmark-tools",
7
+ # "kernels",
8
  # ]
9
  #
10
  # [tool.uv.sources]
 
13
  import torch
14
  import sys
15
  from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
16
+ from kernels import get_kernel
17
 
18
+ # Load the activation kernel
19
+ activation = get_kernel("kernels-community/activation")
20
 
21
+
22
+ def hf_kernels_swiglu(input_tensor):
23
+ hidden_dim = input_tensor.shape[-1] // 2
24
+ out_shape = input_tensor.shape[:-1] + (hidden_dim,)
25
+ out = torch.empty(out_shape, dtype=input_tensor.dtype, device=input_tensor.device)
26
+ return activation.silu_and_mul(out, input_tensor)
27
 
28
 
29
  run_benchmark(
30
  kernel_type=KernelTypeEnum.ACTIVATION,
31
+ impl_name="hf_kernels_swiglu",
32
+ impl_tags={"family": "hf-kernels", "backend": "cuda"},
33
+ impl_func=hf_kernels_swiglu,
34
  )
activation/impls/hf_kernels_swiglu.html CHANGED
@@ -3874,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
3874
  <div class="system-info">
3875
  <div class="system-info-header">Generated on:</div>
3876
  <div class="system-info-content">
3877
- Linux x86_64 | Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35
3878
  </div>
3879
  </div>
3880
 
@@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content {
3888
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3890
  </span> |
3891
- Cell: nv | 0.22s
3892
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3894
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3905,16 +3905,16 @@ Cell: nv | 0.22s
3905
  </div>
3906
  </div>
3907
  <div id="output-nv" class="cell-output">
3908
- <div class="cell-stdout"><pre class="stdout-text">Mon Nov 10 21:58:08 2025
3909
  +-----------------------------------------------------------------------------------------+
3910
- | NVIDIA-SMI 580.95.05 Driver Version: 580.95.05 CUDA Version: 13.0 |
3911
  +-----------------------------------------+------------------------+----------------------+
3912
  | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
3913
  | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
3914
  | | | MIG M. |
3915
  |=========================================+========================+======================|
3916
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3917
- | N/A 28C P0 78W / 350W | 0MiB / 46068MiB | 11% Default |
3918
  | | | N/A |
3919
  +-----------------------------------------+------------------------+----------------------+
3920
 
@@ -3938,7 +3938,7 @@ Cell: nv | 0.22s
3938
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3939
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3940
  </span> |
3941
- Cell: benchmark | 8.29s
3942
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3943
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3944
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3995,17 +3995,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D768
3995
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3996
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3997
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3998
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 81.151us 1892.96% 81.151us 81.151us 1
3999
- hf_kernels_swiglu 8.90% 185.545us 99.31% 2.071ms 2.071ms 0.000us 0.00% 5.727us 5.727us 1
4000
- _activation_beeaae6::silu_and_mul 0.90% 18.858us 88.30% 1.842ms 613.846us 4.287us 100.00% 5.727us 1.909us 3
4001
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.287us 100.00% 4.287us 1.429us 3
4002
- Activity Buffer Request 85.28% 1.779ms 85.28% 1.779ms 1.779ms 1.440us 33.59% 1.440us 1.440us 1
4003
- aten::empty 2.11% 44.080us 2.11% 44.080us 14.693us 0.000us 0.00% 0.000us 0.000us 3
4004
- cudaLaunchKernel 2.11% 44.091us 2.11% 44.091us 14.697us 0.000us 0.00% 0.000us 0.000us 3
4005
- cudaDeviceSynchronize 0.69% 14.370us 0.69% 14.370us 14.370us 0.000us 0.00% 0.000us 0.000us 1
4006
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4007
- Self CPU time total: 2.086ms
4008
- Self CUDA time total: 4.287us
4009
 
4010
 
4011
 
@@ -4015,17 +4015,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D1024
4015
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4016
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4017
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4018
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 65.344us 1660.16% 65.344us 65.344us 1
4019
- hf_kernels_swiglu 4.80% 90.161us 99.69% 1.871ms 1.871ms 0.000us 0.00% 5.280us 5.280us 1
4020
- _activation_beeaae6::silu_and_mul 1.05% 19.620us 93.88% 1.762ms 587.343us 3.936us 100.00% 5.280us 1.760us 3
4021
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 3.936us 100.00% 3.936us 1.312us 3
4022
- Activity Buffer Request 91.30% 1.714ms 91.30% 1.714ms 1.714ms 1.344us 34.15% 1.344us 1.344us 1
4023
- aten::empty 1.01% 18.871us 1.01% 18.871us 6.290us 0.000us 0.00% 0.000us 0.000us 3
4024
- cudaLaunchKernel 1.53% 28.801us 1.53% 28.801us 9.600us 0.000us 0.00% 0.000us 0.000us 3
4025
- cudaDeviceSynchronize 0.31% 5.880us 0.31% 5.880us 5.880us 0.000us 0.00% 0.000us 0.000us 1
4026
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4027
- Self CPU time total: 1.877ms
4028
- Self CUDA time total: 3.936us
4029
 
4030
 
4031
 
@@ -4035,17 +4035,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D2048
4035
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4036
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4037
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4038
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 67.967us 1388.21% 67.967us 67.967us 1
4039
- hf_kernels_swiglu 4.59% 88.711us 99.72% 1.927ms 1.927ms 0.000us 0.00% 6.560us 6.560us 1
4040
- _activation_beeaae6::silu_and_mul 0.94% 18.080us 94.11% 1.819ms 606.193us 4.896us 100.00% 6.560us 2.187us 3
4041
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.896us 100.00% 4.896us 1.632us 3
4042
- Activity Buffer Request 91.80% 1.774ms 91.80% 1.774ms 1.774ms 1.664us 33.99% 1.664us 1.664us 1
4043
- aten::empty 1.02% 19.730us 1.02% 19.730us 6.577us 0.000us 0.00% 0.000us 0.000us 3
4044
- cudaLaunchKernel 1.37% 26.441us 1.37% 26.441us 8.814us 0.000us 0.00% 0.000us 0.000us 3
4045
- cudaDeviceSynchronize 0.28% 5.470us 0.28% 5.470us 5.470us 0.000us 0.00% 0.000us 0.000us 1
4046
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4047
- Self CPU time total: 1.932ms
4048
- Self CUDA time total: 4.896us
4049
 
4050
 
4051
 
@@ -4055,17 +4055,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D768
4055
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4056
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4057
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4058
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 68.448us 1584.08% 68.448us 68.448us 1
4059
- hf_kernels_swiglu 4.10% 87.981us 99.77% 2.141ms 2.141ms 0.000us 0.00% 5.794us 5.794us 1
4060
- _activation_beeaae6::silu_and_mul 0.89% 19.190us 94.80% 2.034ms 678.097us 4.321us 100.00% 5.794us 1.931us 3
4061
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.321us 100.00% 4.321us 1.440us 3
4062
- Activity Buffer Request 83.35% 1.789ms 83.35% 1.789ms 1.789ms 1.473us 34.09% 1.473us 1.473us 1
4063
- aten::empty 0.87% 18.670us 0.87% 18.670us 6.223us 0.000us 0.00% 0.000us 0.000us 3
4064
- cudaLaunchKernel 10.55% 226.443us 10.55% 226.443us 75.481us 0.000us 0.00% 0.000us 0.000us 3
4065
- cudaDeviceSynchronize 0.23% 4.930us 0.23% 4.930us 4.930us 0.000us 0.00% 0.000us 0.000us 1
4066
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4067
- Self CPU time total: 2.146ms
4068
- Self CUDA time total: 4.321us
4069
 
4070
 
4071
 
@@ -4075,17 +4075,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D1024
4075
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4076
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4077
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4078
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 66.816us 1128.46% 66.816us 66.816us 1
4079
- hf_kernels_swiglu 4.29% 87.791us 99.73% 2.043ms 2.043ms 0.000us 0.00% 7.906us 7.906us 1
4080
- _activation_beeaae6::silu_and_mul 1.03% 21.101us 94.53% 1.936ms 645.491us 5.921us 100.00% 7.906us 2.635us 3
4081
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 5.921us 100.00% 5.921us 1.974us 3
4082
- Activity Buffer Request 84.88% 1.739ms 84.88% 1.739ms 1.739ms 1.985us 33.52% 1.985us 1.985us 1
4083
- aten::empty 0.92% 18.779us 0.92% 18.779us 6.260us 0.000us 0.00% 0.000us 0.000us 3
4084
- cudaLaunchKernel 8.62% 176.604us 8.62% 176.604us 58.868us 0.000us 0.00% 0.000us 0.000us 3
4085
- cudaDeviceSynchronize 0.27% 5.500us 0.27% 5.500us 5.500us 0.000us 0.00% 0.000us 0.000us 1
4086
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4087
- Self CPU time total: 2.049ms
4088
- Self CUDA time total: 5.921us
4089
 
4090
 
4091
 
@@ -4095,17 +4095,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D2048
4095
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4096
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4097
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4098
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 63.807us 824.06% 63.807us 63.807us 1
4099
- hf_kernels_swiglu 17.99% 83.441us 98.85% 458.487us 458.487us 0.000us 0.00% 10.335us 10.335us 1
4100
- _activation_beeaae6::silu_and_mul 4.27% 19.820us 76.93% 356.816us 118.939us 7.743us 100.00% 10.335us 3.445us 3
4101
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 7.743us 100.00% 7.743us 2.581us 3
4102
- Activity Buffer Request 37.06% 171.903us 37.06% 171.903us 171.903us 2.592us 33.48% 2.592us 2.592us 1
4103
- aten::empty 3.93% 18.230us 3.93% 18.230us 6.077us 0.000us 0.00% 0.000us 0.000us 3
4104
- cudaLaunchKernel 35.60% 165.093us 35.60% 165.093us 55.031us 0.000us 0.00% 0.000us 0.000us 3
4105
- cudaDeviceSynchronize 1.15% 5.320us 1.15% 5.320us 5.320us 0.000us 0.00% 0.000us 0.000us 1
4106
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4107
- Self CPU time total: 463.807us
4108
- Self CUDA time total: 7.743us
4109
 
4110
 
4111
 
@@ -4115,17 +4115,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D768
4115
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4116
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4117
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4118
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 63.231us 959.06% 63.231us 63.231us 1
4119
- hf_kernels_swiglu 19.32% 83.900us 98.89% 429.436us 429.436us 0.000us 0.00% 8.802us 8.802us 1
4120
- _activation_beeaae6::silu_and_mul 4.57% 19.830us 75.32% 327.085us 109.028us 6.593us 100.00% 8.802us 2.934us 3
4121
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 6.593us 100.00% 6.593us 2.198us 3
4122
- Activity Buffer Request 34.73% 150.793us 34.73% 150.793us 150.793us 2.209us 33.51% 2.209us 2.209us 1
4123
- aten::empty 4.25% 18.451us 4.25% 18.451us 6.150us 0.000us 0.00% 0.000us 0.000us 3
4124
- cudaLaunchKernel 36.03% 156.462us 36.03% 156.462us 52.154us 0.000us 0.00% 0.000us 0.000us 3
4125
- cudaDeviceSynchronize 1.11% 4.800us 1.11% 4.800us 4.800us 0.000us 0.00% 0.000us 0.000us 1
4126
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4127
- Self CPU time total: 434.236us
4128
- Self CUDA time total: 6.593us
4129
 
4130
 
4131
 
@@ -4135,17 +4135,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D1024
4135
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4136
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4137
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4138
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 68.544us 726.10% 68.544us 68.544us 1
4139
- hf_kernels_swiglu 4.25% 86.402us 99.73% 2.027ms 2.027ms 0.000us 0.00% 12.608us 12.608us 1
4140
- _activation_beeaae6::silu_and_mul 1.00% 20.252us 94.52% 1.921ms 640.494us 9.440us 100.00% 12.608us 4.203us 3
4141
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 9.440us 100.00% 9.440us 3.147us 3
4142
- Activity Buffer Request 85.77% 1.743ms 85.77% 1.743ms 1.743ms 3.168us 33.56% 3.168us 3.168us 1
4143
- aten::empty 0.96% 19.489us 0.96% 19.489us 6.496us 0.000us 0.00% 0.000us 0.000us 3
4144
- cudaLaunchKernel 7.76% 157.752us 7.76% 157.752us 52.584us 0.000us 0.00% 0.000us 0.000us 3
4145
- cudaDeviceSynchronize 0.27% 5.440us 0.27% 5.440us 5.440us 0.000us 0.00% 0.000us 0.000us 1
4146
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4147
- Self CPU time total: 2.033ms
4148
- Self CUDA time total: 9.440us
4149
 
4150
 
4151
 
@@ -4155,17 +4155,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D2048
4155
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4156
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4157
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4158
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 61.247us 467.96% 61.247us 61.247us 1
4159
- hf_kernels_swiglu 19.95% 80.811us 98.74% 399.916us 399.916us 0.000us 0.00% 17.504us 17.504us 1
4160
- _activation_beeaae6::silu_and_mul 4.55% 18.440us 74.43% 301.465us 100.488us 13.088us 100.00% 17.504us 5.835us 3
4161
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 13.088us 100.00% 13.088us 4.363us 3
4162
- Activity Buffer Request 32.08% 129.932us 32.08% 129.932us 129.932us 4.416us 33.74% 4.416us 4.416us 1
4163
- aten::empty 4.36% 17.640us 4.36% 17.640us 5.880us 0.000us 0.00% 0.000us 0.000us 3
4164
- cudaLaunchKernel 37.80% 153.093us 37.80% 153.093us 51.031us 0.000us 0.00% 0.000us 0.000us 3
4165
- cudaDeviceSynchronize 1.26% 5.090us 1.26% 5.090us 5.090us 0.000us 0.00% 0.000us 0.000us 1
4166
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4167
- Self CPU time total: 405.006us
4168
- Self CUDA time total: 13.088us
4169
 
4170
 
4171
  impl wl p50(ms) ok
@@ -4182,12 +4182,12 @@ hf_kernels_swiglu cuda_T512_D768 0.03 True
4182
  <div class="uv-install-logs" id="uv-logs-benchmark">
4183
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4184
  <div class="uv-logs-content" style="display: none;">
4185
- Installed 52 packages in 291ms
4186
  </div>
4187
  </div>
4188
  <div class="cell-stderr">Fetching 7 files: 0%| | 0/7 [00:00&lt;?, ?it/s]
4189
- Fetching 7 files: 71%|███████▏ | 5/7 [00:00&lt;00:00, 12.91it/s]
4190
- Fetching 7 files: 100%|██████████| 7/7 [00:00&lt;00:00, 18.06it/s]</div>
4191
  <div class="cell-artifacts">
4192
  <h4>Artifacts:</h4>
4193
  <a href="artifacts/benchmark/activation.jsonl" class="artifact" target="_blank">activation.jsonl</a>
 
3874
  <div class="system-info">
3875
  <div class="system-info-header">Generated on:</div>
3876
  <div class="system-info-content">
3877
+ Linux x86_64 | Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35
3878
  </div>
3879
  </div>
3880
 
 
3888
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3890
  </span> |
3891
+ Cell: nv | 0.24s
3892
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3894
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
 
3905
  </div>
3906
  </div>
3907
  <div id="output-nv" class="cell-output">
3908
+ <div class="cell-stdout"><pre class="stdout-text">Fri Dec 19 18:56:03 2025
3909
  +-----------------------------------------------------------------------------------------+
3910
+ | NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 |
3911
  +-----------------------------------------+------------------------+----------------------+
3912
  | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
3913
  | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
3914
  | | | MIG M. |
3915
  |=========================================+========================+======================|
3916
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3917
+ | N/A 30C P0 77W / 350W | 0MiB / 46068MiB | 10% Default |
3918
  | | | N/A |
3919
  +-----------------------------------------+------------------------+----------------------+
3920
 
 
3938
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3939
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3940
  </span> |
3941
+ Cell: benchmark | 4.62s
3942
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3943
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3944
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3995
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3996
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3997
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3998
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 72.704us 1747.69% 72.704us 72.704us 1
3999
+ hf_kernels_swiglu 10.22% 211.154us 99.32% 2.053ms 2.053ms 0.000us 0.00% 5.600us 5.600us 1
4000
+ _activation_23bf3fb::silu_and_mul 1.00% 20.580us 87.11% 1.800ms 600.140us 4.160us 100.00% 5.600us 1.867us 3
4001
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.160us 100.00% 4.160us 1.387us 3
4002
+ Activity Buffer Request 84.13% 1.739ms 84.13% 1.739ms 1.739ms 1.440us 34.62% 1.440us 1.440us 1
4003
+ aten::empty 1.99% 41.071us 1.99% 41.071us 13.690us 0.000us 0.00% 0.000us 0.000us 3
4004
+ cudaLaunchKernel 1.99% 41.111us 1.99% 41.111us 13.704us 0.000us 0.00% 0.000us 0.000us 3
4005
+ cudaDeviceSynchronize 0.68% 14.100us 0.68% 14.100us 14.100us 0.000us 0.00% 0.000us 0.000us 1
4006
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4007
+ Self CPU time total: 2.067ms
4008
+ Self CUDA time total: 4.160us
4009
 
4010
 
4011
 
 
4015
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4016
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4017
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4018
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 105.278us 2696.67% 105.278us 105.278us 1
4019
+ hf_kernels_swiglu 7.13% 139.913us 99.69% 1.957ms 1.957ms 0.000us 0.00% 5.216us 5.216us 1
4020
+ _activation_23bf3fb::silu_and_mul 1.22% 23.859us 91.38% 1.794ms 598.043us 3.904us 100.00% 5.216us 1.739us 3
4021
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 3.904us 100.00% 3.904us 1.301us 3
4022
+ Activity Buffer Request 88.47% 1.737ms 88.47% 1.737ms 1.737ms 1.312us 33.61% 1.312us 1.312us 1
4023
+ aten::empty 1.19% 23.420us 1.19% 23.420us 7.807us 0.000us 0.00% 0.000us 0.000us 3
4024
+ cudaLaunchKernel 1.70% 33.281us 1.70% 33.281us 11.094us 0.000us 0.00% 0.000us 0.000us 3
4025
+ cudaDeviceSynchronize 0.31% 6.000us 0.31% 6.000us 6.000us 0.000us 0.00% 0.000us 0.000us 1
4026
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4027
+ Self CPU time total: 1.963ms
4028
+ Self CUDA time total: 3.904us
4029
 
4030
 
4031
 
 
4035
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4036
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4037
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4038
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 62.849us 1275.09% 62.849us 62.849us 1
4039
+ hf_kernels_swiglu 5.51% 105.232us 99.72% 1.903ms 1.903ms 0.000us 0.00% 6.594us 6.594us 1
4040
+ _activation_23bf3fb::silu_and_mul 1.04% 19.839us 93.23% 1.779ms 593.100us 4.929us 100.00% 6.594us 2.198us 3
4041
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.929us 100.00% 4.929us 1.643us 3
4042
+ Activity Buffer Request 90.86% 1.734ms 90.86% 1.734ms 1.734ms 1.665us 33.78% 1.665us 1.665us 1
4043
+ aten::empty 0.98% 18.730us 0.98% 18.730us 6.243us 0.000us 0.00% 0.000us 0.000us 3
4044
+ cudaLaunchKernel 1.33% 25.362us 1.33% 25.362us 8.454us 0.000us 0.00% 0.000us 0.000us 3
4045
+ cudaDeviceSynchronize 0.28% 5.330us 0.28% 5.330us 5.330us 0.000us 0.00% 0.000us 0.000us 1
4046
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4047
+ Self CPU time total: 1.909ms
4048
+ Self CUDA time total: 4.929us
4049
 
4050
 
4051
 
 
4055
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4056
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4057
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4058
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 64.512us 1515.79% 64.512us 64.512us 1
4059
+ hf_kernels_swiglu 5.00% 107.783us 99.78% 2.152ms 2.152ms 0.000us 0.00% 5.696us 5.696us 1
4060
+ _activation_23bf3fb::silu_and_mul 0.93% 20.060us 93.90% 2.025ms 675.114us 4.256us 100.00% 5.696us 1.899us 3
4061
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.256us 100.00% 4.256us 1.419us 3
4062
+ Activity Buffer Request 82.83% 1.787ms 82.83% 1.787ms 1.787ms 1.440us 33.83% 1.440us 1.440us 1
4063
+ aten::empty 0.89% 19.099us 0.89% 19.099us 6.366us 0.000us 0.00% 0.000us 0.000us 3
4064
+ cudaLaunchKernel 10.14% 218.744us 10.14% 218.744us 72.915us 0.000us 0.00% 0.000us 0.000us 3
4065
+ cudaDeviceSynchronize 0.22% 4.671us 0.22% 4.671us 4.671us 0.000us 0.00% 0.000us 0.000us 1
4066
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4067
+ Self CPU time total: 2.157ms
4068
+ Self CUDA time total: 4.256us
4069
 
4070
 
4071
 
 
4075
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4076
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4077
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4078
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 60.319us 1029.86% 60.319us 60.319us 1
4079
+ hf_kernels_swiglu 13.59% 83.190us 99.22% 607.209us 607.209us 0.000us 0.00% 7.809us 7.809us 1
4080
+ _activation_23bf3fb::silu_and_mul 3.33% 20.351us 82.60% 505.509us 168.503us 5.857us 100.00% 7.809us 2.603us 3
4081
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 5.857us 100.00% 5.857us 1.952us 3
4082
+ Activity Buffer Request 46.13% 282.314us 46.13% 282.314us 282.314us 1.952us 33.33% 1.952us 1.952us 1
4083
+ aten::empty 3.02% 18.510us 3.02% 18.510us 6.170us 0.000us 0.00% 0.000us 0.000us 3
4084
+ cudaLaunchKernel 33.14% 202.844us 33.14% 202.844us 67.615us 0.000us 0.00% 0.000us 0.000us 3
4085
+ cudaDeviceSynchronize 0.78% 4.791us 0.78% 4.791us 4.791us 0.000us 0.00% 0.000us 0.000us 1
4086
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4087
+ Self CPU time total: 612.000us
4088
+ Self CUDA time total: 5.857us
4089
 
4090
 
4091
 
 
4095
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4096
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4097
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4098
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 69.087us 899.57% 69.087us 69.087us 1
4099
+ hf_kernels_swiglu 5.09% 105.021us 99.75% 2.059ms 2.059ms 0.000us 0.00% 10.240us 10.240us 1
4100
+ _activation_23bf3fb::silu_and_mul 0.96% 19.861us 93.70% 1.934ms 644.594us 7.680us 100.00% 10.240us 3.413us 3
4101
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 7.680us 100.00% 7.680us 2.560us 3
4102
+ Activity Buffer Request 83.16% 1.716ms 83.16% 1.716ms 1.716ms 2.560us 33.33% 2.560us 2.560us 1
4103
+ aten::empty 0.96% 19.840us 0.96% 19.840us 6.613us 0.000us 0.00% 0.000us 0.000us 3
4104
+ cudaLaunchKernel 9.57% 197.533us 9.57% 197.533us 65.844us 0.000us 0.00% 0.000us 0.000us 3
4105
+ cudaDeviceSynchronize 0.25% 5.209us 0.25% 5.209us 5.209us 0.000us 0.00% 0.000us 0.000us 1
4106
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4107
+ Self CPU time total: 2.064ms
4108
+ Self CUDA time total: 7.680us
4109
 
4110
 
4111
 
 
4115
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4116
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4117
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4118
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 63.615us 969.59% 63.615us 63.615us 1
4119
+ hf_kernels_swiglu 4.67% 99.430us 99.78% 2.123ms 2.123ms 0.000us 0.00% 8.769us 8.769us 1
4120
+ _activation_23bf3fb::silu_and_mul 0.94% 19.910us 94.25% 2.005ms 668.341us 6.561us 100.00% 8.769us 2.923us 3
4121
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 6.561us 100.00% 6.561us 2.187us 3
4122
+ Activity Buffer Request 84.26% 1.793ms 84.26% 1.793ms 1.793ms 2.208us 33.65% 2.208us 2.208us 1
4123
+ aten::empty 0.86% 18.221us 0.86% 18.221us 6.074us 0.000us 0.00% 0.000us 0.000us 3
4124
+ cudaLaunchKernel 9.05% 192.544us 9.05% 192.544us 64.181us 0.000us 0.00% 0.000us 0.000us 3
4125
+ cudaDeviceSynchronize 0.22% 4.771us 0.22% 4.771us 4.771us 0.000us 0.00% 0.000us 0.000us 1
4126
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4127
+ Self CPU time total: 2.127ms
4128
+ Self CUDA time total: 6.561us
4129
 
4130
 
4131
 
 
4135
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4136
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4137
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4138
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 58.655us 627.73% 58.655us 58.655us 1
4139
+ hf_kernels_swiglu 14.96% 80.683us 99.03% 533.948us 533.948us 0.000us 0.00% 12.480us 12.480us 1
4140
+ _activation_23bf3fb::silu_and_mul 3.95% 21.299us 80.75% 435.406us 145.135us 9.344us 100.00% 12.480us 4.160us 3
4141
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 9.344us 100.00% 9.344us 3.115us 3
4142
+ Activity Buffer Request 41.04% 221.264us 41.04% 221.264us 221.264us 3.136us 33.56% 3.136us 3.136us 1
4143
+ aten::empty 3.31% 17.859us 3.31% 17.859us 5.953us 0.000us 0.00% 0.000us 0.000us 3
4144
+ cudaLaunchKernel 35.77% 192.843us 35.77% 192.843us 64.281us 0.000us 0.00% 0.000us 0.000us 3
4145
+ cudaDeviceSynchronize 0.97% 5.240us 0.97% 5.240us 5.240us 0.000us 0.00% 0.000us 0.000us 1
4146
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4147
+ Self CPU time total: 539.188us
4148
+ Self CUDA time total: 9.344us
4149
 
4150
 
4151
 
 
4155
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4156
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4157
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4158
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 60.863us 469.62% 60.863us 60.863us 1
4159
+ hf_kernels_swiglu 16.50% 95.821us 99.18% 576.059us 576.059us 0.000us 0.00% 17.312us 17.312us 1
4160
+ _activation_23bf3fb::silu_and_mul 3.50% 20.301us 79.69% 462.858us 154.286us 12.960us 100.00% 17.312us 5.771us 3
4161
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 12.960us 100.00% 12.960us 4.320us 3
4162
+ Activity Buffer Request 43.18% 250.794us 43.18% 250.794us 250.794us 4.352us 33.58% 4.352us 4.352us 1
4163
+ aten::empty 2.99% 17.380us 2.99% 17.380us 5.793us 0.000us 0.00% 0.000us 0.000us 3
4164
+ cudaLaunchKernel 33.01% 191.763us 33.01% 191.763us 63.921us 0.000us 0.00% 0.000us 0.000us 3
4165
+ cudaDeviceSynchronize 0.82% 4.790us 0.82% 4.790us 4.790us 0.000us 0.00% 0.000us 0.000us 1
4166
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4167
+ Self CPU time total: 580.849us
4168
+ Self CUDA time total: 12.960us
4169
 
4170
 
4171
  impl wl p50(ms) ok
 
4182
  <div class="uv-install-logs" id="uv-logs-benchmark">
4183
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4184
  <div class="uv-logs-content" style="display: none;">
4185
+ Installed 14 packages in 12ms
4186
  </div>
4187
  </div>
4188
  <div class="cell-stderr">Fetching 7 files: 0%| | 0/7 [00:00&lt;?, ?it/s]
4189
+ Fetching 7 files: 14%|█▍ | 1/7 [00:00&lt;00:02, 3.00it/s]
4190
+ Fetching 7 files: 100%|██████████| 7/7 [00:00&lt;00:00, 17.15it/s]</div>
4191
  <div class="cell-artifacts">
4192
  <h4>Artifacts:</h4>
4193
  <a href="artifacts/benchmark/activation.jsonl" class="artifact" target="_blank">activation.jsonl</a>
activation/impls/torch_swiglu.html CHANGED
@@ -3874,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
3874
  <div class="system-info">
3875
  <div class="system-info-header">Generated on:</div>
3876
  <div class="system-info-content">
3877
- Linux x86_64 | Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35
3878
  </div>
3879
  </div>
3880
 
@@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content {
3888
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3890
  </span> |
3891
- Cell: nv | 0.22s
3892
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3894
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3904,16 +3904,16 @@ Cell: nv | 0.22s
3904
  </div>
3905
  </div>
3906
  <div id="output-nv" class="cell-output">
3907
- <div class="cell-stdout"><pre class="stdout-text">Mon Nov 10 21:58:08 2025
3908
  +-----------------------------------------------------------------------------------------+
3909
- | NVIDIA-SMI 580.95.05 Driver Version: 580.95.05 CUDA Version: 13.0 |
3910
  +-----------------------------------------+------------------------+----------------------+
3911
  | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
3912
  | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
3913
  | | | MIG M. |
3914
  |=========================================+========================+======================|
3915
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3916
- | N/A 28C P0 78W / 350W | 0MiB / 46068MiB | 11% Default |
3917
  | | | N/A |
3918
  +-----------------------------------------+------------------------+----------------------+
3919
 
@@ -3935,9 +3935,9 @@ Cell: nv | 0.22s
3935
  <span class="collapse-indicators">
3936
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3937
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3938
- <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3939
  </span> |
3940
- Cell: benchmark | 3.61s
3941
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3942
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3943
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3987,20 +3987,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D768
3987
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3988
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3989
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3990
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 172.065us 1357.73% 172.065us 172.065us 1
3991
- torch_eager 8.84% 192.611us 99.34% 2.164ms 2.164ms 0.000us 0.00% 14.977us 14.977us 1
3992
- aten::silu 2.51% 54.611us 85.85% 1.870ms 623.473us 6.496us 51.26% 8.800us 2.933us 3
3993
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.496us 51.26% 6.496us 2.165us 3
3994
- aten::mul 1.45% 31.541us 2.42% 52.781us 17.594us 6.177us 48.74% 6.177us 2.059us 3
3995
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.177us 48.74% 6.177us 2.059us 3
3996
- Activity Buffer Request 81.30% 1.771ms 81.30% 1.771ms 1.771ms 2.304us 18.18% 2.304us 2.304us 1
3997
- aten::slice 1.79% 39.021us 2.23% 48.532us 8.089us 0.000us 0.00% 0.000us 0.000us 6
3998
- aten::as_strided 0.44% 9.511us 0.44% 9.511us 1.585us 0.000us 0.00% 0.000us 0.000us 6
3999
- cudaLaunchKernel 3.01% 65.621us 3.01% 65.621us 10.937us 0.000us 0.00% 0.000us 0.000us 6
4000
- cudaDeviceSynchronize 0.66% 14.470us 0.66% 14.470us 14.470us 0.000us 0.00% 0.000us 0.000us 1
4001
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4002
- Self CPU time total: 2.179ms
4003
- Self CUDA time total: 12.673us
4004
 
4005
 
4006
 
@@ -4010,20 +4010,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D1024
4010
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4011
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4012
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4013
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 147.935us 1197.66% 147.935us 147.935us 1
4014
- torch_eager 6.19% 128.671us 99.72% 2.072ms 2.072ms 0.000us 0.00% 14.528us 14.528us 1
4015
- aten::silu 1.99% 41.241us 90.00% 1.870ms 623.253us 6.432us 52.07% 8.608us 2.869us 3
4016
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.432us 52.07% 6.432us 2.144us 3
4017
- aten::mul 1.21% 25.191us 2.13% 44.341us 14.780us 5.920us 47.93% 5.920us 1.973us 3
4018
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.920us 47.93% 5.920us 1.973us 3
4019
- Activity Buffer Request 86.71% 1.801ms 86.71% 1.801ms 1.801ms 2.176us 17.62% 2.176us 2.176us 1
4020
- aten::slice 1.12% 23.301us 1.40% 28.981us 4.830us 0.000us 0.00% 0.000us 0.000us 6
4021
- aten::as_strided 0.27% 5.680us 0.27% 5.680us 0.947us 0.000us 0.00% 0.000us 0.000us 6
4022
- cudaLaunchKernel 2.23% 46.310us 2.23% 46.310us 7.718us 0.000us 0.00% 0.000us 0.000us 6
4023
- cudaDeviceSynchronize 0.28% 5.721us 0.28% 5.721us 5.721us 0.000us 0.00% 0.000us 0.000us 1
4024
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4025
- Self CPU time total: 2.077ms
4026
- Self CUDA time total: 12.352us
4027
 
4028
 
4029
 
@@ -4033,20 +4033,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D2048
4033
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4034
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4035
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4036
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 147.135us 1116.10% 147.135us 147.135us 1
4037
- torch_eager 6.76% 134.342us 99.73% 1.980ms 1.980ms 0.000us 0.00% 15.455us 15.455us 1
4038
- aten::silu 1.89% 37.461us 89.35% 1.774ms 591.479us 6.784us 51.46% 9.056us 3.019us 3
4039
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.784us 51.46% 6.784us 2.261us 3
4040
- aten::mul 1.28% 25.422us 2.19% 43.411us 14.470us 6.399us 48.54% 6.399us 2.133us 3
4041
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.399us 48.54% 6.399us 2.133us 3
4042
- Activity Buffer Request 86.14% 1.711ms 86.14% 1.711ms 1.711ms 2.272us 17.23% 2.272us 2.272us 1
4043
- aten::slice 1.16% 23.079us 1.42% 28.280us 4.713us 0.000us 0.00% 0.000us 0.000us 6
4044
- aten::as_strided 0.26% 5.201us 0.26% 5.201us 0.867us 0.000us 0.00% 0.000us 0.000us 6
4045
- cudaLaunchKernel 2.23% 44.359us 2.23% 44.359us 7.393us 0.000us 0.00% 0.000us 0.000us 6
4046
- cudaDeviceSynchronize 0.27% 5.441us 0.27% 5.441us 5.441us 0.000us 0.00% 0.000us 0.000us 1
4047
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4048
- Self CPU time total: 1.986ms
4049
- Self CUDA time total: 13.183us
4050
 
4051
 
4052
 
@@ -4056,20 +4056,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D768
4056
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4057
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4058
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4059
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 152.034us 1193.64% 152.034us 152.034us 1
4060
- torch_eager 5.57% 123.804us 99.78% 2.219ms 2.219ms 0.000us 0.00% 14.945us 14.945us 1
4061
- aten::silu 1.71% 38.060us 90.80% 2.019ms 672.957us 6.561us 51.51% 8.769us 2.923us 3
4062
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.561us 51.51% 6.561us 2.187us 3
4063
- aten::mul 1.26% 28.020us 2.11% 46.890us 15.630us 6.176us 48.49% 6.176us 2.059us 3
4064
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.176us 48.49% 6.176us 2.059us 3
4065
- Activity Buffer Request 81.46% 1.811ms 81.46% 1.811ms 1.811ms 2.208us 17.34% 2.208us 2.208us 1
4066
- aten::slice 1.06% 23.629us 1.31% 29.120us 4.853us 0.000us 0.00% 0.000us 0.000us 6
4067
- aten::as_strided 0.25% 5.491us 0.25% 5.491us 0.915us 0.000us 0.00% 0.000us 0.000us 6
4068
- cudaLaunchKernel 8.48% 188.472us 8.48% 188.472us 31.412us 0.000us 0.00% 0.000us 0.000us 6
4069
- cudaDeviceSynchronize 0.22% 4.841us 0.22% 4.841us 4.841us 0.000us 0.00% 0.000us 0.000us 1
4070
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4071
- Self CPU time total: 2.224ms
4072
- Self CUDA time total: 12.737us
4073
 
4074
 
4075
 
@@ -4079,20 +4079,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D1024
4079
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4080
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4081
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4082
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 152.353us 1152.70% 152.353us 152.353us 1
4083
- torch_eager 6.19% 135.991us 99.76% 2.192ms 2.192ms 0.000us 0.00% 15.489us 15.489us 1
4084
- aten::silu 1.77% 38.889us 90.16% 1.981ms 660.320us 6.752us 51.09% 9.024us 3.008us 3
4085
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.752us 51.09% 6.752us 2.251us 3
4086
- aten::mul 1.20% 26.341us 2.10% 46.211us 15.404us 6.465us 48.91% 6.465us 2.155us 3
4087
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.465us 48.91% 6.465us 2.155us 3
4088
- Activity Buffer Request 80.60% 1.771ms 80.60% 1.771ms 1.771ms 2.272us 17.19% 2.272us 2.272us 1
4089
- aten::slice 1.06% 23.362us 1.31% 28.762us 4.794us 0.000us 0.00% 0.000us 0.000us 6
4090
- aten::as_strided 0.25% 5.400us 0.25% 5.400us 0.900us 0.000us 0.00% 0.000us 0.000us 6
4091
- cudaLaunchKernel 8.70% 191.103us 8.70% 191.103us 31.851us 0.000us 0.00% 0.000us 0.000us 6
4092
- cudaDeviceSynchronize 0.24% 5.211us 0.24% 5.211us 5.211us 0.000us 0.00% 0.000us 0.000us 1
4093
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4094
- Self CPU time total: 2.197ms
4095
- Self CUDA time total: 13.217us
4096
 
4097
 
4098
 
@@ -4102,20 +4102,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D2048
4102
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4103
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4104
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4105
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 153.216us 991.30% 153.216us 153.216us 1
4106
- torch_eager 5.88% 135.461us 99.78% 2.300ms 2.300ms 0.000us 0.00% 18.144us 18.144us 1
4107
- aten::silu 1.72% 39.670us 90.62% 2.089ms 696.338us 7.936us 51.35% 10.624us 3.541us 3
4108
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.936us 51.35% 7.936us 2.645us 3
4109
- aten::mul 1.19% 27.391us 2.02% 46.461us 15.487us 7.520us 48.65% 7.520us 2.507us 3
4110
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.520us 48.65% 7.520us 2.507us 3
4111
- Activity Buffer Request 81.58% 1.881ms 81.58% 1.881ms 1.881ms 2.688us 17.39% 2.688us 2.688us 1
4112
- aten::slice 1.04% 24.071us 1.27% 29.261us 4.877us 0.000us 0.00% 0.000us 0.000us 6
4113
- aten::as_strided 0.23% 5.190us 0.23% 5.190us 0.865us 0.000us 0.00% 0.000us 0.000us 6
4114
- cudaLaunchKernel 8.15% 187.833us 8.15% 187.833us 31.305us 0.000us 0.00% 0.000us 0.000us 6
4115
- cudaDeviceSynchronize 0.22% 5.060us 0.22% 5.060us 5.060us 0.000us 0.00% 0.000us 0.000us 1
4116
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4117
- Self CPU time total: 2.305ms
4118
- Self CUDA time total: 15.456us
4119
 
4120
 
4121
 
@@ -4125,20 +4125,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D768
4125
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4126
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4127
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4128
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 148.287us 1031.99% 148.287us 148.287us 1
4129
- torch_eager 4.89% 105.043us 99.76% 2.144ms 2.144ms 0.000us 0.00% 16.833us 16.833us 1
4130
- aten::silu 1.85% 39.730us 91.47% 1.966ms 655.253us 7.361us 51.23% 9.825us 3.275us 3
4131
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.361us 51.23% 7.361us 2.454us 3
4132
- aten::mul 1.23% 26.350us 2.09% 44.980us 14.993us 7.008us 48.77% 7.008us 2.336us 3
4133
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.008us 48.77% 7.008us 2.336us 3
4134
- Activity Buffer Request 81.83% 1.759ms 81.83% 1.759ms 1.759ms 2.464us 17.15% 2.464us 2.464us 1
4135
- aten::slice 1.07% 23.090us 1.31% 28.260us 4.710us 0.000us 0.00% 0.000us 0.000us 6
4136
- aten::as_strided 0.24% 5.170us 0.24% 5.170us 0.862us 0.000us 0.00% 0.000us 0.000us 6
4137
- cudaLaunchKernel 8.65% 185.993us 8.65% 185.993us 30.999us 0.000us 0.00% 0.000us 0.000us 6
4138
- cudaDeviceSynchronize 0.24% 5.111us 0.24% 5.111us 5.111us 0.000us 0.00% 0.000us 0.000us 1
4139
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4140
- Self CPU time total: 2.149ms
4141
- Self CUDA time total: 14.369us
4142
 
4143
 
4144
 
@@ -4148,20 +4148,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D1024
4148
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4149
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4150
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4151
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 152.095us 983.92% 152.095us 152.095us 1
4152
- torch_eager 10.87% 257.253us 99.76% 2.361ms 2.361ms 0.000us 0.00% 18.146us 18.146us 1
4153
- aten::silu 1.67% 39.540us 85.73% 2.029ms 676.344us 7.905us 51.14% 10.593us 3.531us 3
4154
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.905us 51.14% 7.905us 2.635us 3
4155
- aten::mul 1.20% 28.421us 1.97% 46.561us 15.520us 7.553us 48.86% 7.553us 2.518us 3
4156
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.553us 48.86% 7.553us 2.518us 3
4157
- Activity Buffer Request 76.39% 1.808ms 76.39% 1.808ms 1.808ms 2.688us 17.39% 2.688us 2.688us 1
4158
- aten::slice 0.98% 23.079us 1.19% 28.100us 4.683us 0.000us 0.00% 0.000us 0.000us 6
4159
- aten::as_strided 0.21% 5.021us 0.21% 5.021us 0.837us 0.000us 0.00% 0.000us 0.000us 6
4160
- cudaLaunchKernel 8.43% 199.594us 8.43% 199.594us 33.266us 0.000us 0.00% 0.000us 0.000us 6
4161
- cudaDeviceSynchronize 0.24% 5.780us 0.24% 5.780us 5.780us 0.000us 0.00% 0.000us 0.000us 1
4162
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4163
- Self CPU time total: 2.367ms
4164
- Self CUDA time total: 15.458us
4165
 
4166
 
4167
 
@@ -4171,20 +4171,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D2048
4171
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4172
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4173
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4174
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 145.313us 647.79% 145.313us 145.313us 1
4175
- torch_eager 16.48% 98.469us 99.14% 592.319us 592.319us 0.000us 0.00% 26.336us 26.336us 1
4176
- aten::silu 6.71% 40.110us 70.79% 422.906us 140.969us 11.520us 51.36% 15.424us 5.141us 3
4177
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 11.520us 51.36% 11.520us 3.840us 3
4178
- aten::mul 4.29% 25.642us 7.38% 44.092us 14.697us 10.912us 48.64% 10.912us 3.637us 3
4179
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.912us 48.64% 10.912us 3.637us 3
4180
- Activity Buffer Request 36.05% 215.374us 36.05% 215.374us 215.374us 3.904us 17.40% 3.904us 3.904us 1
4181
- aten::slice 3.67% 21.912us 4.49% 26.852us 4.475us 0.000us 0.00% 0.000us 0.000us 6
4182
- aten::as_strided 0.83% 4.940us 0.83% 4.940us 0.823us 0.000us 0.00% 0.000us 0.000us 6
4183
- cudaLaunchKernel 31.11% 185.872us 31.11% 185.872us 30.979us 0.000us 0.00% 0.000us 0.000us 6
4184
- cudaDeviceSynchronize 0.86% 5.130us 0.86% 5.130us 5.130us 0.000us 0.00% 0.000us 0.000us 1
4185
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4186
- Self CPU time total: 597.449us
4187
- Self CUDA time total: 22.432us
4188
 
4189
 
4190
  impl wl p50(ms) ok
@@ -4198,6 +4198,12 @@ torch_eager cuda_T512_D1024 0.05 True
4198
  torch_eager cuda_T512_D2048 0.05 True
4199
  torch_eager cuda_T512_D768 0.05 True
4200
  </pre></div>
 
 
 
 
 
 
4201
  <div class="cell-artifacts">
4202
  <h4>Artifacts:</h4>
4203
  <a href="artifacts/benchmark/activation.jsonl" class="artifact" target="_blank">activation.jsonl</a>
 
3874
  <div class="system-info">
3875
  <div class="system-info-header">Generated on:</div>
3876
  <div class="system-info-content">
3877
+ Linux x86_64 | Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35
3878
  </div>
3879
  </div>
3880
 
 
3888
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3890
  </span> |
3891
+ Cell: nv | 0.24s
3892
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3894
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
 
3904
  </div>
3905
  </div>
3906
  <div id="output-nv" class="cell-output">
3907
+ <div class="cell-stdout"><pre class="stdout-text">Fri Dec 19 18:56:03 2025
3908
  +-----------------------------------------------------------------------------------------+
3909
+ | NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 |
3910
  +-----------------------------------------+------------------------+----------------------+
3911
  | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
3912
  | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
3913
  | | | MIG M. |
3914
  |=========================================+========================+======================|
3915
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3916
+ | N/A 30C P0 77W / 350W | 0MiB / 46068MiB | 10% Default |
3917
  | | | N/A |
3918
  +-----------------------------------------+------------------------+----------------------+
3919
 
 
3935
  <span class="collapse-indicators">
3936
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3937
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3938
+ <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3939
  </span> |
3940
+ Cell: benchmark | 7.23s
3941
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3942
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3943
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3987
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3988
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3989
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3990
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 170.752us 1351.10% 170.752us 170.752us 1
3991
+ torch_eager 8.36% 195.202us 99.35% 2.320ms 2.320ms 0.000us 0.00% 14.941us 14.941us 1
3992
+ aten::silu 2.60% 60.811us 86.31% 2.016ms 671.908us 6.463us 51.14% 8.766us 2.922us 3
3993
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.463us 51.14% 6.463us 2.154us 3
3994
+ aten::mul 1.36% 31.870us 2.27% 52.962us 17.654us 6.175us 48.86% 6.175us 2.058us 3
3995
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.175us 48.86% 6.175us 2.058us 3
3996
+ Activity Buffer Request 81.78% 1.910ms 81.78% 1.910ms 1.910ms 2.303us 18.22% 2.303us 2.303us 1
3997
+ aten::slice 1.97% 46.103us 2.42% 56.432us 9.405us 0.000us 0.00% 0.000us 0.000us 6
3998
+ aten::as_strided 0.44% 10.329us 0.44% 10.329us 1.721us 0.000us 0.00% 0.000us 0.000us 6
3999
+ cudaLaunchKernel 2.83% 66.203us 2.83% 66.203us 11.034us 0.000us 0.00% 0.000us 0.000us 6
4000
+ cudaDeviceSynchronize 0.65% 15.081us 0.65% 15.081us 15.081us 0.000us 0.00% 0.000us 0.000us 1
4001
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4002
+ Self CPU time total: 2.335ms
4003
+ Self CUDA time total: 12.638us
4004
 
4005
 
4006
 
 
4010
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4011
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4012
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4013
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 142.911us 1157.08% 142.911us 142.911us 1
4014
+ torch_eager 5.43% 102.941us 99.70% 1.891ms 1.891ms 0.000us 0.00% 14.495us 14.495us 1
4015
+ aten::silu 2.14% 40.580us 90.39% 1.715ms 571.523us 6.399us 51.81% 8.543us 2.848us 3
4016
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.399us 51.81% 6.399us 2.133us 3
4017
+ aten::mul 1.41% 26.703us 2.36% 44.783us 14.928us 5.952us 48.19% 5.952us 1.984us 3
4018
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.952us 48.19% 5.952us 1.984us 3
4019
+ Activity Buffer Request 86.86% 1.648ms 86.86% 1.648ms 1.648ms 2.144us 17.36% 2.144us 2.144us 1
4020
+ aten::slice 1.25% 23.641us 1.52% 28.820us 4.803us 0.000us 0.00% 0.000us 0.000us 6
4021
+ aten::as_strided 0.27% 5.179us 0.27% 5.179us 0.863us 0.000us 0.00% 0.000us 0.000us 6
4022
+ cudaLaunchKernel 2.34% 44.460us 2.34% 44.460us 7.410us 0.000us 0.00% 0.000us 0.000us 6
4023
+ cudaDeviceSynchronize 0.30% 5.691us 0.30% 5.691us 5.691us 0.000us 0.00% 0.000us 0.000us 1
4024
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4025
+ Self CPU time total: 1.897ms
4026
+ Self CUDA time total: 12.351us
4027
 
4028
 
4029
 
 
4033
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4034
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4035
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4036
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 146.017us 1102.26% 146.017us 146.017us 1
4037
+ torch_eager 5.52% 107.884us 99.72% 1.948ms 1.948ms 0.000us 0.00% 15.519us 15.519us 1
4038
+ aten::silu 2.05% 40.061us 90.43% 1.767ms 588.983us 6.783us 51.20% 9.055us 3.018us 3
4039
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.783us 51.20% 6.783us 2.261us 3
4040
+ aten::mul 1.30% 25.470us 2.24% 43.800us 14.600us 6.464us 48.80% 6.464us 2.155us 3
4041
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.464us 48.80% 6.464us 2.155us 3
4042
+ Activity Buffer Request 87.02% 1.700ms 87.02% 1.700ms 1.700ms 2.272us 17.15% 2.272us 2.272us 1
4043
+ aten::slice 1.26% 24.689us 1.53% 29.809us 4.968us 0.000us 0.00% 0.000us 0.000us 6
4044
+ aten::as_strided 0.26% 5.120us 0.26% 5.120us 0.853us 0.000us 0.00% 0.000us 0.000us 6
4045
+ cudaLaunchKernel 2.30% 44.851us 2.30% 44.851us 7.475us 0.000us 0.00% 0.000us 0.000us 6
4046
+ cudaDeviceSynchronize 0.28% 5.500us 0.28% 5.500us 5.500us 0.000us 0.00% 0.000us 0.000us 1
4047
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4048
+ Self CPU time total: 1.954ms
4049
+ Self CUDA time total: 13.247us
4050
 
4051
 
4052
 
 
4056
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4057
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4058
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4059
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 147.168us 1164.30% 147.168us 147.168us 1
4060
+ torch_eager 6.37% 108.862us 99.70% 1.705ms 1.705ms 0.000us 0.00% 14.816us 14.816us 1
4061
+ aten::silu 2.27% 38.759us 89.04% 1.523ms 507.511us 6.496us 51.39% 8.672us 2.891us 3
4062
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.496us 51.39% 6.496us 2.165us 3
4063
+ aten::mul 1.56% 26.620us 2.60% 44.441us 14.814us 6.144us 48.61% 6.144us 2.048us 3
4064
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.144us 48.61% 6.144us 2.048us 3
4065
+ Activity Buffer Request 74.65% 1.277ms 74.65% 1.277ms 1.277ms 2.176us 17.22% 2.176us 2.176us 1
4066
+ aten::slice 1.39% 23.842us 1.70% 29.081us 4.847us 0.000us 0.00% 0.000us 0.000us 6
4067
+ aten::as_strided 0.31% 5.239us 0.31% 5.239us 0.873us 0.000us 0.00% 0.000us 0.000us 6
4068
+ cudaLaunchKernel 13.16% 225.035us 13.16% 225.035us 37.506us 0.000us 0.00% 0.000us 0.000us 6
4069
+ cudaDeviceSynchronize 0.30% 5.120us 0.30% 5.120us 5.120us 0.000us 0.00% 0.000us 0.000us 1
4070
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4071
+ Self CPU time total: 1.710ms
4072
+ Self CUDA time total: 12.640us
4073
 
4074
 
4075
 
 
4079
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4080
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4081
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4082
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 146.271us 1106.86% 146.271us 146.271us 1
4083
+ torch_eager 4.97% 106.601us 99.77% 2.139ms 2.139ms 0.000us 0.00% 15.486us 15.486us 1
4084
+ aten::silu 1.88% 40.251us 91.37% 1.959ms 652.944us 6.751us 51.09% 9.022us 3.007us 3
4085
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.751us 51.09% 6.751us 2.250us 3
4086
+ aten::mul 1.15% 24.611us 1.97% 42.221us 14.074us 6.464us 48.91% 6.464us 2.155us 3
4087
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.464us 48.91% 6.464us 2.155us 3
4088
+ Activity Buffer Request 80.01% 1.715ms 80.01% 1.715ms 1.715ms 2.271us 17.19% 2.271us 2.271us 1
4089
+ aten::slice 1.17% 25.129us 1.45% 31.071us 5.179us 0.000us 0.00% 0.000us 0.000us 6
4090
+ aten::as_strided 0.28% 5.942us 0.28% 5.942us 0.990us 0.000us 0.00% 0.000us 0.000us 6
4091
+ cudaLaunchKernel 10.31% 220.963us 10.31% 220.963us 36.827us 0.000us 0.00% 0.000us 0.000us 6
4092
+ cudaDeviceSynchronize 0.23% 5.031us 0.23% 5.031us 5.031us 0.000us 0.00% 0.000us 0.000us 1
4093
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4094
+ Self CPU time total: 2.144ms
4095
+ Self CUDA time total: 13.215us
4096
 
4097
 
4098
 
 
4102
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4103
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4104
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4105
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 139.706us 900.17% 139.706us 139.706us 1
4106
+ torch_eager 16.34% 103.162us 99.17% 626.050us 626.050us 0.000us 0.00% 18.208us 18.208us 1
4107
+ aten::silu 6.36% 40.131us 71.62% 452.127us 150.709us 7.968us 51.34% 10.656us 3.552us 3
4108
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.968us 51.34% 7.968us 2.656us 3
4109
+ aten::mul 3.68% 23.240us 6.43% 40.610us 13.537us 7.552us 48.66% 7.552us 2.517us 3
4110
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.552us 48.66% 7.552us 2.517us 3
4111
+ Activity Buffer Request 33.83% 213.593us 33.83% 213.593us 213.593us 2.688us 17.32% 2.688us 2.688us 1
4112
+ aten::slice 3.84% 24.240us 4.78% 30.151us 5.025us 0.000us 0.00% 0.000us 0.000us 6
4113
+ aten::as_strided 0.94% 5.911us 0.94% 5.911us 0.985us 0.000us 0.00% 0.000us 0.000us 6
4114
+ cudaLaunchKernel 34.18% 215.773us 34.18% 215.773us 35.962us 0.000us 0.00% 0.000us 0.000us 6
4115
+ cudaDeviceSynchronize 0.83% 5.229us 0.83% 5.229us 5.229us 0.000us 0.00% 0.000us 0.000us 1
4116
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4117
+ Self CPU time total: 631.279us
4118
+ Self CUDA time total: 15.520us
4119
 
4120
 
4121
 
 
4125
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4126
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4127
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4128
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 149.023us 1039.50% 149.023us 149.023us 1
4129
+ torch_eager 4.97% 105.151us 99.76% 2.112ms 2.112ms 0.000us 0.00% 16.800us 16.800us 1
4130
+ aten::silu 1.93% 40.940us 91.23% 1.932ms 643.947us 7.360us 51.34% 9.824us 3.275us 3
4131
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.360us 51.34% 7.360us 2.453us 3
4132
+ aten::mul 1.20% 25.341us 2.15% 45.422us 15.141us 6.976us 48.66% 6.976us 2.325us 3
4133
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.976us 48.66% 6.976us 2.325us 3
4134
+ Activity Buffer Request 80.00% 1.694ms 80.00% 1.694ms 1.694ms 2.464us 17.19% 2.464us 2.464us 1
4135
+ aten::slice 1.16% 24.531us 1.41% 29.941us 4.990us 0.000us 0.00% 0.000us 0.000us 6
4136
+ aten::as_strided 0.26% 5.410us 0.26% 5.410us 0.902us 0.000us 0.00% 0.000us 0.000us 6
4137
+ cudaLaunchKernel 10.25% 217.014us 10.25% 217.014us 36.169us 0.000us 0.00% 0.000us 0.000us 6
4138
+ cudaDeviceSynchronize 0.24% 5.140us 0.24% 5.140us 5.140us 0.000us 0.00% 0.000us 0.000us 1
4139
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4140
+ Self CPU time total: 2.117ms
4141
+ Self CUDA time total: 14.336us
4142
 
4143
 
4144
 
 
4148
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4149
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4150
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4151
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 229.537us 1472.90% 229.537us 229.537us 1
4152
+ torch_eager 26.22% 183.030us 99.31% 693.152us 693.152us 0.000us 0.00% 18.272us 18.272us 1
4153
+ aten::silu 5.68% 39.610us 61.61% 430.047us 143.349us 7.967us 51.12% 10.655us 3.552us 3
4154
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.967us 51.12% 7.967us 2.656us 3
4155
+ aten::mul 3.79% 26.431us 6.97% 48.673us 16.224us 7.617us 48.88% 7.617us 2.539us 3
4156
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.617us 48.88% 7.617us 2.539us 3
4157
+ Activity Buffer Request 27.95% 195.093us 27.95% 195.093us 195.093us 2.688us 17.25% 2.688us 2.688us 1
4158
+ aten::slice 3.65% 25.463us 4.50% 31.402us 5.234us 0.000us 0.00% 0.000us 0.000us 6
4159
+ aten::as_strided 0.85% 5.939us 0.85% 5.939us 0.990us 0.000us 0.00% 0.000us 0.000us 6
4160
+ cudaLaunchKernel 31.17% 217.586us 31.17% 217.586us 36.264us 0.000us 0.00% 0.000us 0.000us 6
4161
+ cudaDeviceSynchronize 0.69% 4.809us 0.69% 4.809us 4.809us 0.000us 0.00% 0.000us 0.000us 1
4162
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4163
+ Self CPU time total: 697.961us
4164
+ Self CUDA time total: 15.584us
4165
 
4166
 
4167
 
 
4171
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4172
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4173
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4174
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 162.367us 719.68% 162.367us 162.367us 1
4175
+ torch_eager 5.30% 112.718us 99.76% 2.123ms 2.123ms 0.000us 0.00% 26.497us 26.497us 1
4176
+ aten::silu 1.99% 42.361us 90.94% 1.935ms 644.944us 11.584us 51.35% 15.520us 5.173us 3
4177
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 11.584us 51.35% 11.584us 3.861us 3
4178
+ aten::mul 1.24% 26.291us 2.09% 44.551us 14.850us 10.977us 48.65% 10.977us 3.659us 3
4179
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.977us 48.65% 10.977us 3.659us 3
4180
+ Activity Buffer Request 79.75% 1.697ms 79.75% 1.697ms 1.697ms 3.936us 17.45% 3.936us 3.936us 1
4181
+ aten::slice 1.18% 25.032us 1.43% 30.473us 5.079us 0.000us 0.00% 0.000us 0.000us 6
4182
+ aten::as_strided 0.26% 5.441us 0.26% 5.441us 0.907us 0.000us 0.00% 0.000us 0.000us 6
4183
+ cudaLaunchKernel 10.06% 214.034us 10.06% 214.034us 35.672us 0.000us 0.00% 0.000us 0.000us 6
4184
+ cudaDeviceSynchronize 0.24% 5.051us 0.24% 5.051us 5.051us 0.000us 0.00% 0.000us 0.000us 1
4185
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4186
+ Self CPU time total: 2.128ms
4187
+ Self CUDA time total: 22.561us
4188
 
4189
 
4190
  impl wl p50(ms) ok
 
4198
  torch_eager cuda_T512_D2048 0.05 True
4199
  torch_eager cuda_T512_D768 0.05 True
4200
  </pre></div>
4201
+ <div class="uv-install-logs" id="uv-logs-benchmark">
4202
+ <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4203
+ <div class="uv-logs-content" style="display: none;">
4204
+ Installed 37 packages in 324ms
4205
+ </div>
4206
+ </div>
4207
  <div class="cell-artifacts">
4208
  <h4>Artifacts:</h4>
4209
  <a href="artifacts/benchmark/activation.jsonl" class="artifact" target="_blank">activation.jsonl</a>
activation/results/artifacts/combine/latency.svg CHANGED

Git LFS Details

  • SHA256: e3876e00c4cce265206e6202ea531c8de65cca8fefa80010473a6b76e6f54cc4
  • Pointer size: 130 Bytes
  • Size of remote file: 20.6 kB

Git LFS Details

  • SHA256: 27a85e52e962d014c803889d44451c0ae5aabe3525a35bf09a387e50af005e30
  • Pointer size: 130 Bytes
  • Size of remote file: 20.6 kB
activation/results/combined_results.html CHANGED
@@ -3874,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
3874
  <div class="system-info">
3875
  <div class="system-info-header">Generated on:</div>
3876
  <div class="system-info-content">
3877
- Linux x86_64 | Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35
3878
  </div>
3879
  </div>
3880
 
@@ -3889,11 +3889,11 @@ body[data-tool="eraser"] .main-content {
3889
  <rdf:RDF>
3890
  <ns2:Work>
3891
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
3892
- <dc:date>2025-11-10T22:12:14.776732</dc:date>
3893
  <dc:format>image/svg+xml</dc:format>
3894
  <dc:creator>
3895
  <ns2:Agent>
3896
- <dc:title>Matplotlib v3.10.7, https://matplotlib.org/</dc:title>
3897
  </ns2:Agent>
3898
  </dc:creator>
3899
  </ns2:Work>
@@ -4038,83 +4038,83 @@ body[data-tool="eraser"] .main-content {
4038
  <g id="matplotlib.axis_2">
4039
  <g id="ytick_1">
4040
  <g id="grid-y--2" class="grid grid-y">
4041
- <path d="M 60.23 439.014187 L 847.294169 439.014187 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4042
  </g>
4043
  <g id="line2d_10">
4044
  <defs>
4045
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4046
  </defs>
4047
  <g>
4048
- <use ns4:href="#m0fca2865ba" x="60.23" y="439.014187" style="stroke: #000000; stroke-width: 0.8" />
4049
  </g>
4050
  </g>
4051
  <g id="text_10">
4052
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="442.813406" transform="rotate(-0 53.23 442.813406)">0.025</text>
4053
  </g>
4054
  </g>
4055
  <g id="ytick_2">
4056
  <g id="grid-y--3" class="grid grid-y">
4057
- <path d="M 60.23 360.09469 L 847.294169 360.09469 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4058
  </g>
4059
  <g id="line2d_11">
4060
  <g>
4061
- <use ns4:href="#m0fca2865ba" x="60.23" y="360.09469" style="stroke: #000000; stroke-width: 0.8" />
4062
  </g>
4063
  </g>
4064
  <g id="text_11">
4065
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="363.893909" transform="rotate(-0 53.23 363.893909)">0.030</text>
4066
  </g>
4067
  </g>
4068
  <g id="ytick_3">
4069
  <g id="grid-y--4" class="grid grid-y">
4070
- <path d="M 60.23 281.175192 L 847.294169 281.175192 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4071
  </g>
4072
  <g id="line2d_12">
4073
  <g>
4074
- <use ns4:href="#m0fca2865ba" x="60.23" y="281.175192" style="stroke: #000000; stroke-width: 0.8" />
4075
  </g>
4076
  </g>
4077
  <g id="text_12">
4078
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="284.974411" transform="rotate(-0 53.23 284.974411)">0.035</text>
4079
  </g>
4080
  </g>
4081
  <g id="ytick_4">
4082
  <g id="grid-y--5" class="grid grid-y">
4083
- <path d="M 60.23 202.255694 L 847.294169 202.255694 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4084
  </g>
4085
  <g id="line2d_13">
4086
  <g>
4087
- <use ns4:href="#m0fca2865ba" x="60.23" y="202.255694" style="stroke: #000000; stroke-width: 0.8" />
4088
  </g>
4089
  </g>
4090
  <g id="text_13">
4091
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="206.054913" transform="rotate(-0 53.23 206.054913)">0.040</text>
4092
  </g>
4093
  </g>
4094
  <g id="ytick_5">
4095
  <g id="grid-y--6" class="grid grid-y">
4096
- <path d="M 60.23 123.336197 L 847.294169 123.336197 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4097
  </g>
4098
  <g id="line2d_14">
4099
  <g>
4100
- <use ns4:href="#m0fca2865ba" x="60.23" y="123.336197" style="stroke: #000000; stroke-width: 0.8" />
4101
  </g>
4102
  </g>
4103
  <g id="text_14">
4104
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="127.135415" transform="rotate(-0 53.23 127.135415)">0.045</text>
4105
  </g>
4106
  </g>
4107
  <g id="ytick_6">
4108
  <g id="grid-y--7" class="grid grid-y">
4109
- <path d="M 60.23 44.416699 L 847.294169 44.416699 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4110
  </g>
4111
  <g id="line2d_15">
4112
  <g>
4113
- <use ns4:href="#m0fca2865ba" x="60.23" y="44.416699" style="stroke: #000000; stroke-width: 0.8" />
4114
  </g>
4115
  </g>
4116
  <g id="text_15">
4117
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="48.215918" transform="rotate(-0 53.23 48.215918)">0.050</text>
4118
  </g>
4119
  </g>
4120
  <g id="label--y" class="ylabel">
@@ -4122,37 +4122,37 @@ body[data-tool="eraser"] .main-content {
4122
  </g>
4123
  </g>
4124
  <g id="series--hf-kernels-swiglu" class="series">
4125
- <path d="M 96.005644 451.16779 L 185.444754 377.125517 L 274.883864 379.177424 L 364.322974 390.383993 L 453.762084 388.647765 L 543.201194 386.12234 L 632.640304 394.961324 L 722.079415 388.032192 L 811.518525 413.744164 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4126
  <defs>
4127
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4128
  </defs>
4129
  <g clip-path="url(#p620c7d392f)">
4130
  <use ns4:href="#md7efaf3aec" x="96.005644" y="451.16779" style="fill: #1f77b4; stroke: #1f77b4" />
4131
- <use ns4:href="#md7efaf3aec" x="185.444754" y="377.125517" style="fill: #1f77b4; stroke: #1f77b4" />
4132
- <use ns4:href="#md7efaf3aec" x="274.883864" y="379.177424" style="fill: #1f77b4; stroke: #1f77b4" />
4133
- <use ns4:href="#md7efaf3aec" x="364.322974" y="390.383993" style="fill: #1f77b4; stroke: #1f77b4" />
4134
- <use ns4:href="#md7efaf3aec" x="453.762084" y="388.647765" style="fill: #1f77b4; stroke: #1f77b4" />
4135
- <use ns4:href="#md7efaf3aec" x="543.201194" y="386.12234" style="fill: #1f77b4; stroke: #1f77b4" />
4136
- <use ns4:href="#md7efaf3aec" x="632.640304" y="394.961324" style="fill: #1f77b4; stroke: #1f77b4" />
4137
- <use ns4:href="#md7efaf3aec" x="722.079415" y="388.032192" style="fill: #1f77b4; stroke: #1f77b4" />
4138
- <use ns4:href="#md7efaf3aec" x="811.518525" y="413.744164" style="fill: #1f77b4; stroke: #1f77b4" />
4139
  </g>
4140
  </g>
4141
  <g id="series--torch-eager" class="series">
4142
- <path d="M 96.005644 151.88927 L 185.444754 47.08418 L 274.883864 51.361615 L 364.322974 51.345828 L 453.762084 75.653034 L 543.201194 81.335239 L 632.640304 88.280153 L 722.079415 74.863844 L 811.518525 78.494139 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4143
  <defs>
4144
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4145
  </defs>
4146
  <g clip-path="url(#p620c7d392f)">
4147
- <use ns4:href="#m9b8c54d372" x="96.005644" y="151.88927" style="fill: #ff7f0e; stroke: #ff7f0e" />
4148
  <use ns4:href="#m9b8c54d372" x="185.444754" y="47.08418" style="fill: #ff7f0e; stroke: #ff7f0e" />
4149
- <use ns4:href="#m9b8c54d372" x="274.883864" y="51.361615" style="fill: #ff7f0e; stroke: #ff7f0e" />
4150
- <use ns4:href="#m9b8c54d372" x="364.322974" y="51.345828" style="fill: #ff7f0e; stroke: #ff7f0e" />
4151
- <use ns4:href="#m9b8c54d372" x="453.762084" y="75.653034" style="fill: #ff7f0e; stroke: #ff7f0e" />
4152
- <use ns4:href="#m9b8c54d372" x="543.201194" y="81.335239" style="fill: #ff7f0e; stroke: #ff7f0e" />
4153
- <use ns4:href="#m9b8c54d372" x="632.640304" y="88.280153" style="fill: #ff7f0e; stroke: #ff7f0e" />
4154
- <use ns4:href="#m9b8c54d372" x="722.079415" y="74.863844" style="fill: #ff7f0e; stroke: #ff7f0e" />
4155
- <use ns4:href="#m9b8c54d372" x="811.518525" y="78.494139" style="fill: #ff7f0e; stroke: #ff7f0e" />
4156
  </g>
4157
  </g>
4158
  <g id="patch_3">
@@ -4210,7 +4210,7 @@ body[data-tool="eraser"] .main-content {
4210
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
4211
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
4212
  </span> |
4213
- Cell: combine | 4.55s
4214
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
4215
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
4216
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
@@ -4336,7 +4336,7 @@ Implementations included:
4336
  <div class="uv-install-logs" id="uv-logs-combine">
4337
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4338
  <div class="uv-logs-content" style="display: none;">
4339
- Installed 37 packages in 348ms
4340
  </div>
4341
  </div>
4342
  <div class="cell-artifacts">
@@ -4349,11 +4349,11 @@ Installed 37 packages in 348ms
4349
  <rdf:RDF>
4350
  <ns2:Work>
4351
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4352
- <dc:date>2025-11-10T22:12:14.776732</dc:date>
4353
  <dc:format>image/svg+xml</dc:format>
4354
  <dc:creator>
4355
  <ns2:Agent>
4356
- <dc:title>Matplotlib v3.10.7, https://matplotlib.org/</dc:title>
4357
  </ns2:Agent>
4358
  </dc:creator>
4359
  </ns2:Work>
@@ -4498,83 +4498,83 @@ Installed 37 packages in 348ms
4498
  <g id="matplotlib.axis_2">
4499
  <g id="ytick_1">
4500
  <g id="grid-y--2" class="grid grid-y">
4501
- <path d="M 60.23 439.014187 L 847.294169 439.014187 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4502
  </g>
4503
  <g id="line2d_10">
4504
  <defs>
4505
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4506
  </defs>
4507
  <g>
4508
- <use ns4:href="#m0fca2865ba" x="60.23" y="439.014187" style="stroke: #000000; stroke-width: 0.8" />
4509
  </g>
4510
  </g>
4511
  <g id="text_10">
4512
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="442.813406" transform="rotate(-0 53.23 442.813406)">0.025</text>
4513
  </g>
4514
  </g>
4515
  <g id="ytick_2">
4516
  <g id="grid-y--3" class="grid grid-y">
4517
- <path d="M 60.23 360.09469 L 847.294169 360.09469 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4518
  </g>
4519
  <g id="line2d_11">
4520
  <g>
4521
- <use ns4:href="#m0fca2865ba" x="60.23" y="360.09469" style="stroke: #000000; stroke-width: 0.8" />
4522
  </g>
4523
  </g>
4524
  <g id="text_11">
4525
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="363.893909" transform="rotate(-0 53.23 363.893909)">0.030</text>
4526
  </g>
4527
  </g>
4528
  <g id="ytick_3">
4529
  <g id="grid-y--4" class="grid grid-y">
4530
- <path d="M 60.23 281.175192 L 847.294169 281.175192 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4531
  </g>
4532
  <g id="line2d_12">
4533
  <g>
4534
- <use ns4:href="#m0fca2865ba" x="60.23" y="281.175192" style="stroke: #000000; stroke-width: 0.8" />
4535
  </g>
4536
  </g>
4537
  <g id="text_12">
4538
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="284.974411" transform="rotate(-0 53.23 284.974411)">0.035</text>
4539
  </g>
4540
  </g>
4541
  <g id="ytick_4">
4542
  <g id="grid-y--5" class="grid grid-y">
4543
- <path d="M 60.23 202.255694 L 847.294169 202.255694 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4544
  </g>
4545
  <g id="line2d_13">
4546
  <g>
4547
- <use ns4:href="#m0fca2865ba" x="60.23" y="202.255694" style="stroke: #000000; stroke-width: 0.8" />
4548
  </g>
4549
  </g>
4550
  <g id="text_13">
4551
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="206.054913" transform="rotate(-0 53.23 206.054913)">0.040</text>
4552
  </g>
4553
  </g>
4554
  <g id="ytick_5">
4555
  <g id="grid-y--6" class="grid grid-y">
4556
- <path d="M 60.23 123.336197 L 847.294169 123.336197 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4557
  </g>
4558
  <g id="line2d_14">
4559
  <g>
4560
- <use ns4:href="#m0fca2865ba" x="60.23" y="123.336197" style="stroke: #000000; stroke-width: 0.8" />
4561
  </g>
4562
  </g>
4563
  <g id="text_14">
4564
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="127.135415" transform="rotate(-0 53.23 127.135415)">0.045</text>
4565
  </g>
4566
  </g>
4567
  <g id="ytick_6">
4568
  <g id="grid-y--7" class="grid grid-y">
4569
- <path d="M 60.23 44.416699 L 847.294169 44.416699 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4570
  </g>
4571
  <g id="line2d_15">
4572
  <g>
4573
- <use ns4:href="#m0fca2865ba" x="60.23" y="44.416699" style="stroke: #000000; stroke-width: 0.8" />
4574
  </g>
4575
  </g>
4576
  <g id="text_15">
4577
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="48.215918" transform="rotate(-0 53.23 48.215918)">0.050</text>
4578
  </g>
4579
  </g>
4580
  <g id="label--y" class="ylabel">
@@ -4582,37 +4582,37 @@ Installed 37 packages in 348ms
4582
  </g>
4583
  </g>
4584
  <g id="series--hf-kernels-swiglu" class="series">
4585
- <path d="M 96.005644 451.16779 L 185.444754 377.125517 L 274.883864 379.177424 L 364.322974 390.383993 L 453.762084 388.647765 L 543.201194 386.12234 L 632.640304 394.961324 L 722.079415 388.032192 L 811.518525 413.744164 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4586
  <defs>
4587
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4588
  </defs>
4589
  <g clip-path="url(#p620c7d392f)">
4590
  <use ns4:href="#md7efaf3aec" x="96.005644" y="451.16779" style="fill: #1f77b4; stroke: #1f77b4" />
4591
- <use ns4:href="#md7efaf3aec" x="185.444754" y="377.125517" style="fill: #1f77b4; stroke: #1f77b4" />
4592
- <use ns4:href="#md7efaf3aec" x="274.883864" y="379.177424" style="fill: #1f77b4; stroke: #1f77b4" />
4593
- <use ns4:href="#md7efaf3aec" x="364.322974" y="390.383993" style="fill: #1f77b4; stroke: #1f77b4" />
4594
- <use ns4:href="#md7efaf3aec" x="453.762084" y="388.647765" style="fill: #1f77b4; stroke: #1f77b4" />
4595
- <use ns4:href="#md7efaf3aec" x="543.201194" y="386.12234" style="fill: #1f77b4; stroke: #1f77b4" />
4596
- <use ns4:href="#md7efaf3aec" x="632.640304" y="394.961324" style="fill: #1f77b4; stroke: #1f77b4" />
4597
- <use ns4:href="#md7efaf3aec" x="722.079415" y="388.032192" style="fill: #1f77b4; stroke: #1f77b4" />
4598
- <use ns4:href="#md7efaf3aec" x="811.518525" y="413.744164" style="fill: #1f77b4; stroke: #1f77b4" />
4599
  </g>
4600
  </g>
4601
  <g id="series--torch-eager" class="series">
4602
- <path d="M 96.005644 151.88927 L 185.444754 47.08418 L 274.883864 51.361615 L 364.322974 51.345828 L 453.762084 75.653034 L 543.201194 81.335239 L 632.640304 88.280153 L 722.079415 74.863844 L 811.518525 78.494139 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4603
  <defs>
4604
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4605
  </defs>
4606
  <g clip-path="url(#p620c7d392f)">
4607
- <use ns4:href="#m9b8c54d372" x="96.005644" y="151.88927" style="fill: #ff7f0e; stroke: #ff7f0e" />
4608
  <use ns4:href="#m9b8c54d372" x="185.444754" y="47.08418" style="fill: #ff7f0e; stroke: #ff7f0e" />
4609
- <use ns4:href="#m9b8c54d372" x="274.883864" y="51.361615" style="fill: #ff7f0e; stroke: #ff7f0e" />
4610
- <use ns4:href="#m9b8c54d372" x="364.322974" y="51.345828" style="fill: #ff7f0e; stroke: #ff7f0e" />
4611
- <use ns4:href="#m9b8c54d372" x="453.762084" y="75.653034" style="fill: #ff7f0e; stroke: #ff7f0e" />
4612
- <use ns4:href="#m9b8c54d372" x="543.201194" y="81.335239" style="fill: #ff7f0e; stroke: #ff7f0e" />
4613
- <use ns4:href="#m9b8c54d372" x="632.640304" y="88.280153" style="fill: #ff7f0e; stroke: #ff7f0e" />
4614
- <use ns4:href="#m9b8c54d372" x="722.079415" y="74.863844" style="fill: #ff7f0e; stroke: #ff7f0e" />
4615
- <use ns4:href="#m9b8c54d372" x="811.518525" y="78.494139" style="fill: #ff7f0e; stroke: #ff7f0e" />
4616
  </g>
4617
  </g>
4618
  <g id="patch_3">
 
3874
  <div class="system-info">
3875
  <div class="system-info-header">Generated on:</div>
3876
  <div class="system-info-content">
3877
+ Linux x86_64 | Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35
3878
  </div>
3879
  </div>
3880
 
 
3889
  <rdf:RDF>
3890
  <ns2:Work>
3891
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
3892
+ <dc:date>2025-12-19T19:10:09.156027</dc:date>
3893
  <dc:format>image/svg+xml</dc:format>
3894
  <dc:creator>
3895
  <ns2:Agent>
3896
+ <dc:title>Matplotlib v3.10.8, https://matplotlib.org/</dc:title>
3897
  </ns2:Agent>
3898
  </dc:creator>
3899
  </ns2:Work>
 
4038
  <g id="matplotlib.axis_2">
4039
  <g id="ytick_1">
4040
  <g id="grid-y--2" class="grid grid-y">
4041
+ <path d="M 60.23 449.91292 L 847.294169 449.91292 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4042
  </g>
4043
  <g id="line2d_10">
4044
  <defs>
4045
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4046
  </defs>
4047
  <g>
4048
+ <use ns4:href="#m0fca2865ba" x="60.23" y="449.91292" style="stroke: #000000; stroke-width: 0.8" />
4049
  </g>
4050
  </g>
4051
  <g id="text_10">
4052
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="453.712139" transform="rotate(-0 53.23 453.712139)">0.025</text>
4053
  </g>
4054
  </g>
4055
  <g id="ytick_2">
4056
  <g id="grid-y--3" class="grid grid-y">
4057
+ <path d="M 60.23 371.483588 L 847.294169 371.483588 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4058
  </g>
4059
  <g id="line2d_11">
4060
  <g>
4061
+ <use ns4:href="#m0fca2865ba" x="60.23" y="371.483588" style="stroke: #000000; stroke-width: 0.8" />
4062
  </g>
4063
  </g>
4064
  <g id="text_11">
4065
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="375.282806" transform="rotate(-0 53.23 375.282806)">0.030</text>
4066
  </g>
4067
  </g>
4068
  <g id="ytick_3">
4069
  <g id="grid-y--4" class="grid grid-y">
4070
+ <path d="M 60.23 293.054255 L 847.294169 293.054255 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4071
  </g>
4072
  <g id="line2d_12">
4073
  <g>
4074
+ <use ns4:href="#m0fca2865ba" x="60.23" y="293.054255" style="stroke: #000000; stroke-width: 0.8" />
4075
  </g>
4076
  </g>
4077
  <g id="text_12">
4078
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="296.853473" transform="rotate(-0 53.23 296.853473)">0.035</text>
4079
  </g>
4080
  </g>
4081
  <g id="ytick_4">
4082
  <g id="grid-y--5" class="grid grid-y">
4083
+ <path d="M 60.23 214.624922 L 847.294169 214.624922 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4084
  </g>
4085
  <g id="line2d_13">
4086
  <g>
4087
+ <use ns4:href="#m0fca2865ba" x="60.23" y="214.624922" style="stroke: #000000; stroke-width: 0.8" />
4088
  </g>
4089
  </g>
4090
  <g id="text_13">
4091
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="218.42414" transform="rotate(-0 53.23 218.42414)">0.040</text>
4092
  </g>
4093
  </g>
4094
  <g id="ytick_5">
4095
  <g id="grid-y--6" class="grid grid-y">
4096
+ <path d="M 60.23 136.195589 L 847.294169 136.195589 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4097
  </g>
4098
  <g id="line2d_14">
4099
  <g>
4100
+ <use ns4:href="#m0fca2865ba" x="60.23" y="136.195589" style="stroke: #000000; stroke-width: 0.8" />
4101
  </g>
4102
  </g>
4103
  <g id="text_14">
4104
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="139.994807" transform="rotate(-0 53.23 139.994807)">0.045</text>
4105
  </g>
4106
  </g>
4107
  <g id="ytick_6">
4108
  <g id="grid-y--7" class="grid grid-y">
4109
+ <path d="M 60.23 57.766256 L 847.294169 57.766256 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4110
  </g>
4111
  <g id="line2d_15">
4112
  <g>
4113
+ <use ns4:href="#m0fca2865ba" x="60.23" y="57.766256" style="stroke: #000000; stroke-width: 0.8" />
4114
  </g>
4115
  </g>
4116
  <g id="text_15">
4117
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="61.565474" transform="rotate(-0 53.23 61.565474)">0.050</text>
4118
  </g>
4119
  </g>
4120
  <g id="label--y" class="ylabel">
 
4122
  </g>
4123
  </g>
4124
  <g id="series--hf-kernels-swiglu" class="series">
4125
+ <path d="M 96.005644 451.16779 L 185.444754 386.071444 L 274.883864 387.781203 L 364.322974 389.506648 L 453.762084 389.992912 L 543.201194 390.290944 L 632.640304 396.894691 L 722.079415 389.992908 L 811.518525 394.228094 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4126
  <defs>
4127
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4128
  </defs>
4129
  <g clip-path="url(#p620c7d392f)">
4130
  <use ns4:href="#md7efaf3aec" x="96.005644" y="451.16779" style="fill: #1f77b4; stroke: #1f77b4" />
4131
+ <use ns4:href="#md7efaf3aec" x="185.444754" y="386.071444" style="fill: #1f77b4; stroke: #1f77b4" />
4132
+ <use ns4:href="#md7efaf3aec" x="274.883864" y="387.781203" style="fill: #1f77b4; stroke: #1f77b4" />
4133
+ <use ns4:href="#md7efaf3aec" x="364.322974" y="389.506648" style="fill: #1f77b4; stroke: #1f77b4" />
4134
+ <use ns4:href="#md7efaf3aec" x="453.762084" y="389.992912" style="fill: #1f77b4; stroke: #1f77b4" />
4135
+ <use ns4:href="#md7efaf3aec" x="543.201194" y="390.290944" style="fill: #1f77b4; stroke: #1f77b4" />
4136
+ <use ns4:href="#md7efaf3aec" x="632.640304" y="396.894691" style="fill: #1f77b4; stroke: #1f77b4" />
4137
+ <use ns4:href="#md7efaf3aec" x="722.079415" y="389.992908" style="fill: #1f77b4; stroke: #1f77b4" />
4138
+ <use ns4:href="#md7efaf3aec" x="811.518525" y="394.228094" style="fill: #1f77b4; stroke: #1f77b4" />
4139
  </g>
4140
  </g>
4141
  <g id="series--torch-eager" class="series">
4142
+ <path d="M 96.005644 171.802506 L 185.444754 47.08418 L 274.883864 55.554548 L 364.322974 56.966277 L 453.762084 62.45633 L 543.201194 80.651935 L 632.640304 68.416959 L 722.079415 69.358111 L 811.518525 82.847956 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4143
  <defs>
4144
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4145
  </defs>
4146
  <g clip-path="url(#p620c7d392f)">
4147
+ <use ns4:href="#m9b8c54d372" x="96.005644" y="171.802506" style="fill: #ff7f0e; stroke: #ff7f0e" />
4148
  <use ns4:href="#m9b8c54d372" x="185.444754" y="47.08418" style="fill: #ff7f0e; stroke: #ff7f0e" />
4149
+ <use ns4:href="#m9b8c54d372" x="274.883864" y="55.554548" style="fill: #ff7f0e; stroke: #ff7f0e" />
4150
+ <use ns4:href="#m9b8c54d372" x="364.322974" y="56.966277" style="fill: #ff7f0e; stroke: #ff7f0e" />
4151
+ <use ns4:href="#m9b8c54d372" x="453.762084" y="62.45633" style="fill: #ff7f0e; stroke: #ff7f0e" />
4152
+ <use ns4:href="#m9b8c54d372" x="543.201194" y="80.651935" style="fill: #ff7f0e; stroke: #ff7f0e" />
4153
+ <use ns4:href="#m9b8c54d372" x="632.640304" y="68.416959" style="fill: #ff7f0e; stroke: #ff7f0e" />
4154
+ <use ns4:href="#m9b8c54d372" x="722.079415" y="69.358111" style="fill: #ff7f0e; stroke: #ff7f0e" />
4155
+ <use ns4:href="#m9b8c54d372" x="811.518525" y="82.847956" style="fill: #ff7f0e; stroke: #ff7f0e" />
4156
  </g>
4157
  </g>
4158
  <g id="patch_3">
 
4210
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
4211
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
4212
  </span> |
4213
+ Cell: combine | 4.43s
4214
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
4215
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
4216
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
 
4336
  <div class="uv-install-logs" id="uv-logs-combine">
4337
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4338
  <div class="uv-logs-content" style="display: none;">
4339
+ Installed 37 packages in 283ms
4340
  </div>
4341
  </div>
4342
  <div class="cell-artifacts">
 
4349
  <rdf:RDF>
4350
  <ns2:Work>
4351
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4352
+ <dc:date>2025-12-19T19:10:09.156027</dc:date>
4353
  <dc:format>image/svg+xml</dc:format>
4354
  <dc:creator>
4355
  <ns2:Agent>
4356
+ <dc:title>Matplotlib v3.10.8, https://matplotlib.org/</dc:title>
4357
  </ns2:Agent>
4358
  </dc:creator>
4359
  </ns2:Work>
 
4498
  <g id="matplotlib.axis_2">
4499
  <g id="ytick_1">
4500
  <g id="grid-y--2" class="grid grid-y">
4501
+ <path d="M 60.23 449.91292 L 847.294169 449.91292 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4502
  </g>
4503
  <g id="line2d_10">
4504
  <defs>
4505
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4506
  </defs>
4507
  <g>
4508
+ <use ns4:href="#m0fca2865ba" x="60.23" y="449.91292" style="stroke: #000000; stroke-width: 0.8" />
4509
  </g>
4510
  </g>
4511
  <g id="text_10">
4512
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="453.712139" transform="rotate(-0 53.23 453.712139)">0.025</text>
4513
  </g>
4514
  </g>
4515
  <g id="ytick_2">
4516
  <g id="grid-y--3" class="grid grid-y">
4517
+ <path d="M 60.23 371.483588 L 847.294169 371.483588 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4518
  </g>
4519
  <g id="line2d_11">
4520
  <g>
4521
+ <use ns4:href="#m0fca2865ba" x="60.23" y="371.483588" style="stroke: #000000; stroke-width: 0.8" />
4522
  </g>
4523
  </g>
4524
  <g id="text_11">
4525
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="375.282806" transform="rotate(-0 53.23 375.282806)">0.030</text>
4526
  </g>
4527
  </g>
4528
  <g id="ytick_3">
4529
  <g id="grid-y--4" class="grid grid-y">
4530
+ <path d="M 60.23 293.054255 L 847.294169 293.054255 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4531
  </g>
4532
  <g id="line2d_12">
4533
  <g>
4534
+ <use ns4:href="#m0fca2865ba" x="60.23" y="293.054255" style="stroke: #000000; stroke-width: 0.8" />
4535
  </g>
4536
  </g>
4537
  <g id="text_12">
4538
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="296.853473" transform="rotate(-0 53.23 296.853473)">0.035</text>
4539
  </g>
4540
  </g>
4541
  <g id="ytick_4">
4542
  <g id="grid-y--5" class="grid grid-y">
4543
+ <path d="M 60.23 214.624922 L 847.294169 214.624922 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4544
  </g>
4545
  <g id="line2d_13">
4546
  <g>
4547
+ <use ns4:href="#m0fca2865ba" x="60.23" y="214.624922" style="stroke: #000000; stroke-width: 0.8" />
4548
  </g>
4549
  </g>
4550
  <g id="text_13">
4551
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="218.42414" transform="rotate(-0 53.23 218.42414)">0.040</text>
4552
  </g>
4553
  </g>
4554
  <g id="ytick_5">
4555
  <g id="grid-y--6" class="grid grid-y">
4556
+ <path d="M 60.23 136.195589 L 847.294169 136.195589 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4557
  </g>
4558
  <g id="line2d_14">
4559
  <g>
4560
+ <use ns4:href="#m0fca2865ba" x="60.23" y="136.195589" style="stroke: #000000; stroke-width: 0.8" />
4561
  </g>
4562
  </g>
4563
  <g id="text_14">
4564
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="139.994807" transform="rotate(-0 53.23 139.994807)">0.045</text>
4565
  </g>
4566
  </g>
4567
  <g id="ytick_6">
4568
  <g id="grid-y--7" class="grid grid-y">
4569
+ <path d="M 60.23 57.766256 L 847.294169 57.766256 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4570
  </g>
4571
  <g id="line2d_15">
4572
  <g>
4573
+ <use ns4:href="#m0fca2865ba" x="60.23" y="57.766256" style="stroke: #000000; stroke-width: 0.8" />
4574
  </g>
4575
  </g>
4576
  <g id="text_15">
4577
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="61.565474" transform="rotate(-0 53.23 61.565474)">0.050</text>
4578
  </g>
4579
  </g>
4580
  <g id="label--y" class="ylabel">
 
4582
  </g>
4583
  </g>
4584
  <g id="series--hf-kernels-swiglu" class="series">
4585
+ <path d="M 96.005644 451.16779 L 185.444754 386.071444 L 274.883864 387.781203 L 364.322974 389.506648 L 453.762084 389.992912 L 543.201194 390.290944 L 632.640304 396.894691 L 722.079415 389.992908 L 811.518525 394.228094 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4586
  <defs>
4587
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4588
  </defs>
4589
  <g clip-path="url(#p620c7d392f)">
4590
  <use ns4:href="#md7efaf3aec" x="96.005644" y="451.16779" style="fill: #1f77b4; stroke: #1f77b4" />
4591
+ <use ns4:href="#md7efaf3aec" x="185.444754" y="386.071444" style="fill: #1f77b4; stroke: #1f77b4" />
4592
+ <use ns4:href="#md7efaf3aec" x="274.883864" y="387.781203" style="fill: #1f77b4; stroke: #1f77b4" />
4593
+ <use ns4:href="#md7efaf3aec" x="364.322974" y="389.506648" style="fill: #1f77b4; stroke: #1f77b4" />
4594
+ <use ns4:href="#md7efaf3aec" x="453.762084" y="389.992912" style="fill: #1f77b4; stroke: #1f77b4" />
4595
+ <use ns4:href="#md7efaf3aec" x="543.201194" y="390.290944" style="fill: #1f77b4; stroke: #1f77b4" />
4596
+ <use ns4:href="#md7efaf3aec" x="632.640304" y="396.894691" style="fill: #1f77b4; stroke: #1f77b4" />
4597
+ <use ns4:href="#md7efaf3aec" x="722.079415" y="389.992908" style="fill: #1f77b4; stroke: #1f77b4" />
4598
+ <use ns4:href="#md7efaf3aec" x="811.518525" y="394.228094" style="fill: #1f77b4; stroke: #1f77b4" />
4599
  </g>
4600
  </g>
4601
  <g id="series--torch-eager" class="series">
4602
+ <path d="M 96.005644 171.802506 L 185.444754 47.08418 L 274.883864 55.554548 L 364.322974 56.966277 L 453.762084 62.45633 L 543.201194 80.651935 L 632.640304 68.416959 L 722.079415 69.358111 L 811.518525 82.847956 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4603
  <defs>
4604
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4605
  </defs>
4606
  <g clip-path="url(#p620c7d392f)">
4607
+ <use ns4:href="#m9b8c54d372" x="96.005644" y="171.802506" style="fill: #ff7f0e; stroke: #ff7f0e" />
4608
  <use ns4:href="#m9b8c54d372" x="185.444754" y="47.08418" style="fill: #ff7f0e; stroke: #ff7f0e" />
4609
+ <use ns4:href="#m9b8c54d372" x="274.883864" y="55.554548" style="fill: #ff7f0e; stroke: #ff7f0e" />
4610
+ <use ns4:href="#m9b8c54d372" x="364.322974" y="56.966277" style="fill: #ff7f0e; stroke: #ff7f0e" />
4611
+ <use ns4:href="#m9b8c54d372" x="453.762084" y="62.45633" style="fill: #ff7f0e; stroke: #ff7f0e" />
4612
+ <use ns4:href="#m9b8c54d372" x="543.201194" y="80.651935" style="fill: #ff7f0e; stroke: #ff7f0e" />
4613
+ <use ns4:href="#m9b8c54d372" x="632.640304" y="68.416959" style="fill: #ff7f0e; stroke: #ff7f0e" />
4614
+ <use ns4:href="#m9b8c54d372" x="722.079415" y="69.358111" style="fill: #ff7f0e; stroke: #ff7f0e" />
4615
+ <use ns4:href="#m9b8c54d372" x="811.518525" y="82.847956" style="fill: #ff7f0e; stroke: #ff7f0e" />
4616
  </g>
4617
  </g>
4618
  <g id="patch_3">
causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl CHANGED
@@ -1,24 +1,24 @@
1
- {"ts": "2025-11-10T21:59:05Z", "run": "2e4d4658589243d8bcde88068971c4df", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S128_W2", "batch": 2, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0682910000477932, "p50": 0.0693509999791786, "p90": 0.06985099997791622, "mean": 0.0695532000008825, "iqr": 0.0006490000146186503, "raw_times": [0.0710710000362269, 0.06985099997791622, 0.0693509999791786, 0.0682910000477932, 0.06920199996329757], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.07755100000395032, "peak_bytes": 295936, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
2
- {"ts": "2025-11-10T21:59:05Z", "run": "2e4d4658589243d8bcde88068971c4df", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S128_W4", "batch": 2, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08078100000830091, "p50": 0.08191099999521612, "p90": 0.08361100003639876, "mean": 0.08277140000245709, "iqr": 0.0022690000491820683, "raw_times": [0.08191099999521612, 0.08361100003639876, 0.08078100000830091, 0.0813419999872167, 0.08621199998515294], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08947200001330202, "peak_bytes": 296448, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
3
- {"ts": "2025-11-10T21:59:05Z", "run": "2e4d4658589243d8bcde88068971c4df", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S512_W2", "batch": 2, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.07914100001471525, "p50": 0.0805310000373538, "p90": 0.08094200001096397, "mean": 0.08066520001648314, "iqr": 0.0007310000000870787, "raw_times": [0.0805310000373538, 0.0825010000085058, 0.08094200001096397, 0.08021100001087689, 0.07914100001471525], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0867210000023988, "peak_bytes": 1180672, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
4
- {"ts": "2025-11-10T21:59:05Z", "run": "2e4d4658589243d8bcde88068971c4df", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S512_W4", "batch": 2, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08032099998445119, "p50": 0.08104099998718084, "p90": 0.08252100002437146, "mean": 0.08145320000494394, "iqr": 0.0017790000015338592, "raw_times": [0.08104099998718084, 0.0826410000058786, 0.08252100002437146, 0.0807420000228376, 0.08032099998445119], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08690100003150292, "peak_bytes": 1181184, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
5
- {"ts": "2025-11-10T21:59:05Z", "run": "2e4d4658589243d8bcde88068971c4df", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S2048_W2", "batch": 2, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08014200000161509, "p50": 0.08045200002015918, "p90": 0.08049199999504708, "mean": 0.08065180001040062, "iqr": 9.099994713324122e-05, "raw_times": [0.08049199999504708, 0.08177199998726792, 0.08045200002015918, 0.08014200000161509, 0.08040100004791384], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08574200001021381, "peak_bytes": 4719616, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
6
- {"ts": "2025-11-10T21:59:05Z", "run": "2e4d4658589243d8bcde88068971c4df", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S2048_W4", "batch": 2, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08046099998182399, "p50": 0.08075099998450241, "p90": 0.08139099998061283, "mean": 0.0811031999774059, "iqr": 0.0006690000304843124, "raw_times": [0.08075099998450241, 0.08046099998182399, 0.08072199995012852, 0.08139099998061283, 0.08219099998996171], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08660099996404824, "peak_bytes": 4720128, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
7
- {"ts": "2025-11-10T21:59:05Z", "run": "2e4d4658589243d8bcde88068971c4df", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S128_W2", "batch": 2, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.07871100001466402, "p50": 0.0798610000174449, "p90": 0.08112200004006809, "mean": 0.07994760002247858, "iqr": 0.0023900000201138027, "raw_times": [0.07873200001995428, 0.07871100001466402, 0.08112200004006809, 0.08131200002026162, 0.0798610000174449], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0850010000021939, "peak_bytes": 9461760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
8
- {"ts": "2025-11-10T21:59:05Z", "run": "2e4d4658589243d8bcde88068971c4df", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S128_W4", "batch": 2, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08012099999632483, "p50": 0.08051100002148814, "p90": 0.08112199998322467, "mean": 0.08078719999957684, "iqr": 0.0007209999921542476, "raw_times": [0.08051100002148814, 0.08040099999107042, 0.08012099999632483, 0.08112199998322467, 0.08178100000577615], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08605099998248988, "peak_bytes": 9478144, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
9
- {"ts": "2025-11-10T21:59:05Z", "run": "2e4d4658589243d8bcde88068971c4df", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S512_W2", "batch": 2, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.07783099999869592, "p50": 0.07915100002264808, "p90": 0.07922200001075907, "mean": 0.0788234000083321, "iqr": 0.0006800000278417428, "raw_times": [0.07915100002264808, 0.0793710000266401, 0.07854199998291733, 0.07922200001075907, 0.07783099999869592], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08282100003498272, "peak_bytes": 37773312, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
10
- {"ts": "2025-11-10T21:59:05Z", "run": "2e4d4658589243d8bcde88068971c4df", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S512_W4", "batch": 2, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.07959199996321331, "p50": 0.08125099998324004, "p90": 0.08163100000047052, "mean": 0.08102919999828373, "iqr": 0.0016999999843392288, "raw_times": [0.08163100000047052, 0.08274100002836349, 0.07959199996321331, 0.07993100001613129, 0.08125099998324004], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08671099999446596, "peak_bytes": 37789696, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
11
- {"ts": "2025-11-10T21:59:05Z", "run": "2e4d4658589243d8bcde88068971c4df", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S2048_W2", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.15852199999244476, "p50": 0.15985300001375435, "p90": 0.15988199999128483, "mean": 0.15945239999837213, "iqr": 0.0009890000001178123, "raw_times": [0.15988199999128483, 0.15985300001375435, 0.1601120000032097, 0.15889299999116702, 0.15852199999244476], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.16028200002438098, "peak_bytes": 151019520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
12
- {"ts": "2025-11-10T21:59:05Z", "run": "2e4d4658589243d8bcde88068971c4df", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S2048_W4", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.16347299998642484, "p50": 0.1640229999679832, "p90": 0.16425199999048345, "mean": 0.1702108000017688, "iqr": 0.00036999995245423634, "raw_times": [0.16388200003802922, 0.1954240000259233, 0.16347299998642484, 0.1640229999679832, 0.16425199999048345], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.1656729999695017, "peak_bytes": 151035904, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
13
- {"ts": "2025-11-10T21:59:05Z", "run": "2e4d4658589243d8bcde88068971c4df", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S128_W2", "batch": 4, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.07862100000011196, "p50": 0.07903099998429752, "p90": 0.08047100004660024, "mean": 0.08115100000622988, "iqr": 0.001730000064981141, "raw_times": [0.07862100000011196, 0.0787409999816191, 0.08047100004660024, 0.08889100001852057, 0.07903099998429752], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08358100001260027, "peak_bytes": 33727488, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
14
- {"ts": "2025-11-10T21:59:05Z", "run": "2e4d4658589243d8bcde88068971c4df", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S128_W4", "batch": 4, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08011200003466001, "p50": 0.0809910000043601, "p90": 0.08145100002820982, "mean": 0.0812654000128532, "iqr": 0.0010090000159834744, "raw_times": [0.08044200001222634, 0.08333099998480975, 0.08145100002820982, 0.0809910000043601, 0.08011200003466001], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08477100004711247, "peak_bytes": 591360, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
15
- {"ts": "2025-11-10T21:59:05Z", "run": "2e4d4658589243d8bcde88068971c4df", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S512_W2", "batch": 4, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.07822099996701581, "p50": 0.07905100000016319, "p90": 0.07909100003189451, "mean": 0.07904699999699005, "iqr": 0.00011000003041772288, "raw_times": [0.07822099996701581, 0.07989099998439997, 0.07898100000147679, 0.07905100000016319, 0.07909100003189451], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08219099998996171, "peak_bytes": 2360320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
16
- {"ts": "2025-11-10T21:59:05Z", "run": "2e4d4658589243d8bcde88068971c4df", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S512_W4", "batch": 4, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.07998199998837663, "p50": 0.08066199995937495, "p90": 0.08123099996737437, "mean": 0.08075179998741078, "iqr": 0.0008489999459015962, "raw_times": [0.07998199998837663, 0.08150200000045515, 0.08123099996737437, 0.08066199995937495, 0.08038200002147278], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08594100000891558, "peak_bytes": 2360832, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
17
- {"ts": "2025-11-10T21:59:05Z", "run": "2e4d4658589243d8bcde88068971c4df", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S2048_W2", "batch": 4, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.07915100002264808, "p50": 0.0798309999936464, "p90": 0.07988099997646714, "mean": 0.07979119999390605, "iqr": 0.0006789999815737247, "raw_times": [0.07915100002264808, 0.08089099998187521, 0.07988099997646714, 0.07920199999489341, 0.0798309999936464], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08425199996509036, "peak_bytes": 9438208, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
18
- {"ts": "2025-11-10T21:59:05Z", "run": "2e4d4658589243d8bcde88068971c4df", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S2048_W4", "batch": 4, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08051100002148814, "p50": 0.08109199995942618, "p90": 0.08230200000980403, "mean": 0.08190759998569774, "iqr": 0.0016110000160551863, "raw_times": [0.08109199995942618, 0.08494199994402152, 0.08069099999374885, 0.08051100002148814, 0.08230200000980403], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09682099999963611, "peak_bytes": 9438720, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
19
- {"ts": "2025-11-10T21:59:05Z", "run": "2e4d4658589243d8bcde88068971c4df", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S128_W2", "batch": 4, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.07942100000946084, "p50": 0.08037099996727193, "p90": 0.08038100003204818, "mean": 0.08024699999396034, "iqr": 0.0001800000291041215, "raw_times": [0.07942100000946084, 0.08038100003204818, 0.08020100000294406, 0.08037099996727193, 0.08086099995807672], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0851419999889913, "peak_bytes": 18931712, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
20
- {"ts": "2025-11-10T21:59:05Z", "run": "2e4d4658589243d8bcde88068971c4df", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S128_W4", "batch": 4, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.07963100000552004, "p50": 0.08073099996863675, "p90": 0.08166100002426901, "mean": 0.08793740000783146, "iqr": 0.0012990000186619, "raw_times": [0.07963100000552004, 0.08073099996863675, 0.11730200003512437, 0.08166100002426901, 0.08036200000560711], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08495099996252975, "peak_bytes": 18948096, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
21
- {"ts": "2025-11-10T21:59:05Z", "run": "2e4d4658589243d8bcde88068971c4df", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S512_W2", "batch": 4, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0912609999659253, "p50": 0.09163199996464755, "p90": 0.09213200002022859, "mean": 0.09177579999004593, "iqr": 0.0006200000370881753, "raw_times": [0.09151199998314041, 0.09163199996464755, 0.0912609999659253, 0.09213200002022859, 0.09234200001628778], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0949509999941256, "peak_bytes": 75522048, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
22
- {"ts": "2025-11-10T21:59:05Z", "run": "2e4d4658589243d8bcde88068971c4df", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S512_W4", "batch": 4, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09837099997866972, "p50": 0.09879099997078811, "p90": 0.09935200000654731, "mean": 0.09890359998507847, "iqr": 0.0009400000067216752, "raw_times": [0.09841199999982564, 0.09837099997866972, 0.09879099997078811, 0.09959199996956158, 0.09935200000654731], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.10164200000417623, "peak_bytes": 75538432, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
23
- {"ts": "2025-11-10T21:59:05Z", "run": "2e4d4658589243d8bcde88068971c4df", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S2048_W2", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.47857700002396086, "p50": 0.48315699996237527, "p90": 0.4835080000020753, "mean": 0.48229559999981575, "iqr": 0.0009899999895424116, "raw_times": [0.48251800001253287, 0.4837179999981345, 0.4835080000020753, 0.47857700002396086, 0.48315699996237527], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.48609800001031545, "peak_bytes": 302014464, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
24
- {"ts": "2025-11-10T21:59:05Z", "run": "2e4d4658589243d8bcde88068971c4df", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S2048_W4", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.49741800000902003, "p50": 0.5014380000147867, "p90": 0.5020579999950314, "mean": 0.501099999996768, "iqr": 0.0024500000108673703, "raw_times": [0.5020579999950314, 0.5014380000147867, 0.5049779999808379, 0.49960799998416405, 0.49741800000902003], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.5005179999670872, "peak_bytes": 302030848, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
 
1
+ {"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S128_W2", "batch": 2, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.07121199996618088, "p50": 0.07280099998752121, "p90": 0.07319100001268453, "mean": 0.07264140000415864, "iqr": 0.0009299999987888441, "raw_times": [0.07226100001389568, 0.07280099998752121, 0.07121199996618088, 0.07319100001268453, 0.0737420000405109], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.07994200001348872, "peak_bytes": 295936, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
2
+ {"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S128_W4", "batch": 2, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08309200001122008, "p50": 0.08365100001128667, "p90": 0.08423100001664352, "mean": 0.08372920000283557, "iqr": 0.0009500000146545062, "raw_times": [0.08328100000198901, 0.08439099997303856, 0.08365100001128667, 0.08309200001122008, 0.08423100001664352], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08683200002224112, "peak_bytes": 296448, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
3
+ {"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S512_W2", "batch": 2, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08177099999784332, "p50": 0.08268100003760992, "p90": 0.08301100001517625, "mean": 0.08273520002148871, "iqr": 0.0009299999987888441, "raw_times": [0.0820810000163874, 0.08413200004042665, 0.08301100001517625, 0.08268100003760992, 0.08177099999784332], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08564099999830432, "peak_bytes": 1180672, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
4
+ {"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S512_W4", "batch": 2, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08037100002411535, "p50": 0.0813119999634182, "p90": 0.08183099998859689, "mean": 0.08147719998987668, "iqr": 0.0007300000106624793, "raw_times": [0.08037100002411535, 0.08183099998859689, 0.08277099999531856, 0.08110099997793441, 0.0813119999634182], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08382099997561454, "peak_bytes": 1181184, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
5
+ {"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S2048_W2", "batch": 2, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.07959199996321331, "p50": 0.08119099999248647, "p90": 0.08157100000971695, "mean": 0.08080360000803921, "iqr": 0.0016889999869817984, "raw_times": [0.08157100000971695, 0.08178200005204417, 0.08119099999248647, 0.07988200002273516, 0.07959199996321331], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08668100002751089, "peak_bytes": 4719616, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
6
+ {"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S2048_W4", "batch": 2, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0801809999870784, "p50": 0.08213100005605156, "p90": 0.08258200000454963, "mean": 0.08208560000184661, "iqr": 0.0013300000318849925, "raw_times": [0.0801809999870784, 0.08213100005605156, 0.08428199998888886, 0.08258200000454963, 0.08125199997266463], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08745099995621786, "peak_bytes": 4720128, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
7
+ {"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S128_W2", "batch": 2, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.07887199996048366, "p50": 0.08016100002805615, "p90": 0.08020199999236866, "mean": 0.0800293999986934, "iqr": 7.099998811099795e-05, "raw_times": [0.08078100000830091, 0.08013100000425766, 0.08016100002805615, 0.08020199999236866, 0.07887199996048366], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0863010000102804, "peak_bytes": 9461760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
8
+ {"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S128_W4", "batch": 2, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.07991100000026563, "p50": 0.0806710000347266, "p90": 0.08072099996070392, "mean": 0.08052299999690149, "iqr": 0.0007399999617518915, "raw_times": [0.0806710000347266, 0.07998099999895203, 0.08133099998985927, 0.08072099996070392, 0.07991100000026563], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08638100001689963, "peak_bytes": 9478144, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
9
+ {"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S512_W2", "batch": 2, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.07996199997251097, "p50": 0.08190199997670788, "p90": 0.08266099996490084, "mean": 0.08240359998126223, "iqr": 0.0024089999897114467, "raw_times": [0.07996199997251097, 0.08724100001700208, 0.08266099996490084, 0.08025199997518939, 0.08190199997670788], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0852109999982531, "peak_bytes": 37773312, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
10
+ {"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S512_W4", "batch": 2, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08023199995932373, "p50": 0.08207100000845458, "p90": 0.08239099997808808, "mean": 0.08726959998739403, "iqr": 0.0012489999789977446, "raw_times": [0.08239099997808808, 0.11051199999201344, 0.08023199995932373, 0.08114199999909033, 0.08207100000845458], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0862620000248171, "peak_bytes": 37789696, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
11
+ {"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S2048_W2", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.1583530000175415, "p50": 0.15909299997929338, "p90": 0.15925299999253184, "mean": 0.15925679999782005, "iqr": 0.00036099999078942346, "raw_times": [0.15909299997929338, 0.15889200000174242, 0.16069299999799114, 0.15925299999253184, 0.1583530000175415], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.1604330000191112, "peak_bytes": 151019520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
12
+ {"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S2048_W4", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.16316300002472417, "p50": 0.164162999965356, "p90": 0.16488300002492906, "mean": 0.16412500000342334, "iqr": 0.001400000030571391, "raw_times": [0.164162999965356, 0.16316300002472417, 0.16488300002492906, 0.1649330000077498, 0.16348299999435767], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.16465299995616078, "peak_bytes": 151035904, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
13
+ {"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S128_W2", "batch": 4, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.07947099999228158, "p50": 0.08210099997540965, "p90": 0.08814200003826045, "mean": 0.08720339999399584, "iqr": 0.0077410000471900275, "raw_times": [0.08814200003826045, 0.08040099999107042, 0.10590199997295713, 0.08210099997540965, 0.07947099999228158], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08537200000091616, "peak_bytes": 33727488, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
14
+ {"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S128_W4", "batch": 4, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.07920100000546881, "p50": 0.0799709999910192, "p90": 0.0812010000004193, "mean": 0.08044520000112243, "iqr": 0.0014700000292577897, "raw_times": [0.07973099997116151, 0.08212200003754333, 0.0812010000004193, 0.07920100000546881, 0.0799709999910192], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0854219999837369, "peak_bytes": 591360, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
15
+ {"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S512_W2", "batch": 4, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.07912099999884958, "p50": 0.08021200000030149, "p90": 0.08024099997783196, "mean": 0.07987539999021465, "iqr": 0.0010200000133409048, "raw_times": [0.07912099999884958, 0.07922099996449106, 0.08021200000030149, 0.08024099997783196, 0.08058200000959914], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.084330999982285, "peak_bytes": 2360320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
16
+ {"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S512_W4", "batch": 4, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08098199998585187, "p50": 0.08176099998991049, "p90": 0.08322100001123545, "mean": 0.08342759999777627, "iqr": 0.0014889999988554337, "raw_times": [0.08098199998585187, 0.08322100001123545, 0.08176099998991049, 0.08173200001238001, 0.08944199998950353], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08506099999294747, "peak_bytes": 2360832, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
17
+ {"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S2048_W2", "batch": 4, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.07974099997909434, "p50": 0.0811310000017329, "p90": 0.08221100000582737, "mean": 0.08252540000057706, "iqr": 0.001659000020026724, "raw_times": [0.07974099997909434, 0.08221100000582737, 0.0811310000017329, 0.08899200003043006, 0.08055199998580065], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0852420000114762, "peak_bytes": 9438208, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
18
+ {"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S2048_W4", "batch": 4, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08050200000297991, "p50": 0.08192199999257355, "p90": 0.08227199998600554, "mean": 0.08175159999836978, "iqr": 0.000530999955117295, "raw_times": [0.08174100003088824, 0.08050200000297991, 0.08192199999257355, 0.08227199998600554, 0.08232099997940168], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0854219999837369, "peak_bytes": 9438720, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
19
+ {"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S128_W2", "batch": 4, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.07911099999091675, "p50": 0.08030099996858553, "p90": 0.08107100001097933, "mean": 0.08047899999610308, "iqr": 0.0018000000068241206, "raw_times": [0.08030099996858553, 0.0826410000058786, 0.08107100001097933, 0.07911099999091675, 0.07927100000415521], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0837320000073305, "peak_bytes": 18931712, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
20
+ {"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S128_W4", "batch": 4, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0812719999885303, "p50": 0.08212100004811873, "p90": 0.08214100000714097, "mean": 0.08211120000396477, "iqr": 0.00043000000005122274, "raw_times": [0.08171100000708975, 0.08212100004811873, 0.08331099996894409, 0.08214100000714097, 0.0812719999885303], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0876720000064779, "peak_bytes": 18948096, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
21
+ {"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S512_W2", "batch": 4, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09240200000704135, "p50": 0.09415100004162014, "p90": 0.09504199999810226, "mean": 0.09392180000986627, "iqr": 0.002310000013494573, "raw_times": [0.09240200000704135, 0.09273199998460768, 0.09504199999810226, 0.09528200001795994, 0.09415100004162014], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09613200001012956, "peak_bytes": 75522048, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
22
+ {"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S512_W4", "batch": 4, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09898100000782506, "p50": 0.09988099998281541, "p90": 0.10064200000670098, "mean": 0.1000036000050386, "iqr": 0.000919999990856013, "raw_times": [0.09972200001584497, 0.09898100000782506, 0.09988099998281541, 0.10079200001200661, 0.10064200000670098], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.10166200002004189, "peak_bytes": 75538432, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
23
+ {"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S2048_W2", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.4828179999663007, "p50": 0.4840080000008129, "p90": 0.4856980000340627, "mean": 0.48515199999883407, "iqr": 0.0026800000227922283, "raw_times": [0.4830180000112705, 0.4828179999663007, 0.4856980000340627, 0.49021799998172355, 0.4840080000008129], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.4863379999733297, "peak_bytes": 302014464, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
24
+ {"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S2048_W4", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.49664799996662623, "p50": 0.4980280000381754, "p90": 0.5011090000266449, "mean": 0.4989826000155517, "iqr": 0.004230000001825829, "raw_times": [0.4980280000381754, 0.502249000021493, 0.49664799996662623, 0.4968790000248191, 0.5011090000266449], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.49949900000001435, "peak_bytes": 302030848, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
causal_conv1d/impls/hf_kernels_causal_conv1d.html CHANGED
The diff for this file is too large to render. See raw diff
 
causal_conv1d/impls/torch_causal_conv1d.html CHANGED
The diff for this file is too large to render. See raw diff
 
causal_conv1d/results/artifacts/combine/latency.svg CHANGED

Git LFS Details

  • SHA256: 62198a37ec11e9842df4a67d55c5b1bec2c5617a8dd04d029f52a460eb48ca2f
  • Pointer size: 130 Bytes
  • Size of remote file: 35.4 kB

Git LFS Details

  • SHA256: 69a09b74de90d8a70bffd10eab24eac79df8e4954f3e91129689ad6a56422eed
  • Pointer size: 130 Bytes
  • Size of remote file: 35.4 kB
causal_conv1d/results/combined_results.html CHANGED
@@ -3874,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
3874
  <div class="system-info">
3875
  <div class="system-info-header">Generated on:</div>
3876
  <div class="system-info-content">
3877
- Linux x86_64 | Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35
3878
  </div>
3879
  </div>
3880
 
@@ -3889,11 +3889,11 @@ body[data-tool="eraser"] .main-content {
3889
  <rdf:RDF>
3890
  <ns2:Work>
3891
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
3892
- <dc:date>2025-11-10T22:12:01.020731</dc:date>
3893
  <dc:format>image/svg+xml</dc:format>
3894
  <dc:creator>
3895
  <ns2:Agent>
3896
- <dc:title>Matplotlib v3.10.7, https://matplotlib.org/</dc:title>
3897
  </ns2:Agent>
3898
  </dc:creator>
3899
  </ns2:Work>
@@ -4233,70 +4233,70 @@ body[data-tool="eraser"] .main-content {
4233
  <g id="matplotlib.axis_2">
4234
  <g id="ytick_1">
4235
  <g id="grid-y--2" class="grid grid-y">
4236
- <path d="M 47.72 373.364114 L 831.034248 373.364114 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4237
  </g>
4238
  <g id="line2d_25">
4239
  <defs>
4240
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4241
  </defs>
4242
  <g>
4243
- <use ns4:href="#m0fca2865ba" x="47.72" y="373.364114" style="stroke: #000000; stroke-width: 0.8" />
4244
  </g>
4245
  </g>
4246
  <g id="text_25">
4247
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="377.163333" transform="rotate(-0 40.72 377.163333)">0.1</text>
4248
  </g>
4249
  </g>
4250
  <g id="ytick_2">
4251
  <g id="grid-y--3" class="grid grid-y">
4252
- <path d="M 47.72 291.718825 L 831.034248 291.718825 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4253
  </g>
4254
  <g id="line2d_26">
4255
  <g>
4256
- <use ns4:href="#m0fca2865ba" x="47.72" y="291.718825" style="stroke: #000000; stroke-width: 0.8" />
4257
  </g>
4258
  </g>
4259
  <g id="text_26">
4260
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="295.518044" transform="rotate(-0 40.72 295.518044)">0.2</text>
4261
  </g>
4262
  </g>
4263
  <g id="ytick_3">
4264
  <g id="grid-y--4" class="grid grid-y">
4265
- <path d="M 47.72 210.073536 L 831.034248 210.073536 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4266
  </g>
4267
  <g id="line2d_27">
4268
  <g>
4269
- <use ns4:href="#m0fca2865ba" x="47.72" y="210.073536" style="stroke: #000000; stroke-width: 0.8" />
4270
  </g>
4271
  </g>
4272
  <g id="text_27">
4273
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="213.872755" transform="rotate(-0 40.72 213.872755)">0.3</text>
4274
  </g>
4275
  </g>
4276
  <g id="ytick_4">
4277
  <g id="grid-y--5" class="grid grid-y">
4278
- <path d="M 47.72 128.428247 L 831.034248 128.428247 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4279
  </g>
4280
  <g id="line2d_28">
4281
  <g>
4282
- <use ns4:href="#m0fca2865ba" x="47.72" y="128.428247" style="stroke: #000000; stroke-width: 0.8" />
4283
  </g>
4284
  </g>
4285
  <g id="text_28">
4286
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="132.227466" transform="rotate(-0 40.72 132.227466)">0.4</text>
4287
  </g>
4288
  </g>
4289
  <g id="ytick_5">
4290
  <g id="grid-y--6" class="grid grid-y">
4291
- <path d="M 47.72 46.782958 L 831.034248 46.782958 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4292
  </g>
4293
  <g id="line2d_29">
4294
  <g>
4295
- <use ns4:href="#m0fca2865ba" x="47.72" y="46.782958" style="stroke: #000000; stroke-width: 0.8" />
4296
  </g>
4297
  </g>
4298
  <g id="text_29">
4299
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="50.582177" transform="rotate(-0 40.72 50.582177)">0.5</text>
4300
  </g>
4301
  </g>
4302
  <g id="label--y" class="ylabel">
@@ -4304,66 +4304,66 @@ body[data-tool="eraser"] .main-content {
4304
  </g>
4305
  </g>
4306
  <g id="series--hf-kernels-causal-conv1d" class="series">
4307
- <path d="M 83.325193 420.186871 L 114.286231 413.573602 L 145.247268 414.178594 L 176.208306 414.545181 L 207.169343 415.255495 L 238.130381 415.345305 L 269.091418 415.451444 L 300.052455 415.345305 L 331.013493 415.17385 L 361.97453 415.141192 L 392.935568 414.643155 L 423.896605 413.916512 L 454.857643 415.614734 L 485.81868 415.802519 L 516.779718 414.790117 L 547.740755 414.667649 L 578.701793 414.234929 L 609.66283 413.843032 L 640.623868 415.648209 L 671.584905 414.553346 L 702.545943 414.977901 L 733.50698 414.90442 L 764.468018 415.475937 L 795.429055 415.026888 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4308
  <defs>
4309
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4310
  </defs>
4311
  <g clip-path="url(#pb49fc4c8d2)">
4312
  <use ns4:href="#md7efaf3aec" x="83.325193" y="420.186871" style="fill: #1f77b4; stroke: #1f77b4" />
4313
- <use ns4:href="#md7efaf3aec" x="114.286231" y="413.573602" style="fill: #1f77b4; stroke: #1f77b4" />
4314
- <use ns4:href="#md7efaf3aec" x="145.247268" y="414.178594" style="fill: #1f77b4; stroke: #1f77b4" />
4315
- <use ns4:href="#md7efaf3aec" x="176.208306" y="414.545181" style="fill: #1f77b4; stroke: #1f77b4" />
4316
- <use ns4:href="#md7efaf3aec" x="207.169343" y="415.255495" style="fill: #1f77b4; stroke: #1f77b4" />
4317
- <use ns4:href="#md7efaf3aec" x="238.130381" y="415.345305" style="fill: #1f77b4; stroke: #1f77b4" />
4318
- <use ns4:href="#md7efaf3aec" x="269.091418" y="415.451444" style="fill: #1f77b4; stroke: #1f77b4" />
4319
- <use ns4:href="#md7efaf3aec" x="300.052455" y="415.345305" style="fill: #1f77b4; stroke: #1f77b4" />
4320
- <use ns4:href="#md7efaf3aec" x="331.013493" y="415.17385" style="fill: #1f77b4; stroke: #1f77b4" />
4321
- <use ns4:href="#md7efaf3aec" x="361.97453" y="415.141192" style="fill: #1f77b4; stroke: #1f77b4" />
4322
- <use ns4:href="#md7efaf3aec" x="392.935568" y="414.643155" style="fill: #1f77b4; stroke: #1f77b4" />
4323
- <use ns4:href="#md7efaf3aec" x="423.896605" y="413.916512" style="fill: #1f77b4; stroke: #1f77b4" />
4324
- <use ns4:href="#md7efaf3aec" x="454.857643" y="415.614734" style="fill: #1f77b4; stroke: #1f77b4" />
4325
- <use ns4:href="#md7efaf3aec" x="485.81868" y="415.802519" style="fill: #1f77b4; stroke: #1f77b4" />
4326
- <use ns4:href="#md7efaf3aec" x="516.779718" y="414.790117" style="fill: #1f77b4; stroke: #1f77b4" />
4327
- <use ns4:href="#md7efaf3aec" x="547.740755" y="414.667649" style="fill: #1f77b4; stroke: #1f77b4" />
4328
- <use ns4:href="#md7efaf3aec" x="578.701793" y="414.234929" style="fill: #1f77b4; stroke: #1f77b4" />
4329
- <use ns4:href="#md7efaf3aec" x="609.66283" y="413.843032" style="fill: #1f77b4; stroke: #1f77b4" />
4330
- <use ns4:href="#md7efaf3aec" x="640.623868" y="415.648209" style="fill: #1f77b4; stroke: #1f77b4" />
4331
- <use ns4:href="#md7efaf3aec" x="671.584905" y="414.553346" style="fill: #1f77b4; stroke: #1f77b4" />
4332
- <use ns4:href="#md7efaf3aec" x="702.545943" y="414.977901" style="fill: #1f77b4; stroke: #1f77b4" />
4333
- <use ns4:href="#md7efaf3aec" x="733.50698" y="414.90442" style="fill: #1f77b4; stroke: #1f77b4" />
4334
- <use ns4:href="#md7efaf3aec" x="764.468018" y="415.475937" style="fill: #1f77b4; stroke: #1f77b4" />
4335
- <use ns4:href="#md7efaf3aec" x="795.429055" y="415.026888" style="fill: #1f77b4; stroke: #1f77b4" />
4336
  </g>
4337
  </g>
4338
  <g id="series--torch-eager" class="series">
4339
- <path d="M 83.325193 398.387578 L 114.286231 388.13293 L 145.247268 389.259635 L 176.208306 388.843244 L 207.169343 389.324135 L 238.130381 389.080016 L 269.091418 389.806659 L 300.052455 389.275964 L 331.013493 390.38634 L 361.97453 388.671789 L 392.935568 324.496959 L 423.896605 321.09235 L 454.857643 390.484314 L 485.81868 388.884067 L 516.779718 390.467985 L 547.740755 389.15268 L 578.701793 389.831152 L 609.66283 388.801605 L 640.623868 389.390268 L 671.584905 389.096345 L 702.545943 380.196192 L 733.50698 374.351205 L 764.468018 60.534474 L 795.429055 45.608899 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4340
  <defs>
4341
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4342
  </defs>
4343
  <g clip-path="url(#pb49fc4c8d2)">
4344
- <use ns4:href="#m9b8c54d372" x="83.325193" y="398.387578" style="fill: #ff7f0e; stroke: #ff7f0e" />
4345
- <use ns4:href="#m9b8c54d372" x="114.286231" y="388.13293" style="fill: #ff7f0e; stroke: #ff7f0e" />
4346
- <use ns4:href="#m9b8c54d372" x="145.247268" y="389.259635" style="fill: #ff7f0e; stroke: #ff7f0e" />
4347
- <use ns4:href="#m9b8c54d372" x="176.208306" y="388.843244" style="fill: #ff7f0e; stroke: #ff7f0e" />
4348
- <use ns4:href="#m9b8c54d372" x="207.169343" y="389.324135" style="fill: #ff7f0e; stroke: #ff7f0e" />
4349
- <use ns4:href="#m9b8c54d372" x="238.130381" y="389.080016" style="fill: #ff7f0e; stroke: #ff7f0e" />
4350
- <use ns4:href="#m9b8c54d372" x="269.091418" y="389.806659" style="fill: #ff7f0e; stroke: #ff7f0e" />
4351
- <use ns4:href="#m9b8c54d372" x="300.052455" y="389.275964" style="fill: #ff7f0e; stroke: #ff7f0e" />
4352
- <use ns4:href="#m9b8c54d372" x="331.013493" y="390.38634" style="fill: #ff7f0e; stroke: #ff7f0e" />
4353
- <use ns4:href="#m9b8c54d372" x="361.97453" y="388.671789" style="fill: #ff7f0e; stroke: #ff7f0e" />
4354
- <use ns4:href="#m9b8c54d372" x="392.935568" y="324.496959" style="fill: #ff7f0e; stroke: #ff7f0e" />
4355
- <use ns4:href="#m9b8c54d372" x="423.896605" y="321.09235" style="fill: #ff7f0e; stroke: #ff7f0e" />
4356
- <use ns4:href="#m9b8c54d372" x="454.857643" y="390.484314" style="fill: #ff7f0e; stroke: #ff7f0e" />
4357
- <use ns4:href="#m9b8c54d372" x="485.81868" y="388.884067" style="fill: #ff7f0e; stroke: #ff7f0e" />
4358
- <use ns4:href="#m9b8c54d372" x="516.779718" y="390.467985" style="fill: #ff7f0e; stroke: #ff7f0e" />
4359
- <use ns4:href="#m9b8c54d372" x="547.740755" y="389.15268" style="fill: #ff7f0e; stroke: #ff7f0e" />
4360
- <use ns4:href="#m9b8c54d372" x="578.701793" y="389.831152" style="fill: #ff7f0e; stroke: #ff7f0e" />
4361
- <use ns4:href="#m9b8c54d372" x="609.66283" y="388.801605" style="fill: #ff7f0e; stroke: #ff7f0e" />
4362
- <use ns4:href="#m9b8c54d372" x="640.623868" y="389.390268" style="fill: #ff7f0e; stroke: #ff7f0e" />
4363
- <use ns4:href="#m9b8c54d372" x="671.584905" y="389.096345" style="fill: #ff7f0e; stroke: #ff7f0e" />
4364
- <use ns4:href="#m9b8c54d372" x="702.545943" y="380.196192" style="fill: #ff7f0e; stroke: #ff7f0e" />
4365
- <use ns4:href="#m9b8c54d372" x="733.50698" y="374.351205" style="fill: #ff7f0e; stroke: #ff7f0e" />
4366
- <use ns4:href="#m9b8c54d372" x="764.468018" y="60.534474" style="fill: #ff7f0e; stroke: #ff7f0e" />
4367
  <use ns4:href="#m9b8c54d372" x="795.429055" y="45.608899" style="fill: #ff7f0e; stroke: #ff7f0e" />
4368
  </g>
4369
  </g>
@@ -4422,7 +4422,7 @@ body[data-tool="eraser"] .main-content {
4422
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
4423
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
4424
  </span> |
4425
- Cell: combine | 4.73s
4426
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
4427
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
4428
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
@@ -4515,7 +4515,7 @@ hf_kernels_causal_conv1d cuda_B2_D2048_S2048_W2 0.05 True
4515
  hf_kernels_causal_conv1d cuda_B2_D2048_S2048_W4 0.05 True
4516
  hf_kernels_causal_conv1d cuda_B2_D2048_S512_W2 0.05 True
4517
  hf_kernels_causal_conv1d cuda_B2_D2048_S512_W4 0.05 True
4518
- hf_kernels_causal_conv1d cuda_B2_D64_S128_W2 0.04 True
4519
  hf_kernels_causal_conv1d cuda_B2_D64_S128_W4 0.05 True
4520
  hf_kernels_causal_conv1d cuda_B2_D64_S2048_W2 0.05 True
4521
  hf_kernels_causal_conv1d cuda_B2_D64_S2048_W4 0.05 True
@@ -4576,7 +4576,7 @@ Implementations included:
4576
  <div class="uv-install-logs" id="uv-logs-combine">
4577
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4578
  <div class="uv-logs-content" style="display: none;">
4579
- Installed 37 packages in 336ms
4580
  </div>
4581
  </div>
4582
  <div class="cell-artifacts">
@@ -4589,11 +4589,11 @@ Installed 37 packages in 336ms
4589
  <rdf:RDF>
4590
  <ns2:Work>
4591
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4592
- <dc:date>2025-11-10T22:12:01.020731</dc:date>
4593
  <dc:format>image/svg+xml</dc:format>
4594
  <dc:creator>
4595
  <ns2:Agent>
4596
- <dc:title>Matplotlib v3.10.7, https://matplotlib.org/</dc:title>
4597
  </ns2:Agent>
4598
  </dc:creator>
4599
  </ns2:Work>
@@ -4933,70 +4933,70 @@ Installed 37 packages in 336ms
4933
  <g id="matplotlib.axis_2">
4934
  <g id="ytick_1">
4935
  <g id="grid-y--2" class="grid grid-y">
4936
- <path d="M 47.72 373.364114 L 831.034248 373.364114 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4937
  </g>
4938
  <g id="line2d_25">
4939
  <defs>
4940
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4941
  </defs>
4942
  <g>
4943
- <use ns4:href="#m0fca2865ba" x="47.72" y="373.364114" style="stroke: #000000; stroke-width: 0.8" />
4944
  </g>
4945
  </g>
4946
  <g id="text_25">
4947
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="377.163333" transform="rotate(-0 40.72 377.163333)">0.1</text>
4948
  </g>
4949
  </g>
4950
  <g id="ytick_2">
4951
  <g id="grid-y--3" class="grid grid-y">
4952
- <path d="M 47.72 291.718825 L 831.034248 291.718825 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4953
  </g>
4954
  <g id="line2d_26">
4955
  <g>
4956
- <use ns4:href="#m0fca2865ba" x="47.72" y="291.718825" style="stroke: #000000; stroke-width: 0.8" />
4957
  </g>
4958
  </g>
4959
  <g id="text_26">
4960
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="295.518044" transform="rotate(-0 40.72 295.518044)">0.2</text>
4961
  </g>
4962
  </g>
4963
  <g id="ytick_3">
4964
  <g id="grid-y--4" class="grid grid-y">
4965
- <path d="M 47.72 210.073536 L 831.034248 210.073536 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4966
  </g>
4967
  <g id="line2d_27">
4968
  <g>
4969
- <use ns4:href="#m0fca2865ba" x="47.72" y="210.073536" style="stroke: #000000; stroke-width: 0.8" />
4970
  </g>
4971
  </g>
4972
  <g id="text_27">
4973
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="213.872755" transform="rotate(-0 40.72 213.872755)">0.3</text>
4974
  </g>
4975
  </g>
4976
  <g id="ytick_4">
4977
  <g id="grid-y--5" class="grid grid-y">
4978
- <path d="M 47.72 128.428247 L 831.034248 128.428247 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4979
  </g>
4980
  <g id="line2d_28">
4981
  <g>
4982
- <use ns4:href="#m0fca2865ba" x="47.72" y="128.428247" style="stroke: #000000; stroke-width: 0.8" />
4983
  </g>
4984
  </g>
4985
  <g id="text_28">
4986
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="132.227466" transform="rotate(-0 40.72 132.227466)">0.4</text>
4987
  </g>
4988
  </g>
4989
  <g id="ytick_5">
4990
  <g id="grid-y--6" class="grid grid-y">
4991
- <path d="M 47.72 46.782958 L 831.034248 46.782958 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4992
  </g>
4993
  <g id="line2d_29">
4994
  <g>
4995
- <use ns4:href="#m0fca2865ba" x="47.72" y="46.782958" style="stroke: #000000; stroke-width: 0.8" />
4996
  </g>
4997
  </g>
4998
  <g id="text_29">
4999
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="50.582177" transform="rotate(-0 40.72 50.582177)">0.5</text>
5000
  </g>
5001
  </g>
5002
  <g id="label--y" class="ylabel">
@@ -5004,66 +5004,66 @@ Installed 37 packages in 336ms
5004
  </g>
5005
  </g>
5006
  <g id="series--hf-kernels-causal-conv1d" class="series">
5007
- <path d="M 83.325193 420.186871 L 114.286231 413.573602 L 145.247268 414.178594 L 176.208306 414.545181 L 207.169343 415.255495 L 238.130381 415.345305 L 269.091418 415.451444 L 300.052455 415.345305 L 331.013493 415.17385 L 361.97453 415.141192 L 392.935568 414.643155 L 423.896605 413.916512 L 454.857643 415.614734 L 485.81868 415.802519 L 516.779718 414.790117 L 547.740755 414.667649 L 578.701793 414.234929 L 609.66283 413.843032 L 640.623868 415.648209 L 671.584905 414.553346 L 702.545943 414.977901 L 733.50698 414.90442 L 764.468018 415.475937 L 795.429055 415.026888 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
5008
  <defs>
5009
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
5010
  </defs>
5011
  <g clip-path="url(#pb49fc4c8d2)">
5012
  <use ns4:href="#md7efaf3aec" x="83.325193" y="420.186871" style="fill: #1f77b4; stroke: #1f77b4" />
5013
- <use ns4:href="#md7efaf3aec" x="114.286231" y="413.573602" style="fill: #1f77b4; stroke: #1f77b4" />
5014
- <use ns4:href="#md7efaf3aec" x="145.247268" y="414.178594" style="fill: #1f77b4; stroke: #1f77b4" />
5015
- <use ns4:href="#md7efaf3aec" x="176.208306" y="414.545181" style="fill: #1f77b4; stroke: #1f77b4" />
5016
- <use ns4:href="#md7efaf3aec" x="207.169343" y="415.255495" style="fill: #1f77b4; stroke: #1f77b4" />
5017
- <use ns4:href="#md7efaf3aec" x="238.130381" y="415.345305" style="fill: #1f77b4; stroke: #1f77b4" />
5018
- <use ns4:href="#md7efaf3aec" x="269.091418" y="415.451444" style="fill: #1f77b4; stroke: #1f77b4" />
5019
- <use ns4:href="#md7efaf3aec" x="300.052455" y="415.345305" style="fill: #1f77b4; stroke: #1f77b4" />
5020
- <use ns4:href="#md7efaf3aec" x="331.013493" y="415.17385" style="fill: #1f77b4; stroke: #1f77b4" />
5021
- <use ns4:href="#md7efaf3aec" x="361.97453" y="415.141192" style="fill: #1f77b4; stroke: #1f77b4" />
5022
- <use ns4:href="#md7efaf3aec" x="392.935568" y="414.643155" style="fill: #1f77b4; stroke: #1f77b4" />
5023
- <use ns4:href="#md7efaf3aec" x="423.896605" y="413.916512" style="fill: #1f77b4; stroke: #1f77b4" />
5024
- <use ns4:href="#md7efaf3aec" x="454.857643" y="415.614734" style="fill: #1f77b4; stroke: #1f77b4" />
5025
- <use ns4:href="#md7efaf3aec" x="485.81868" y="415.802519" style="fill: #1f77b4; stroke: #1f77b4" />
5026
- <use ns4:href="#md7efaf3aec" x="516.779718" y="414.790117" style="fill: #1f77b4; stroke: #1f77b4" />
5027
- <use ns4:href="#md7efaf3aec" x="547.740755" y="414.667649" style="fill: #1f77b4; stroke: #1f77b4" />
5028
- <use ns4:href="#md7efaf3aec" x="578.701793" y="414.234929" style="fill: #1f77b4; stroke: #1f77b4" />
5029
- <use ns4:href="#md7efaf3aec" x="609.66283" y="413.843032" style="fill: #1f77b4; stroke: #1f77b4" />
5030
- <use ns4:href="#md7efaf3aec" x="640.623868" y="415.648209" style="fill: #1f77b4; stroke: #1f77b4" />
5031
- <use ns4:href="#md7efaf3aec" x="671.584905" y="414.553346" style="fill: #1f77b4; stroke: #1f77b4" />
5032
- <use ns4:href="#md7efaf3aec" x="702.545943" y="414.977901" style="fill: #1f77b4; stroke: #1f77b4" />
5033
- <use ns4:href="#md7efaf3aec" x="733.50698" y="414.90442" style="fill: #1f77b4; stroke: #1f77b4" />
5034
- <use ns4:href="#md7efaf3aec" x="764.468018" y="415.475937" style="fill: #1f77b4; stroke: #1f77b4" />
5035
- <use ns4:href="#md7efaf3aec" x="795.429055" y="415.026888" style="fill: #1f77b4; stroke: #1f77b4" />
5036
  </g>
5037
  </g>
5038
  <g id="series--torch-eager" class="series">
5039
- <path d="M 83.325193 398.387578 L 114.286231 388.13293 L 145.247268 389.259635 L 176.208306 388.843244 L 207.169343 389.324135 L 238.130381 389.080016 L 269.091418 389.806659 L 300.052455 389.275964 L 331.013493 390.38634 L 361.97453 388.671789 L 392.935568 324.496959 L 423.896605 321.09235 L 454.857643 390.484314 L 485.81868 388.884067 L 516.779718 390.467985 L 547.740755 389.15268 L 578.701793 389.831152 L 609.66283 388.801605 L 640.623868 389.390268 L 671.584905 389.096345 L 702.545943 380.196192 L 733.50698 374.351205 L 764.468018 60.534474 L 795.429055 45.608899 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
5040
  <defs>
5041
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
5042
  </defs>
5043
  <g clip-path="url(#pb49fc4c8d2)">
5044
- <use ns4:href="#m9b8c54d372" x="83.325193" y="398.387578" style="fill: #ff7f0e; stroke: #ff7f0e" />
5045
- <use ns4:href="#m9b8c54d372" x="114.286231" y="388.13293" style="fill: #ff7f0e; stroke: #ff7f0e" />
5046
- <use ns4:href="#m9b8c54d372" x="145.247268" y="389.259635" style="fill: #ff7f0e; stroke: #ff7f0e" />
5047
- <use ns4:href="#m9b8c54d372" x="176.208306" y="388.843244" style="fill: #ff7f0e; stroke: #ff7f0e" />
5048
- <use ns4:href="#m9b8c54d372" x="207.169343" y="389.324135" style="fill: #ff7f0e; stroke: #ff7f0e" />
5049
- <use ns4:href="#m9b8c54d372" x="238.130381" y="389.080016" style="fill: #ff7f0e; stroke: #ff7f0e" />
5050
- <use ns4:href="#m9b8c54d372" x="269.091418" y="389.806659" style="fill: #ff7f0e; stroke: #ff7f0e" />
5051
- <use ns4:href="#m9b8c54d372" x="300.052455" y="389.275964" style="fill: #ff7f0e; stroke: #ff7f0e" />
5052
- <use ns4:href="#m9b8c54d372" x="331.013493" y="390.38634" style="fill: #ff7f0e; stroke: #ff7f0e" />
5053
- <use ns4:href="#m9b8c54d372" x="361.97453" y="388.671789" style="fill: #ff7f0e; stroke: #ff7f0e" />
5054
- <use ns4:href="#m9b8c54d372" x="392.935568" y="324.496959" style="fill: #ff7f0e; stroke: #ff7f0e" />
5055
- <use ns4:href="#m9b8c54d372" x="423.896605" y="321.09235" style="fill: #ff7f0e; stroke: #ff7f0e" />
5056
- <use ns4:href="#m9b8c54d372" x="454.857643" y="390.484314" style="fill: #ff7f0e; stroke: #ff7f0e" />
5057
- <use ns4:href="#m9b8c54d372" x="485.81868" y="388.884067" style="fill: #ff7f0e; stroke: #ff7f0e" />
5058
- <use ns4:href="#m9b8c54d372" x="516.779718" y="390.467985" style="fill: #ff7f0e; stroke: #ff7f0e" />
5059
- <use ns4:href="#m9b8c54d372" x="547.740755" y="389.15268" style="fill: #ff7f0e; stroke: #ff7f0e" />
5060
- <use ns4:href="#m9b8c54d372" x="578.701793" y="389.831152" style="fill: #ff7f0e; stroke: #ff7f0e" />
5061
- <use ns4:href="#m9b8c54d372" x="609.66283" y="388.801605" style="fill: #ff7f0e; stroke: #ff7f0e" />
5062
- <use ns4:href="#m9b8c54d372" x="640.623868" y="389.390268" style="fill: #ff7f0e; stroke: #ff7f0e" />
5063
- <use ns4:href="#m9b8c54d372" x="671.584905" y="389.096345" style="fill: #ff7f0e; stroke: #ff7f0e" />
5064
- <use ns4:href="#m9b8c54d372" x="702.545943" y="380.196192" style="fill: #ff7f0e; stroke: #ff7f0e" />
5065
- <use ns4:href="#m9b8c54d372" x="733.50698" y="374.351205" style="fill: #ff7f0e; stroke: #ff7f0e" />
5066
- <use ns4:href="#m9b8c54d372" x="764.468018" y="60.534474" style="fill: #ff7f0e; stroke: #ff7f0e" />
5067
  <use ns4:href="#m9b8c54d372" x="795.429055" y="45.608899" style="fill: #ff7f0e; stroke: #ff7f0e" />
5068
  </g>
5069
  </g>
 
3874
  <div class="system-info">
3875
  <div class="system-info-header">Generated on:</div>
3876
  <div class="system-info-content">
3877
+ Linux x86_64 | Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35
3878
  </div>
3879
  </div>
3880
 
 
3889
  <rdf:RDF>
3890
  <ns2:Work>
3891
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
3892
+ <dc:date>2025-12-19T19:09:46.065014</dc:date>
3893
  <dc:format>image/svg+xml</dc:format>
3894
  <dc:creator>
3895
  <ns2:Agent>
3896
+ <dc:title>Matplotlib v3.10.8, https://matplotlib.org/</dc:title>
3897
  </ns2:Agent>
3898
  </dc:creator>
3899
  </ns2:Work>
 
4233
  <g id="matplotlib.axis_2">
4234
  <g id="ytick_1">
4235
  <g id="grid-y--2" class="grid grid-y">
4236
+ <path d="M 47.72 375.695489 L 831.034248 375.695489 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4237
  </g>
4238
  <g id="line2d_25">
4239
  <defs>
4240
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4241
  </defs>
4242
  <g>
4243
+ <use ns4:href="#m0fca2865ba" x="47.72" y="375.695489" style="stroke: #000000; stroke-width: 0.8" />
4244
  </g>
4245
  </g>
4246
  <g id="text_25">
4247
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="379.494708" transform="rotate(-0 40.72 379.494708)">0.1</text>
4248
  </g>
4249
  </g>
4250
  <g id="ytick_2">
4251
  <g id="grid-y--3" class="grid grid-y">
4252
+ <path d="M 47.72 292.764994 L 831.034248 292.764994 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4253
  </g>
4254
  <g id="line2d_26">
4255
  <g>
4256
+ <use ns4:href="#m0fca2865ba" x="47.72" y="292.764994" style="stroke: #000000; stroke-width: 0.8" />
4257
  </g>
4258
  </g>
4259
  <g id="text_26">
4260
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="296.564213" transform="rotate(-0 40.72 296.564213)">0.2</text>
4261
  </g>
4262
  </g>
4263
  <g id="ytick_3">
4264
  <g id="grid-y--4" class="grid grid-y">
4265
+ <path d="M 47.72 209.834499 L 831.034248 209.834499 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4266
  </g>
4267
  <g id="line2d_27">
4268
  <g>
4269
+ <use ns4:href="#m0fca2865ba" x="47.72" y="209.834499" style="stroke: #000000; stroke-width: 0.8" />
4270
  </g>
4271
  </g>
4272
  <g id="text_27">
4273
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="213.633718" transform="rotate(-0 40.72 213.633718)">0.3</text>
4274
  </g>
4275
  </g>
4276
  <g id="ytick_4">
4277
  <g id="grid-y--5" class="grid grid-y">
4278
+ <path d="M 47.72 126.904004 L 831.034248 126.904004 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4279
  </g>
4280
  <g id="line2d_28">
4281
  <g>
4282
+ <use ns4:href="#m0fca2865ba" x="47.72" y="126.904004" style="stroke: #000000; stroke-width: 0.8" />
4283
  </g>
4284
  </g>
4285
  <g id="text_28">
4286
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="130.703223" transform="rotate(-0 40.72 130.703223)">0.4</text>
4287
  </g>
4288
  </g>
4289
  <g id="ytick_5">
4290
  <g id="grid-y--6" class="grid grid-y">
4291
+ <path d="M 47.72 43.973509 L 831.034248 43.973509 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4292
  </g>
4293
  <g id="line2d_29">
4294
  <g>
4295
+ <use ns4:href="#m0fca2865ba" x="47.72" y="43.973509" style="stroke: #000000; stroke-width: 0.8" />
4296
  </g>
4297
  </g>
4298
  <g id="text_29">
4299
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="47.772728" transform="rotate(-0 40.72 47.772728)">0.5</text>
4300
  </g>
4301
  </g>
4302
  <g id="label--y" class="ylabel">
 
4304
  </g>
4305
  </g>
4306
  <g id="series--hf-kernels-causal-conv1d" class="series">
4307
+ <path d="M 83.325193 420.186871 L 114.286231 415.418367 L 145.247268 416.273381 L 176.208306 416.636616 L 207.169343 417.035512 L 238.130381 417.110979 L 269.091418 417.475043 L 300.052455 417.781886 L 331.013493 417.616025 L 361.97453 417.516509 L 392.935568 417.674077 L 423.896605 417.018926 L 454.857643 417.358941 L 485.81868 417.823352 L 516.779718 416.969997 L 547.740755 416.968338 L 578.701793 417.640904 L 609.66283 417.300889 L 640.623868 417.38382 L 671.584905 417.980919 L 702.545943 416.836479 L 733.50698 417.110149 L 764.468018 417.450164 L 795.429055 417.582853 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4308
  <defs>
4309
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4310
  </defs>
4311
  <g clip-path="url(#pb49fc4c8d2)">
4312
  <use ns4:href="#md7efaf3aec" x="83.325193" y="420.186871" style="fill: #1f77b4; stroke: #1f77b4" />
4313
+ <use ns4:href="#md7efaf3aec" x="114.286231" y="415.418367" style="fill: #1f77b4; stroke: #1f77b4" />
4314
+ <use ns4:href="#md7efaf3aec" x="145.247268" y="416.273381" style="fill: #1f77b4; stroke: #1f77b4" />
4315
+ <use ns4:href="#md7efaf3aec" x="176.208306" y="416.636616" style="fill: #1f77b4; stroke: #1f77b4" />
4316
+ <use ns4:href="#md7efaf3aec" x="207.169343" y="417.035512" style="fill: #1f77b4; stroke: #1f77b4" />
4317
+ <use ns4:href="#md7efaf3aec" x="238.130381" y="417.110979" style="fill: #1f77b4; stroke: #1f77b4" />
4318
+ <use ns4:href="#md7efaf3aec" x="269.091418" y="417.475043" style="fill: #1f77b4; stroke: #1f77b4" />
4319
+ <use ns4:href="#md7efaf3aec" x="300.052455" y="417.781886" style="fill: #1f77b4; stroke: #1f77b4" />
4320
+ <use ns4:href="#md7efaf3aec" x="331.013493" y="417.616025" style="fill: #1f77b4; stroke: #1f77b4" />
4321
+ <use ns4:href="#md7efaf3aec" x="361.97453" y="417.516509" style="fill: #1f77b4; stroke: #1f77b4" />
4322
+ <use ns4:href="#md7efaf3aec" x="392.935568" y="417.674077" style="fill: #1f77b4; stroke: #1f77b4" />
4323
+ <use ns4:href="#md7efaf3aec" x="423.896605" y="417.018926" style="fill: #1f77b4; stroke: #1f77b4" />
4324
+ <use ns4:href="#md7efaf3aec" x="454.857643" y="417.358941" style="fill: #1f77b4; stroke: #1f77b4" />
4325
+ <use ns4:href="#md7efaf3aec" x="485.81868" y="417.823352" style="fill: #1f77b4; stroke: #1f77b4" />
4326
+ <use ns4:href="#md7efaf3aec" x="516.779718" y="416.969997" style="fill: #1f77b4; stroke: #1f77b4" />
4327
+ <use ns4:href="#md7efaf3aec" x="547.740755" y="416.968338" style="fill: #1f77b4; stroke: #1f77b4" />
4328
+ <use ns4:href="#md7efaf3aec" x="578.701793" y="417.640904" style="fill: #1f77b4; stroke: #1f77b4" />
4329
+ <use ns4:href="#md7efaf3aec" x="609.66283" y="417.300889" style="fill: #1f77b4; stroke: #1f77b4" />
4330
+ <use ns4:href="#md7efaf3aec" x="640.623868" y="417.38382" style="fill: #1f77b4; stroke: #1f77b4" />
4331
+ <use ns4:href="#md7efaf3aec" x="671.584905" y="417.980919" style="fill: #1f77b4; stroke: #1f77b4" />
4332
+ <use ns4:href="#md7efaf3aec" x="702.545943" y="416.836479" style="fill: #1f77b4; stroke: #1f77b4" />
4333
+ <use ns4:href="#md7efaf3aec" x="733.50698" y="417.110149" style="fill: #1f77b4; stroke: #1f77b4" />
4334
+ <use ns4:href="#md7efaf3aec" x="764.468018" y="417.450164" style="fill: #1f77b4; stroke: #1f77b4" />
4335
+ <use ns4:href="#md7efaf3aec" x="795.429055" y="417.582853" style="fill: #1f77b4; stroke: #1f77b4" />
4336
  </g>
4337
  </g>
4338
  <g id="series--torch-eager" class="series">
4339
+ <path d="M 83.325193 398.251755 L 114.286231 389.253796 L 145.247268 390.058222 L 176.208306 391.19354 L 207.169343 391.293886 L 238.130381 390.514339 L 269.091418 392.14807 L 300.052455 391.725125 L 331.013493 390.70425 L 361.97453 390.564098 L 392.935568 326.689372 L 423.896605 322.484796 L 454.857643 390.539219 L 485.81868 392.305638 L 516.779718 392.105776 L 547.740755 390.821182 L 578.701793 391.343644 L 609.66283 390.687664 L 640.623868 392.031968 L 671.584905 390.522632 L 702.545943 380.546094 L 733.50698 375.794177 L 764.468018 57.235754 L 795.429055 45.608899 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4340
  <defs>
4341
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4342
  </defs>
4343
  <g clip-path="url(#pb49fc4c8d2)">
4344
+ <use ns4:href="#m9b8c54d372" x="83.325193" y="398.251755" style="fill: #ff7f0e; stroke: #ff7f0e" />
4345
+ <use ns4:href="#m9b8c54d372" x="114.286231" y="389.253796" style="fill: #ff7f0e; stroke: #ff7f0e" />
4346
+ <use ns4:href="#m9b8c54d372" x="145.247268" y="390.058222" style="fill: #ff7f0e; stroke: #ff7f0e" />
4347
+ <use ns4:href="#m9b8c54d372" x="176.208306" y="391.19354" style="fill: #ff7f0e; stroke: #ff7f0e" />
4348
+ <use ns4:href="#m9b8c54d372" x="207.169343" y="391.293886" style="fill: #ff7f0e; stroke: #ff7f0e" />
4349
+ <use ns4:href="#m9b8c54d372" x="238.130381" y="390.514339" style="fill: #ff7f0e; stroke: #ff7f0e" />
4350
+ <use ns4:href="#m9b8c54d372" x="269.091418" y="392.14807" style="fill: #ff7f0e; stroke: #ff7f0e" />
4351
+ <use ns4:href="#m9b8c54d372" x="300.052455" y="391.725125" style="fill: #ff7f0e; stroke: #ff7f0e" />
4352
+ <use ns4:href="#m9b8c54d372" x="331.013493" y="390.70425" style="fill: #ff7f0e; stroke: #ff7f0e" />
4353
+ <use ns4:href="#m9b8c54d372" x="361.97453" y="390.564098" style="fill: #ff7f0e; stroke: #ff7f0e" />
4354
+ <use ns4:href="#m9b8c54d372" x="392.935568" y="326.689372" style="fill: #ff7f0e; stroke: #ff7f0e" />
4355
+ <use ns4:href="#m9b8c54d372" x="423.896605" y="322.484796" style="fill: #ff7f0e; stroke: #ff7f0e" />
4356
+ <use ns4:href="#m9b8c54d372" x="454.857643" y="390.539219" style="fill: #ff7f0e; stroke: #ff7f0e" />
4357
+ <use ns4:href="#m9b8c54d372" x="485.81868" y="392.305638" style="fill: #ff7f0e; stroke: #ff7f0e" />
4358
+ <use ns4:href="#m9b8c54d372" x="516.779718" y="392.105776" style="fill: #ff7f0e; stroke: #ff7f0e" />
4359
+ <use ns4:href="#m9b8c54d372" x="547.740755" y="390.821182" style="fill: #ff7f0e; stroke: #ff7f0e" />
4360
+ <use ns4:href="#m9b8c54d372" x="578.701793" y="391.343644" style="fill: #ff7f0e; stroke: #ff7f0e" />
4361
+ <use ns4:href="#m9b8c54d372" x="609.66283" y="390.687664" style="fill: #ff7f0e; stroke: #ff7f0e" />
4362
+ <use ns4:href="#m9b8c54d372" x="640.623868" y="392.031968" style="fill: #ff7f0e; stroke: #ff7f0e" />
4363
+ <use ns4:href="#m9b8c54d372" x="671.584905" y="390.522632" style="fill: #ff7f0e; stroke: #ff7f0e" />
4364
+ <use ns4:href="#m9b8c54d372" x="702.545943" y="380.546094" style="fill: #ff7f0e; stroke: #ff7f0e" />
4365
+ <use ns4:href="#m9b8c54d372" x="733.50698" y="375.794177" style="fill: #ff7f0e; stroke: #ff7f0e" />
4366
+ <use ns4:href="#m9b8c54d372" x="764.468018" y="57.235754" style="fill: #ff7f0e; stroke: #ff7f0e" />
4367
  <use ns4:href="#m9b8c54d372" x="795.429055" y="45.608899" style="fill: #ff7f0e; stroke: #ff7f0e" />
4368
  </g>
4369
  </g>
 
4422
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
4423
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
4424
  </span> |
4425
+ Cell: combine | 4.61s
4426
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
4427
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
4428
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
 
4515
  hf_kernels_causal_conv1d cuda_B2_D2048_S2048_W4 0.05 True
4516
  hf_kernels_causal_conv1d cuda_B2_D2048_S512_W2 0.05 True
4517
  hf_kernels_causal_conv1d cuda_B2_D2048_S512_W4 0.05 True
4518
+ hf_kernels_causal_conv1d cuda_B2_D64_S128_W2 0.05 True
4519
  hf_kernels_causal_conv1d cuda_B2_D64_S128_W4 0.05 True
4520
  hf_kernels_causal_conv1d cuda_B2_D64_S2048_W2 0.05 True
4521
  hf_kernels_causal_conv1d cuda_B2_D64_S2048_W4 0.05 True
 
4576
  <div class="uv-install-logs" id="uv-logs-combine">
4577
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4578
  <div class="uv-logs-content" style="display: none;">
4579
+ Installed 37 packages in 314ms
4580
  </div>
4581
  </div>
4582
  <div class="cell-artifacts">
 
4589
  <rdf:RDF>
4590
  <ns2:Work>
4591
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4592
+ <dc:date>2025-12-19T19:09:46.065014</dc:date>
4593
  <dc:format>image/svg+xml</dc:format>
4594
  <dc:creator>
4595
  <ns2:Agent>
4596
+ <dc:title>Matplotlib v3.10.8, https://matplotlib.org/</dc:title>
4597
  </ns2:Agent>
4598
  </dc:creator>
4599
  </ns2:Work>
 
4933
  <g id="matplotlib.axis_2">
4934
  <g id="ytick_1">
4935
  <g id="grid-y--2" class="grid grid-y">
4936
+ <path d="M 47.72 375.695489 L 831.034248 375.695489 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4937
  </g>
4938
  <g id="line2d_25">
4939
  <defs>
4940
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4941
  </defs>
4942
  <g>
4943
+ <use ns4:href="#m0fca2865ba" x="47.72" y="375.695489" style="stroke: #000000; stroke-width: 0.8" />
4944
  </g>
4945
  </g>
4946
  <g id="text_25">
4947
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="379.494708" transform="rotate(-0 40.72 379.494708)">0.1</text>
4948
  </g>
4949
  </g>
4950
  <g id="ytick_2">
4951
  <g id="grid-y--3" class="grid grid-y">
4952
+ <path d="M 47.72 292.764994 L 831.034248 292.764994 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4953
  </g>
4954
  <g id="line2d_26">
4955
  <g>
4956
+ <use ns4:href="#m0fca2865ba" x="47.72" y="292.764994" style="stroke: #000000; stroke-width: 0.8" />
4957
  </g>
4958
  </g>
4959
  <g id="text_26">
4960
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="296.564213" transform="rotate(-0 40.72 296.564213)">0.2</text>
4961
  </g>
4962
  </g>
4963
  <g id="ytick_3">
4964
  <g id="grid-y--4" class="grid grid-y">
4965
+ <path d="M 47.72 209.834499 L 831.034248 209.834499 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4966
  </g>
4967
  <g id="line2d_27">
4968
  <g>
4969
+ <use ns4:href="#m0fca2865ba" x="47.72" y="209.834499" style="stroke: #000000; stroke-width: 0.8" />
4970
  </g>
4971
  </g>
4972
  <g id="text_27">
4973
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="213.633718" transform="rotate(-0 40.72 213.633718)">0.3</text>
4974
  </g>
4975
  </g>
4976
  <g id="ytick_4">
4977
  <g id="grid-y--5" class="grid grid-y">
4978
+ <path d="M 47.72 126.904004 L 831.034248 126.904004 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4979
  </g>
4980
  <g id="line2d_28">
4981
  <g>
4982
+ <use ns4:href="#m0fca2865ba" x="47.72" y="126.904004" style="stroke: #000000; stroke-width: 0.8" />
4983
  </g>
4984
  </g>
4985
  <g id="text_28">
4986
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="130.703223" transform="rotate(-0 40.72 130.703223)">0.4</text>
4987
  </g>
4988
  </g>
4989
  <g id="ytick_5">
4990
  <g id="grid-y--6" class="grid grid-y">
4991
+ <path d="M 47.72 43.973509 L 831.034248 43.973509 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4992
  </g>
4993
  <g id="line2d_29">
4994
  <g>
4995
+ <use ns4:href="#m0fca2865ba" x="47.72" y="43.973509" style="stroke: #000000; stroke-width: 0.8" />
4996
  </g>
4997
  </g>
4998
  <g id="text_29">
4999
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="47.772728" transform="rotate(-0 40.72 47.772728)">0.5</text>
5000
  </g>
5001
  </g>
5002
  <g id="label--y" class="ylabel">
 
5004
  </g>
5005
  </g>
5006
  <g id="series--hf-kernels-causal-conv1d" class="series">
5007
+ <path d="M 83.325193 420.186871 L 114.286231 415.418367 L 145.247268 416.273381 L 176.208306 416.636616 L 207.169343 417.035512 L 238.130381 417.110979 L 269.091418 417.475043 L 300.052455 417.781886 L 331.013493 417.616025 L 361.97453 417.516509 L 392.935568 417.674077 L 423.896605 417.018926 L 454.857643 417.358941 L 485.81868 417.823352 L 516.779718 416.969997 L 547.740755 416.968338 L 578.701793 417.640904 L 609.66283 417.300889 L 640.623868 417.38382 L 671.584905 417.980919 L 702.545943 416.836479 L 733.50698 417.110149 L 764.468018 417.450164 L 795.429055 417.582853 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
5008
  <defs>
5009
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
5010
  </defs>
5011
  <g clip-path="url(#pb49fc4c8d2)">
5012
  <use ns4:href="#md7efaf3aec" x="83.325193" y="420.186871" style="fill: #1f77b4; stroke: #1f77b4" />
5013
+ <use ns4:href="#md7efaf3aec" x="114.286231" y="415.418367" style="fill: #1f77b4; stroke: #1f77b4" />
5014
+ <use ns4:href="#md7efaf3aec" x="145.247268" y="416.273381" style="fill: #1f77b4; stroke: #1f77b4" />
5015
+ <use ns4:href="#md7efaf3aec" x="176.208306" y="416.636616" style="fill: #1f77b4; stroke: #1f77b4" />
5016
+ <use ns4:href="#md7efaf3aec" x="207.169343" y="417.035512" style="fill: #1f77b4; stroke: #1f77b4" />
5017
+ <use ns4:href="#md7efaf3aec" x="238.130381" y="417.110979" style="fill: #1f77b4; stroke: #1f77b4" />
5018
+ <use ns4:href="#md7efaf3aec" x="269.091418" y="417.475043" style="fill: #1f77b4; stroke: #1f77b4" />
5019
+ <use ns4:href="#md7efaf3aec" x="300.052455" y="417.781886" style="fill: #1f77b4; stroke: #1f77b4" />
5020
+ <use ns4:href="#md7efaf3aec" x="331.013493" y="417.616025" style="fill: #1f77b4; stroke: #1f77b4" />
5021
+ <use ns4:href="#md7efaf3aec" x="361.97453" y="417.516509" style="fill: #1f77b4; stroke: #1f77b4" />
5022
+ <use ns4:href="#md7efaf3aec" x="392.935568" y="417.674077" style="fill: #1f77b4; stroke: #1f77b4" />
5023
+ <use ns4:href="#md7efaf3aec" x="423.896605" y="417.018926" style="fill: #1f77b4; stroke: #1f77b4" />
5024
+ <use ns4:href="#md7efaf3aec" x="454.857643" y="417.358941" style="fill: #1f77b4; stroke: #1f77b4" />
5025
+ <use ns4:href="#md7efaf3aec" x="485.81868" y="417.823352" style="fill: #1f77b4; stroke: #1f77b4" />
5026
+ <use ns4:href="#md7efaf3aec" x="516.779718" y="416.969997" style="fill: #1f77b4; stroke: #1f77b4" />
5027
+ <use ns4:href="#md7efaf3aec" x="547.740755" y="416.968338" style="fill: #1f77b4; stroke: #1f77b4" />
5028
+ <use ns4:href="#md7efaf3aec" x="578.701793" y="417.640904" style="fill: #1f77b4; stroke: #1f77b4" />
5029
+ <use ns4:href="#md7efaf3aec" x="609.66283" y="417.300889" style="fill: #1f77b4; stroke: #1f77b4" />
5030
+ <use ns4:href="#md7efaf3aec" x="640.623868" y="417.38382" style="fill: #1f77b4; stroke: #1f77b4" />
5031
+ <use ns4:href="#md7efaf3aec" x="671.584905" y="417.980919" style="fill: #1f77b4; stroke: #1f77b4" />
5032
+ <use ns4:href="#md7efaf3aec" x="702.545943" y="416.836479" style="fill: #1f77b4; stroke: #1f77b4" />
5033
+ <use ns4:href="#md7efaf3aec" x="733.50698" y="417.110149" style="fill: #1f77b4; stroke: #1f77b4" />
5034
+ <use ns4:href="#md7efaf3aec" x="764.468018" y="417.450164" style="fill: #1f77b4; stroke: #1f77b4" />
5035
+ <use ns4:href="#md7efaf3aec" x="795.429055" y="417.582853" style="fill: #1f77b4; stroke: #1f77b4" />
5036
  </g>
5037
  </g>
5038
  <g id="series--torch-eager" class="series">
5039
+ <path d="M 83.325193 398.251755 L 114.286231 389.253796 L 145.247268 390.058222 L 176.208306 391.19354 L 207.169343 391.293886 L 238.130381 390.514339 L 269.091418 392.14807 L 300.052455 391.725125 L 331.013493 390.70425 L 361.97453 390.564098 L 392.935568 326.689372 L 423.896605 322.484796 L 454.857643 390.539219 L 485.81868 392.305638 L 516.779718 392.105776 L 547.740755 390.821182 L 578.701793 391.343644 L 609.66283 390.687664 L 640.623868 392.031968 L 671.584905 390.522632 L 702.545943 380.546094 L 733.50698 375.794177 L 764.468018 57.235754 L 795.429055 45.608899 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
5040
  <defs>
5041
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
5042
  </defs>
5043
  <g clip-path="url(#pb49fc4c8d2)">
5044
+ <use ns4:href="#m9b8c54d372" x="83.325193" y="398.251755" style="fill: #ff7f0e; stroke: #ff7f0e" />
5045
+ <use ns4:href="#m9b8c54d372" x="114.286231" y="389.253796" style="fill: #ff7f0e; stroke: #ff7f0e" />
5046
+ <use ns4:href="#m9b8c54d372" x="145.247268" y="390.058222" style="fill: #ff7f0e; stroke: #ff7f0e" />
5047
+ <use ns4:href="#m9b8c54d372" x="176.208306" y="391.19354" style="fill: #ff7f0e; stroke: #ff7f0e" />
5048
+ <use ns4:href="#m9b8c54d372" x="207.169343" y="391.293886" style="fill: #ff7f0e; stroke: #ff7f0e" />
5049
+ <use ns4:href="#m9b8c54d372" x="238.130381" y="390.514339" style="fill: #ff7f0e; stroke: #ff7f0e" />
5050
+ <use ns4:href="#m9b8c54d372" x="269.091418" y="392.14807" style="fill: #ff7f0e; stroke: #ff7f0e" />
5051
+ <use ns4:href="#m9b8c54d372" x="300.052455" y="391.725125" style="fill: #ff7f0e; stroke: #ff7f0e" />
5052
+ <use ns4:href="#m9b8c54d372" x="331.013493" y="390.70425" style="fill: #ff7f0e; stroke: #ff7f0e" />
5053
+ <use ns4:href="#m9b8c54d372" x="361.97453" y="390.564098" style="fill: #ff7f0e; stroke: #ff7f0e" />
5054
+ <use ns4:href="#m9b8c54d372" x="392.935568" y="326.689372" style="fill: #ff7f0e; stroke: #ff7f0e" />
5055
+ <use ns4:href="#m9b8c54d372" x="423.896605" y="322.484796" style="fill: #ff7f0e; stroke: #ff7f0e" />
5056
+ <use ns4:href="#m9b8c54d372" x="454.857643" y="390.539219" style="fill: #ff7f0e; stroke: #ff7f0e" />
5057
+ <use ns4:href="#m9b8c54d372" x="485.81868" y="392.305638" style="fill: #ff7f0e; stroke: #ff7f0e" />
5058
+ <use ns4:href="#m9b8c54d372" x="516.779718" y="392.105776" style="fill: #ff7f0e; stroke: #ff7f0e" />
5059
+ <use ns4:href="#m9b8c54d372" x="547.740755" y="390.821182" style="fill: #ff7f0e; stroke: #ff7f0e" />
5060
+ <use ns4:href="#m9b8c54d372" x="578.701793" y="391.343644" style="fill: #ff7f0e; stroke: #ff7f0e" />
5061
+ <use ns4:href="#m9b8c54d372" x="609.66283" y="390.687664" style="fill: #ff7f0e; stroke: #ff7f0e" />
5062
+ <use ns4:href="#m9b8c54d372" x="640.623868" y="392.031968" style="fill: #ff7f0e; stroke: #ff7f0e" />
5063
+ <use ns4:href="#m9b8c54d372" x="671.584905" y="390.522632" style="fill: #ff7f0e; stroke: #ff7f0e" />
5064
+ <use ns4:href="#m9b8c54d372" x="702.545943" y="380.546094" style="fill: #ff7f0e; stroke: #ff7f0e" />
5065
+ <use ns4:href="#m9b8c54d372" x="733.50698" y="375.794177" style="fill: #ff7f0e; stroke: #ff7f0e" />
5066
+ <use ns4:href="#m9b8c54d372" x="764.468018" y="57.235754" style="fill: #ff7f0e; stroke: #ff7f0e" />
5067
  <use ns4:href="#m9b8c54d372" x="795.429055" y="45.608899" style="fill: #ff7f0e; stroke: #ff7f0e" />
5068
  </g>
5069
  </g>
deformable_detr/impls/artifacts/benchmark/deformable_detr.jsonl CHANGED
@@ -1,4 +1,4 @@
1
- {"ts": "2025-11-10T21:59:00Z", "run": "af78c748d8aa44afbf8c01edaace0f7f", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_Q100_H8_E256_L4_P4", "batch_size": 1, "num_queries": 100, "num_heads": 8, "embed_dim": 256, "num_levels": 4, "num_points": 4, "spatial_shapes": [[32, 32], [16, 16], [8, 8], [4, 4]], "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 3.269501999966451, "p50": 3.2801430000404252, "p90": 3.3024029999637605, "mean": 3.289842799995313, "iqr": 0.02382099995656972, "raw_times": [3.3185839999987365, 3.2801430000404252, 3.269501999966451, 3.278582000007191, 3.3024029999637605], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.341974000022674, "peak_bytes": 5929472, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "deformable_detr_torch"}, "err": null}
2
- {"ts": "2025-11-10T21:59:00Z", "run": "af78c748d8aa44afbf8c01edaace0f7f", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_Q300_H8_E256_L4_P4", "batch_size": 1, "num_queries": 300, "num_heads": 8, "embed_dim": 256, "num_levels": 4, "num_points": 4, "spatial_shapes": [[32, 32], [16, 16], [8, 8], [4, 4]], "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 3.9856040000358917, "p50": 4.010704999984682, "p90": 4.045005000023139, "mean": 4.019770599995809, "iqr": 0.05120100007616202, "raw_times": [4.045005000023139, 4.063734999988355, 4.010704999984682, 3.9856040000358917, 3.9938039999469765], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 4.026463999991847, "peak_bytes": 15161856, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "deformable_detr_torch"}, "err": null}
3
- {"ts": "2025-11-10T21:59:01Z", "run": "af78c748d8aa44afbf8c01edaace0f7f", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_Q100_H8_E256_L4_P4", "batch_size": 2, "num_queries": 100, "num_heads": 8, "embed_dim": 256, "num_levels": 4, "num_points": 4, "spatial_shapes": [[32, 32], [16, 16], [8, 8], [4, 4]], "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 4.021324000007098, "p50": 4.030125000042517, "p90": 4.037073999995755, "mean": 4.049654600009944, "iqr": 0.01115999998546613, "raw_times": [4.037073999995755, 4.021324000007098, 4.133835999994062, 4.030125000042517, 4.025914000010289], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 4.049624999993284, "peak_bytes": 11958784, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "deformable_detr_torch"}, "err": null}
4
- {"ts": "2025-11-10T21:59:01Z", "run": "af78c748d8aa44afbf8c01edaace0f7f", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_Q300_H8_E256_L4_P4", "batch_size": 2, "num_queries": 300, "num_heads": 8, "embed_dim": 256, "num_levels": 4, "num_points": 4, "spatial_shapes": [[32, 32], [16, 16], [8, 8], [4, 4]], "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 4.128727000022536, "p50": 4.140276000043741, "p90": 4.142176000016207, "mean": 4.14041620002763, "iqr": 0.006619999965096213, "raw_times": [4.155346000004556, 4.140276000043741, 4.1355560000511105, 4.128727000022536, 4.142176000016207], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 4.139206999980161, "peak_bytes": 30977024, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "deformable_detr_torch"}, "err": null}
 
1
+ {"ts": "2025-12-19T19:09:31Z", "run": "04da14902c784090beeb85878bc3f422", "impl": "hf_kernels_deformable_detr", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_Q100_H8_E256_L4_P4", "batch_size": 1, "num_queries": 100, "num_heads": 8, "embed_dim": 256, "num_levels": 4, "num_points": 4, "spatial_shapes": [[32, 32], [16, 16], [8, 8], [4, 4]], "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.03506000007291732, "p50": 0.03723999998328509, "p90": 0.03738000009434472, "mean": 0.036888000067847315, "iqr": 0.0004900000476482091, "raw_times": [0.03506000007291732, 0.03787000014199293, 0.03738000009434472, 0.036890000046696514, 0.03723999998328509], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04449099992598349, "peak_bytes": 2264064, "ok": true, "absmax": 7.152557373046875e-07, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax": 7.152557373046875e-07, "mae": 5.539113701047427e-08, "mse": 6.418638644407112e-15, "ref": "deformable_detr_torch"}, "err": null}
2
+ {"ts": "2025-12-19T19:09:31Z", "run": "04da14902c784090beeb85878bc3f422", "impl": "hf_kernels_deformable_detr", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_Q300_H8_E256_L4_P4", "batch_size": 1, "num_queries": 300, "num_heads": 8, "embed_dim": 256, "num_levels": 4, "num_points": 4, "spatial_shapes": [[32, 32], [16, 16], [8, 8], [4, 4]], "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04158100000495324, "p50": 0.04245099989930168, "p90": 0.04307099993638985, "mean": 0.04284299998289498, "iqr": 0.0008199999683711212, "raw_times": [0.04158100000495324, 0.04245099989930168, 0.0448610001058114, 0.04307099993638985, 0.04225099996801873], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.046680999957970926, "peak_bytes": 4004864, "ok": true, "absmax": 7.152557373046875e-07, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax": 7.152557373046875e-07, "mae": 5.559346050176828e-08, "mse": 6.4289483059246175e-15, "ref": "deformable_detr_torch"}, "err": null}
3
+ {"ts": "2025-12-19T19:09:31Z", "run": "04da14902c784090beeb85878bc3f422", "impl": "hf_kernels_deformable_detr", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_Q100_H8_E256_L4_P4", "batch_size": 2, "num_queries": 100, "num_heads": 8, "embed_dim": 256, "num_levels": 4, "num_points": 4, "spatial_shapes": [[32, 32], [16, 16], [8, 8], [4, 4]], "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04193000017949089, "p50": 0.04463999994186452, "p90": 0.0456009997833462, "mean": 0.044314399929135107, "iqr": 0.001990999862755416, "raw_times": [0.04193000017949089, 0.04463999994186452, 0.04579099982038315, 0.0456009997833462, 0.04360999992059078], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04799999987881165, "peak_bytes": 5459968, "ok": true, "absmax": 7.152557373046875e-07, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax": 7.152557373046875e-07, "mae": 5.555110149657594e-08, "mse": 6.418781369458724e-15, "ref": "deformable_detr_torch"}, "err": null}
4
+ {"ts": "2025-12-19T19:09:31Z", "run": "04da14902c784090beeb85878bc3f422", "impl": "hf_kernels_deformable_detr", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_Q300_H8_E256_L4_P4", "batch_size": 2, "num_queries": 300, "num_heads": 8, "embed_dim": 256, "num_levels": 4, "num_points": 4, "spatial_shapes": [[32, 32], [16, 16], [8, 8], [4, 4]], "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.044340000158626935, "p50": 0.0453610000477056, "p90": 0.045860999989599804, "mean": 0.04539080005088181, "iqr": 0.000529999852005858, "raw_times": [0.045860999989599804, 0.045331000137593946, 0.044340000158626935, 0.04606099992088275, 0.0453610000477056], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04745000001094013, "peak_bytes": 8008704, "ok": true, "absmax": 7.152557373046875e-07, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax": 7.152557373046875e-07, "mae": 5.5905669427147586e-08, "mse": 6.485184940875199e-15, "ref": "deformable_detr_torch"}, "err": null}
deformable_detr/impls/cells/benchmark.py CHANGED
@@ -4,6 +4,7 @@
4
  # "numpy",
5
  # "torch==2.8.0",
6
  # "kernels-benchmark-tools",
 
7
  # ]
8
  #
9
  # [tool.uv.sources]
@@ -12,107 +13,30 @@
12
  import torch
13
  import sys
14
  from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
 
15
 
 
 
16
 
17
- def torch_deformable_detr(
 
18
  value, spatial_shapes, level_start_index, sampling_locations, attention_weights, im2col_step=64
19
  ):
20
- """
21
- PyTorch native reference implementation of multi-scale deformable attention.
22
- Uses vectorized bilinear interpolation for reasonable performance.
23
- """
24
- bs, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape
25
- _, _, _, channels = value.shape
26
-
27
- output = torch.zeros(bs, num_queries, num_heads, channels, device=value.device, dtype=value.dtype)
28
-
29
- # Split value tensor by levels
30
- value_list = value.split([int(h * w) for h, w in spatial_shapes.tolist()], dim=1)
31
-
32
- # Iterate through each level (can't avoid this loop easily)
33
- for level_idx in range(num_levels):
34
- h, w = spatial_shapes[level_idx].tolist()
35
- value_level = value_list[level_idx] # (bs, h*w, num_heads, channels)
36
-
37
- # Reshape to spatial grid: (bs, num_heads, channels, h, w)
38
- value_spatial = value_level.reshape(bs, h, w, num_heads, channels).permute(0, 3, 4, 1, 2)
39
-
40
- # Get sampling locations and weights for this level
41
- # loc: (bs, num_queries, num_heads, num_points, 2)
42
- loc = sampling_locations[:, :, :, level_idx, :, :]
43
- # weight: (bs, num_queries, num_heads, num_points)
44
- weight = attention_weights[:, :, :, level_idx, :]
45
-
46
- # Convert normalized coordinates to pixel coordinates
47
- # loc[..., 0] is x (width), loc[..., 1] is y (height)
48
- x = loc[..., 0] * w - 0.5 # (bs, num_queries, num_heads, num_points)
49
- y = loc[..., 1] * h - 0.5
50
-
51
- # Get integer coordinates for bilinear interpolation
52
- x0 = torch.floor(x).long()
53
- y0 = torch.floor(y).long()
54
- x1 = x0 + 1
55
- y1 = y0 + 1
56
-
57
- # Compute interpolation weights BEFORE clamping (important!)
58
- lw = x - x0.float() # weight for x direction
59
- lh = y - y0.float() # weight for y direction
60
- hw = 1 - lw
61
- hh = 1 - lh
62
-
63
- # Create mask for valid sample locations
64
- valid = (y > -1) & (x > -1) & (y < h) & (x < w)
65
-
66
- # Create masks for each corner being in bounds
67
- mask_tl = ((y0 >= 0) & (x0 >= 0)).unsqueeze(-1).float()
68
- mask_tr = ((y0 >= 0) & (x1 <= w - 1)).unsqueeze(-1).float()
69
- mask_bl = ((y1 <= h - 1) & (x0 >= 0)).unsqueeze(-1).float()
70
- mask_br = ((y1 <= h - 1) & (x1 <= w - 1)).unsqueeze(-1).float()
71
-
72
- # Clamp coordinates for safe indexing
73
- x0_clamped = torch.clamp(x0, 0, w - 1)
74
- x1_clamped = torch.clamp(x1, 0, w - 1)
75
- y0_clamped = torch.clamp(y0, 0, h - 1)
76
- y1_clamped = torch.clamp(y1, 0, h - 1)
77
-
78
- # Bilinear interpolation weights for all 4 corners
79
- w_tl = (hh * hw).unsqueeze(-1) # top-left: (bs, num_queries, num_heads, num_points, 1)
80
- w_tr = (hh * lw).unsqueeze(-1) # top-right
81
- w_bl = (lh * hw).unsqueeze(-1) # bottom-left
82
- w_br = (lh * lw).unsqueeze(-1) # bottom-right
83
-
84
- # Gather values from the 4 corners using advanced indexing
85
- batch_idx = torch.arange(bs, device=value.device).view(bs, 1, 1, 1).expand(bs, num_queries, num_heads, num_points)
86
- head_idx = torch.arange(num_heads, device=value.device).view(1, 1, num_heads, 1).expand(bs, num_queries, num_heads, num_points)
87
-
88
- # Gather corner values with clamped indices, then apply corner masks
89
- v_tl = value_spatial[batch_idx, head_idx, :, y0_clamped, x0_clamped] * mask_tl
90
- v_tr = value_spatial[batch_idx, head_idx, :, y0_clamped, x1_clamped] * mask_tr
91
- v_bl = value_spatial[batch_idx, head_idx, :, y1_clamped, x0_clamped] * mask_bl
92
- v_br = value_spatial[batch_idx, head_idx, :, y1_clamped, x1_clamped] * mask_br
93
-
94
- # Bilinear interpolation
95
- sampled = w_tl * v_tl + w_tr * v_tr + w_bl * v_bl + w_br * v_br
96
-
97
- # Apply valid mask (only accumulate if entire sample location is valid)
98
- sampled = sampled * valid.unsqueeze(-1).float()
99
-
100
- # Apply attention weights and sum over points
101
- # weight: (bs, num_queries, num_heads, num_points)
102
- # Expand weight: (bs, num_queries, num_heads, num_points, 1)
103
- weighted_sampled = sampled * weight.unsqueeze(-1)
104
-
105
- # Sum over points: (bs, num_queries, num_heads, channels)
106
- output += weighted_sampled.sum(dim=3)
107
-
108
- # Flatten last two dimensions to match kernel output
109
- return output.reshape(bs, num_queries, num_heads * channels)
110
 
111
 
112
  run_benchmark(
113
  kernel_type=KernelTypeEnum.DEFORMABLE_DETR,
114
- impl_name="torch_eager",
115
- impl_tags={"family": "pytorch", "backend": "eager"},
116
- impl_func=torch_deformable_detr,
117
  dtype="float32",
118
  )
 
4
  # "numpy",
5
  # "torch==2.8.0",
6
  # "kernels-benchmark-tools",
7
+ # "kernels",
8
  # ]
9
  #
10
  # [tool.uv.sources]
 
13
  import torch
14
  import sys
15
  from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
16
+ from kernels import get_kernel
17
 
18
+ # Load the deformable DETR kernel
19
+ deformable_detr = get_kernel("kernels-community/deformable-detr")
20
 
21
+
22
+ def hf_kernels_deformable_detr(
23
  value, spatial_shapes, level_start_index, sampling_locations, attention_weights, im2col_step=64
24
  ):
25
+ """HuggingFace Kernels Deformable DETR Multi-Scale Deformable Attention"""
26
+ return deformable_detr.ms_deform_attn_forward(
27
+ value=value,
28
+ spatial_shapes=spatial_shapes,
29
+ level_start_index=level_start_index,
30
+ sampling_loc=sampling_locations,
31
+ attn_weight=attention_weights,
32
+ im2col_step=im2col_step
33
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
 
36
  run_benchmark(
37
  kernel_type=KernelTypeEnum.DEFORMABLE_DETR,
38
+ impl_name="hf_kernels_deformable_detr",
39
+ impl_tags={"family": "hf-kernels", "backend": "cuda"},
40
+ impl_func=hf_kernels_deformable_detr,
41
  dtype="float32",
42
  )
deformable_detr/impls/hf_kernels_deformable_detr.html CHANGED
@@ -3874,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
3874
  <div class="system-info">
3875
  <div class="system-info-header">Generated on:</div>
3876
  <div class="system-info-content">
3877
- Linux x86_64 | Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35
3878
  </div>
3879
  </div>
3880
 
@@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content {
3888
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3890
  </span> |
3891
- Cell: nv | 0.22s
3892
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3894
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3905,16 +3905,16 @@ Cell: nv | 0.22s
3905
  </div>
3906
  </div>
3907
  <div id="output-nv" class="cell-output">
3908
- <div class="cell-stdout"><pre class="stdout-text">Mon Nov 10 21:58:17 2025
3909
  +-----------------------------------------------------------------------------------------+
3910
- | NVIDIA-SMI 580.95.05 Driver Version: 580.95.05 CUDA Version: 13.0 |
3911
  +-----------------------------------------+------------------------+----------------------+
3912
  | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
3913
  | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
3914
  | | | MIG M. |
3915
  |=========================================+========================+======================|
3916
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3917
- | N/A 28C P0 79W / 350W | 0MiB / 46068MiB | 11% Default |
3918
  | | | N/A |
3919
  +-----------------------------------------+------------------------+----------------------+
3920
 
@@ -3938,7 +3938,7 @@ Cell: nv | 0.22s
3938
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3939
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3940
  </span> |
3941
- Cell: benchmark | 8.74s
3942
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3943
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3944
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -4003,24 +4003,24 @@ PROFILE TRACE: hf_kernels_deformable_detr | cuda_B1_Q100_H8_E256_L4_P4
4003
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4004
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4005
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4006
- hf_kernels_deformable_detr 0.00% 0.000us 0.00% 0.000us 0.000us 189.823us 748.99% 189.823us 189.823us 1
4007
- hf_kernels_deformable_detr 6.28% 137.822us 99.65% 2.188ms 2.188ms 0.000us 0.00% 26.400us 26.400us 1
4008
- _deformable_detr_57c3d32::ms_deform_attn_forward 3.04% 66.841us 93.38% 2.051ms 683.551us 22.496us 88.76% 26.400us 8.800us 3
4009
- void ms_deformable_im2col_gpu_kernel&lt;float&gt;(int, flo... 0.00% 0.000us 0.00% 0.000us 0.000us 22.496us 88.76% 22.496us 7.499us 3
4010
- aten::zeros 0.83% 18.191us 87.50% 1.922ms 640.537us 0.000us 0.00% 3.904us 1.301us 3
4011
- aten::zero_ 0.64% 14.160us 85.08% 1.868ms 622.823us 0.000us 0.00% 3.904us 1.301us 3
4012
- aten::fill_ 1.45% 31.860us 84.44% 1.854ms 618.103us 2.848us 11.24% 3.904us 1.301us 3
4013
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.848us 11.24% 2.848us 0.949us 3
4014
- Activity Buffer Request 80.97% 1.778ms 80.97% 1.778ms 1.778ms 1.056us 4.17% 1.056us 1.056us 1
4015
- aten::empty 1.59% 34.950us 1.59% 34.950us 11.650us 0.000us 0.00% 0.000us 0.000us 3
4016
- cudaLaunchKernel 2.83% 62.083us 2.83% 62.083us 10.347us 0.000us 0.00% 0.000us 0.000us 6
4017
- aten::view 0.81% 17.870us 0.81% 17.870us 2.978us 0.000us 0.00% 0.000us 0.000us 6
4018
- aten::select 1.01% 22.200us 1.21% 26.600us 8.867us 0.000us 0.00% 0.000us 0.000us 3
4019
- aten::as_strided 0.20% 4.400us 0.20% 4.400us 1.467us 0.000us 0.00% 0.000us 0.000us 3
4020
- cudaDeviceSynchronize 0.35% 7.640us 0.35% 7.640us 7.640us 0.000us 0.00% 0.000us 0.000us 1
4021
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4022
- Self CPU time total: 2.196ms
4023
- Self CUDA time total: 25.344us
4024
 
4025
 
4026
 
@@ -4030,24 +4030,24 @@ PROFILE TRACE: hf_kernels_deformable_detr | cuda_B1_Q300_H8_E256_L4_P4
4030
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4031
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4032
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4033
- hf_kernels_deformable_detr 0.00% 0.000us 0.00% 0.000us 0.000us 134.592us 507.36% 134.592us 134.592us 1
4034
- hf_kernels_deformable_detr 3.69% 73.590us 99.72% 1.986ms 1.986ms 0.000us 0.00% 27.456us 27.456us 1
4035
- _deformable_detr_57c3d32::ms_deform_attn_forward 1.62% 32.200us 96.02% 1.913ms 637.550us 23.712us 89.38% 27.456us 9.152us 3
4036
- void ms_deformable_im2col_gpu_kernel&lt;float&gt;(int, flo... 0.00% 0.000us 0.00% 0.000us 0.000us 23.712us 89.38% 23.712us 7.904us 3
4037
- aten::zeros 0.41% 8.111us 92.57% 1.844ms 614.623us 0.000us 0.00% 3.744us 1.248us 3
4038
- aten::zero_ 0.44% 8.741us 91.34% 1.819ms 606.446us 0.000us 0.00% 3.744us 1.248us 3
4039
- aten::fill_ 1.32% 26.360us 90.90% 1.811ms 603.533us 2.816us 10.62% 3.744us 1.248us 3
4040
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.816us 10.62% 2.816us 0.939us 3
4041
- Activity Buffer Request 88.30% 1.759ms 88.30% 1.759ms 1.759ms 0.928us 3.50% 0.928us 0.928us 1
4042
- aten::empty 0.82% 16.420us 0.82% 16.420us 5.473us 0.000us 0.00% 0.000us 0.000us 3
4043
- cudaLaunchKernel 2.00% 39.862us 2.00% 39.862us 6.644us 0.000us 0.00% 0.000us 0.000us 6
4044
- aten::view 0.45% 9.050us 0.45% 9.050us 1.508us 0.000us 0.00% 0.000us 0.000us 6
4045
- aten::select 0.54% 10.840us 0.66% 13.190us 4.397us 0.000us 0.00% 0.000us 0.000us 3
4046
- aten::as_strided 0.12% 2.350us 0.12% 2.350us 0.783us 0.000us 0.00% 0.000us 0.000us 3
4047
- cudaDeviceSynchronize 0.28% 5.611us 0.28% 5.611us 5.611us 0.000us 0.00% 0.000us 0.000us 1
4048
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4049
- Self CPU time total: 1.992ms
4050
- Self CUDA time total: 26.528us
4051
 
4052
 
4053
 
@@ -4057,24 +4057,24 @@ PROFILE TRACE: hf_kernels_deformable_detr | cuda_B2_Q100_H8_E256_L4_P4
4057
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4058
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4059
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4060
- hf_kernels_deformable_detr 0.00% 0.000us 0.00% 0.000us 0.000us 138.240us 537.98% 138.240us 138.240us 1
4061
- hf_kernels_deformable_detr 3.56% 70.651us 99.71% 1.981ms 1.981ms 0.000us 0.00% 26.624us 26.624us 1
4062
- _deformable_detr_57c3d32::ms_deform_attn_forward 1.67% 33.240us 96.15% 1.910ms 636.753us 22.912us 89.17% 26.624us 8.875us 3
4063
- void ms_deformable_im2col_gpu_kernel&lt;float&gt;(int, flo... 0.00% 0.000us 0.00% 0.000us 0.000us 22.912us 89.17% 22.912us 7.637us 3
4064
- aten::zeros 0.41% 8.110us 92.55% 1.839ms 612.899us 0.000us 0.00% 3.712us 1.237us 3
4065
- aten::zero_ 0.40% 7.959us 91.32% 1.814ms 604.749us 0.000us 0.00% 3.712us 1.237us 3
4066
- aten::fill_ 1.22% 24.170us 90.92% 1.806ms 602.096us 2.784us 10.83% 3.712us 1.237us 3
4067
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.784us 10.83% 2.784us 0.928us 3
4068
- Activity Buffer Request 88.35% 1.755ms 88.35% 1.755ms 1.755ms 0.928us 3.61% 0.928us 0.928us 1
4069
- aten::empty 0.82% 16.340us 0.82% 16.340us 5.447us 0.000us 0.00% 0.000us 0.000us 3
4070
- cudaLaunchKernel 2.09% 41.501us 2.09% 41.501us 6.917us 0.000us 0.00% 0.000us 0.000us 6
4071
- aten::view 0.44% 8.661us 0.44% 8.661us 1.444us 0.000us 0.00% 0.000us 0.000us 6
4072
- aten::select 0.62% 12.301us 0.75% 14.971us 4.990us 0.000us 0.00% 0.000us 0.000us 3
4073
- aten::as_strided 0.13% 2.670us 0.13% 2.670us 0.890us 0.000us 0.00% 0.000us 0.000us 3
4074
- cudaDeviceSynchronize 0.29% 5.820us 0.29% 5.820us 5.820us 0.000us 0.00% 0.000us 0.000us 1
4075
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4076
- Self CPU time total: 1.987ms
4077
- Self CUDA time total: 25.696us
4078
 
4079
 
4080
 
@@ -4084,28 +4084,28 @@ PROFILE TRACE: hf_kernels_deformable_detr | cuda_B2_Q300_H8_E256_L4_P4
4084
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4085
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4086
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4087
- hf_kernels_deformable_detr 0.00% 0.000us 0.00% 0.000us 0.000us 151.169us 321.37% 151.169us 151.169us 1
4088
- hf_kernels_deformable_detr 3.15% 71.770us 99.78% 2.275ms 2.275ms 0.000us 0.00% 48.031us 48.031us 1
4089
- _deformable_detr_57c3d32::ms_deform_attn_forward 1.55% 35.341us 96.63% 2.204ms 734.529us 44.000us 93.54% 48.031us 16.010us 3
4090
- void ms_deformable_im2col_gpu_kernel&lt;float&gt;(int, flo... 0.00% 0.000us 0.00% 0.000us 0.000us 44.000us 93.54% 44.000us 14.667us 3
4091
- aten::zeros 0.38% 8.571us 93.48% 2.132ms 710.555us 0.000us 0.00% 4.031us 1.344us 3
4092
- aten::zero_ 0.42% 9.580us 92.38% 2.107ms 702.221us 0.000us 0.00% 4.031us 1.344us 3
4093
- aten::fill_ 1.16% 26.560us 91.96% 2.097ms 699.028us 3.039us 6.46% 4.031us 1.344us 3
4094
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 3.039us 6.46% 3.039us 1.013us 3
4095
- Activity Buffer Request 80.85% 1.844ms 80.85% 1.844ms 1.844ms 0.992us 2.11% 0.992us 0.992us 1
4096
- aten::empty 0.72% 16.430us 0.72% 16.430us 5.477us 0.000us 0.00% 0.000us 0.000us 3
4097
- cudaLaunchKernel 10.56% 240.915us 10.56% 240.915us 40.153us 0.000us 0.00% 0.000us 0.000us 6
4098
- aten::view 0.41% 9.238us 0.41% 9.238us 1.540us 0.000us 0.00% 0.000us 0.000us 6
4099
- aten::select 0.48% 10.832us 0.58% 13.262us 4.421us 0.000us 0.00% 0.000us 0.000us 3
4100
- aten::as_strided 0.11% 2.430us 0.11% 2.430us 0.810us 0.000us 0.00% 0.000us 0.000us 3
4101
- cudaDeviceSynchronize 0.22% 4.990us 0.22% 4.990us 4.990us 0.000us 0.00% 0.000us 0.000us 1
4102
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4103
- Self CPU time total: 2.280ms
4104
- Self CUDA time total: 47.039us
4105
 
4106
 
4107
  impl wl p50(ms) ok
4108
- hf_kernels_deformable_detr cuda_B1_Q100_H8_E256_L4_P4 0.03 True
4109
  hf_kernels_deformable_detr cuda_B1_Q300_H8_E256_L4_P4 0.04 True
4110
  hf_kernels_deformable_detr cuda_B2_Q100_H8_E256_L4_P4 0.04 True
4111
  hf_kernels_deformable_detr cuda_B2_Q300_H8_E256_L4_P4 0.05 True
@@ -4113,12 +4113,14 @@ hf_kernels_deformable_detr cuda_B2_Q300_H8_E256_L4_P4 0.05 True
4113
  <div class="uv-install-logs" id="uv-logs-benchmark">
4114
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4115
  <div class="uv-logs-content" style="display: none;">
4116
- Installed 52 packages in 262ms
4117
  </div>
4118
  </div>
4119
- <div class="cell-stderr">Fetching 7 files: 0%| | 0/7 [00:00&lt;?, ?it/s]
4120
- Fetching 7 files: 71%|███████▏ | 5/7 [00:00&lt;00:00, 9.96it/s]
4121
- Fetching 7 files: 100%|██████████| 7/7 [00:00&lt;00:00, 13.94it/s]</div>
 
 
4122
  <div class="cell-artifacts">
4123
  <h4>Artifacts:</h4>
4124
  <a href="artifacts/benchmark/deformable_detr.jsonl" class="artifact" target="_blank">deformable_detr.jsonl</a>
 
3874
  <div class="system-info">
3875
  <div class="system-info-header">Generated on:</div>
3876
  <div class="system-info-content">
3877
+ Linux x86_64 | Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35
3878
  </div>
3879
  </div>
3880
 
 
3888
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3890
  </span> |
3891
+ Cell: nv | 0.25s
3892
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3894
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
 
3905
  </div>
3906
  </div>
3907
  <div id="output-nv" class="cell-output">
3908
+ <div class="cell-stdout"><pre class="stdout-text">Fri Dec 19 18:55:49 2025
3909
  +-----------------------------------------------------------------------------------------+
3910
+ | NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 |
3911
  +-----------------------------------------+------------------------+----------------------+
3912
  | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
3913
  | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
3914
  | | | MIG M. |
3915
  |=========================================+========================+======================|
3916
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3917
+ | N/A 30C P0 77W / 350W | 0MiB / 46068MiB | 11% Default |
3918
  | | | N/A |
3919
  +-----------------------------------------+------------------------+----------------------+
3920
 
 
3938
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3939
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3940
  </span> |
3941
+ Cell: benchmark | 4.73s
3942
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3943
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3944
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
4003
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4004
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4005
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4006
+ hf_kernels_deformable_detr 0.00% 0.000us 0.00% 0.000us 0.000us 199.744us 791.13% 199.744us 199.744us 1
4007
+ hf_kernels_deformable_detr 5.99% 129.162us 99.60% 2.148ms 2.148ms 0.000us 0.00% 26.304us 26.304us 1
4008
+ _deformable_detr_57c3d32::ms_deform_attn_forward 3.04% 65.452us 93.61% 2.019ms 672.874us 22.336us 88.47% 26.304us 8.768us 3
4009
+ void ms_deformable_im2col_gpu_kernel&lt;float&gt;(int, flo... 0.00% 0.000us 0.00% 0.000us 0.000us 22.336us 88.47% 22.336us 7.445us 3
4010
+ aten::zeros 0.91% 19.609us 87.96% 1.897ms 632.230us 0.000us 0.00% 3.968us 1.323us 3
4011
+ aten::zero_ 0.66% 14.208us 85.42% 1.842ms 614.026us 0.000us 0.00% 3.968us 1.323us 3
4012
+ aten::fill_ 1.51% 32.653us 84.76% 1.828ms 609.290us 2.912us 11.53% 3.968us 1.323us 3
4013
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.912us 11.53% 2.912us 0.971us 3
4014
+ Activity Buffer Request 81.38% 1.755ms 81.38% 1.755ms 1.755ms 1.056us 4.18% 1.056us 1.056us 1
4015
+ aten::empty 1.62% 35.003us 1.62% 35.003us 11.668us 0.000us 0.00% 0.000us 0.000us 3
4016
+ cudaLaunchKernel 2.65% 57.140us 2.65% 57.140us 9.523us 0.000us 0.00% 0.000us 0.000us 6
4017
+ aten::view 0.79% 17.140us 0.79% 17.140us 2.857us 0.000us 0.00% 0.000us 0.000us 6
4018
+ aten::select 0.89% 19.100us 1.05% 22.620us 7.540us 0.000us 0.00% 0.000us 0.000us 3
4019
+ aten::as_strided 0.16% 3.520us 0.16% 3.520us 1.173us 0.000us 0.00% 0.000us 0.000us 3
4020
+ cudaDeviceSynchronize 0.40% 8.641us 0.40% 8.641us 8.641us 0.000us 0.00% 0.000us 0.000us 1
4021
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4022
+ Self CPU time total: 2.156ms
4023
+ Self CUDA time total: 25.248us
4024
 
4025
 
4026
 
 
4030
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4031
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4032
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4033
+ hf_kernels_deformable_detr 0.00% 0.000us 0.00% 0.000us 0.000us 136.418us 517.99% 136.418us 136.418us 1
4034
+ hf_kernels_deformable_detr 5.06% 104.032us 99.72% 2.049ms 2.049ms 0.000us 0.00% 27.296us 27.296us 1
4035
+ _deformable_detr_57c3d32::ms_deform_attn_forward 1.59% 32.619us 94.66% 1.945ms 648.480us 23.488us 89.19% 27.296us 9.099us 3
4036
+ void ms_deformable_im2col_gpu_kernel&lt;float&gt;(int, flo... 0.00% 0.000us 0.00% 0.000us 0.000us 23.488us 89.19% 23.488us 7.829us 3
4037
+ aten::zeros 0.44% 8.979us 91.25% 1.875ms 625.117us 0.000us 0.00% 3.808us 1.269us 3
4038
+ aten::zero_ 0.41% 8.351us 89.97% 1.849ms 616.327us 0.000us 0.00% 3.808us 1.269us 3
4039
+ aten::fill_ 1.21% 24.960us 89.56% 1.841ms 613.543us 2.848us 10.81% 3.808us 1.269us 3
4040
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.848us 10.81% 2.848us 0.949us 3
4041
+ Activity Buffer Request 87.10% 1.790ms 87.10% 1.790ms 1.790ms 0.960us 3.65% 0.960us 0.960us 1
4042
+ aten::empty 0.85% 17.391us 0.85% 17.391us 5.797us 0.000us 0.00% 0.000us 0.000us 3
4043
+ cudaLaunchKernel 1.95% 40.151us 1.95% 40.151us 6.692us 0.000us 0.00% 0.000us 0.000us 6
4044
+ aten::view 0.44% 9.121us 0.44% 9.121us 1.520us 0.000us 0.00% 0.000us 0.000us 6
4045
+ aten::select 0.58% 11.950us 0.68% 13.920us 4.640us 0.000us 0.00% 0.000us 0.000us 3
4046
+ aten::as_strided 0.10% 1.970us 0.10% 1.970us 0.657us 0.000us 0.00% 0.000us 0.000us 3
4047
+ cudaDeviceSynchronize 0.28% 5.670us 0.28% 5.670us 5.670us 0.000us 0.00% 0.000us 0.000us 1
4048
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4049
+ Self CPU time total: 2.055ms
4050
+ Self CUDA time total: 26.336us
4051
 
4052
 
4053
 
 
4057
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4058
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4059
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4060
+ hf_kernels_deformable_detr 0.00% 0.000us 0.00% 0.000us 0.000us 138.431us 541.44% 138.431us 138.431us 1
4061
+ hf_kernels_deformable_detr 4.88% 96.691us 99.73% 1.977ms 1.977ms 0.000us 0.00% 26.495us 26.495us 1
4062
+ _deformable_detr_57c3d32::ms_deform_attn_forward 1.70% 33.709us 94.86% 1.881ms 626.893us 22.783us 89.11% 26.495us 8.832us 3
4063
+ void ms_deformable_im2col_gpu_kernel&lt;float&gt;(int, flo... 0.00% 0.000us 0.00% 0.000us 0.000us 22.783us 89.11% 22.783us 7.594us 3
4064
+ aten::zeros 0.43% 8.511us 91.28% 1.810ms 603.293us 0.000us 0.00% 3.712us 1.237us 3
4065
+ aten::zero_ 0.42% 8.319us 90.02% 1.785ms 594.946us 0.000us 0.00% 3.712us 1.237us 3
4066
+ aten::fill_ 1.36% 26.920us 89.60% 1.777ms 592.173us 2.784us 10.89% 3.712us 1.237us 3
4067
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.784us 10.89% 2.784us 0.928us 3
4068
+ Activity Buffer Request 86.99% 1.725ms 86.99% 1.725ms 1.725ms 0.928us 3.63% 0.928us 0.928us 1
4069
+ aten::empty 0.83% 16.530us 0.83% 16.530us 5.510us 0.000us 0.00% 0.000us 0.000us 3
4070
+ cudaLaunchKernel 1.99% 39.553us 1.99% 39.553us 6.592us 0.000us 0.00% 0.000us 0.000us 6
4071
+ aten::view 0.47% 9.270us 0.47% 9.270us 1.545us 0.000us 0.00% 0.000us 0.000us 6
4072
+ aten::select 0.56% 11.070us 0.66% 13.141us 4.380us 0.000us 0.00% 0.000us 0.000us 3
4073
+ aten::as_strided 0.10% 2.071us 0.10% 2.071us 0.690us 0.000us 0.00% 0.000us 0.000us 3
4074
+ cudaDeviceSynchronize 0.27% 5.300us 0.27% 5.300us 5.300us 0.000us 0.00% 0.000us 0.000us 1
4075
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4076
+ Self CPU time total: 1.983ms
4077
+ Self CUDA time total: 25.567us
4078
 
4079
 
4080
 
 
4084
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4085
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4086
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4087
+ hf_kernels_deformable_detr 0.00% 0.000us 0.00% 0.000us 0.000us 142.528us 304.46% 142.528us 142.528us 1
4088
+ hf_kernels_deformable_detr 4.36% 98.391us 99.78% 2.253ms 2.253ms 0.000us 0.00% 47.838us 47.838us 1
4089
+ _deformable_detr_57c3d32::ms_deform_attn_forward 1.43% 32.311us 95.42% 2.155ms 718.335us 43.743us 93.44% 47.838us 15.946us 3
4090
+ void ms_deformable_im2col_gpu_kernel&lt;float&gt;(int, flo... 0.00% 0.000us 0.00% 0.000us 0.000us 43.743us 93.44% 43.743us 14.581us 3
4091
+ aten::zeros 0.35% 7.869us 92.42% 2.087ms 695.715us 0.000us 0.00% 4.095us 1.365us 3
4092
+ aten::zero_ 0.37% 8.381us 91.32% 2.062ms 687.455us 0.000us 0.00% 4.095us 1.365us 3
4093
+ aten::fill_ 1.13% 25.460us 90.95% 2.054ms 684.661us 3.071us 6.56% 4.095us 1.365us 3
4094
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 3.071us 6.56% 3.071us 1.024us 3
4095
+ Activity Buffer Request 79.30% 1.791ms 79.30% 1.791ms 1.791ms 1.024us 2.19% 1.024us 1.024us 1
4096
+ aten::empty 0.75% 16.910us 0.75% 16.910us 5.637us 0.000us 0.00% 0.000us 0.000us 3
4097
+ cudaLaunchKernel 11.13% 251.265us 11.13% 251.265us 41.878us 0.000us 0.00% 0.000us 0.000us 6
4098
+ aten::view 0.41% 9.300us 0.41% 9.300us 1.550us 0.000us 0.00% 0.000us 0.000us 6
4099
+ aten::select 0.48% 10.740us 0.56% 12.720us 4.240us 0.000us 0.00% 0.000us 0.000us 3
4100
+ aten::as_strided 0.09% 1.980us 0.09% 1.980us 0.660us 0.000us 0.00% 0.000us 0.000us 3
4101
+ cudaDeviceSynchronize 0.22% 4.929us 0.22% 4.929us 4.929us 0.000us 0.00% 0.000us 0.000us 1
4102
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4103
+ Self CPU time total: 2.258ms
4104
+ Self CUDA time total: 46.814us
4105
 
4106
 
4107
  impl wl p50(ms) ok
4108
+ hf_kernels_deformable_detr cuda_B1_Q100_H8_E256_L4_P4 0.04 True
4109
  hf_kernels_deformable_detr cuda_B1_Q300_H8_E256_L4_P4 0.04 True
4110
  hf_kernels_deformable_detr cuda_B2_Q100_H8_E256_L4_P4 0.04 True
4111
  hf_kernels_deformable_detr cuda_B2_Q300_H8_E256_L4_P4 0.05 True
 
4113
  <div class="uv-install-logs" id="uv-logs-benchmark">
4114
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4115
  <div class="uv-logs-content" style="display: none;">
4116
+ Installed 14 packages in 11ms
4117
  </div>
4118
  </div>
4119
+ <div class="cell-stderr">Fetching 7 files: 0%| | 0/7 [00:00&lt;?, ?it/s]Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.
4120
+
4121
+ Fetching 7 files: 14%|█▍ | 1/7 [00:00&lt;00:02, 2.99it/s]
4122
+ Fetching 7 files: 71%|███████▏ | 5/7 [00:00&lt;00:00, 9.51it/s]
4123
+ Fetching 7 files: 100%|██████████| 7/7 [00:00&lt;00:00, 11.76it/s]</div>
4124
  <div class="cell-artifacts">
4125
  <h4>Artifacts:</h4>
4126
  <a href="artifacts/benchmark/deformable_detr.jsonl" class="artifact" target="_blank">deformable_detr.jsonl</a>
deformable_detr/impls/torch_deformable_detr.html CHANGED
@@ -3874,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
3874
  <div class="system-info">
3875
  <div class="system-info-header">Generated on:</div>
3876
  <div class="system-info-content">
3877
- Linux x86_64 | Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35
3878
  </div>
3879
  </div>
3880
 
@@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content {
3888
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3890
  </span> |
3891
- Cell: nv | 0.22s
3892
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3894
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3904,16 +3904,16 @@ Cell: nv | 0.22s
3904
  </div>
3905
  </div>
3906
  <div id="output-nv" class="cell-output">
3907
- <div class="cell-stdout"><pre class="stdout-text">Mon Nov 10 21:58:17 2025
3908
  +-----------------------------------------------------------------------------------------+
3909
- | NVIDIA-SMI 580.95.05 Driver Version: 580.95.05 CUDA Version: 13.0 |
3910
  +-----------------------------------------+------------------------+----------------------+
3911
  | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
3912
  | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
3913
  | | | MIG M. |
3914
  |=========================================+========================+======================|
3915
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3916
- | N/A 28C P0 79W / 350W | 0MiB / 46068MiB | 11% Default |
3917
  | | | N/A |
3918
  +-----------------------------------------+------------------------+----------------------+
3919
 
@@ -3935,9 +3935,9 @@ Cell: nv | 0.22s
3935
  <span class="collapse-indicators">
3936
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3937
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3938
- <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3939
  </span> |
3940
- Cell: benchmark | 5.50s
3941
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3942
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3943
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -4077,29 +4077,29 @@ PROFILE TRACE: torch_eager | cuda_B1_Q100_H8_E256_L4_P4
4077
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4078
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4079
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4080
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 20.386ms 1374.87% 20.386ms 20.386ms 1
4081
- torch_eager 20.04% 4.485ms 99.97% 22.369ms 22.369ms 0.000us 0.00% 1.484ms 1.484ms 1
4082
- aten::index 4.49% 1.004ms 16.23% 3.633ms 75.679us 237.283us 16.00% 370.795us 7.725us 48
4083
- aten::copy_ 4.62% 1.034ms 11.24% 2.516ms 11.489us 365.611us 24.66% 365.611us 1.669us 219
4084
- aten::mul 5.81% 1.299ms 10.43% 2.335ms 12.160us 293.820us 19.82% 293.820us 1.530us 192
4085
- void at::native::index_elementwise_kernel&lt;128, 4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 237.283us 16.00% 237.283us 4.943us 48
4086
- aten::to 0.57% 127.097us 11.08% 2.479ms 14.499us 0.000us 0.00% 232.099us 1.357us 171
4087
- aten::_to_copy 2.30% 514.876us 10.51% 2.352ms 19.124us 0.000us 0.00% 232.099us 1.887us 123
4088
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 202.015us 13.62% 202.015us 1.683us 120
4089
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 167.684us 11.31% 167.684us 1.996us 84
4090
- aten::contiguous 0.35% 77.804us 8.37% 1.873ms 19.513us 0.000us 0.00% 133.512us 1.391us 96
4091
- aten::clone 0.74% 165.226us 8.02% 1.795ms 18.702us 0.000us 0.00% 133.512us 1.391us 96
4092
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 133.512us 9.00% 133.512us 1.391us 96
4093
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 115.524us 7.79% 115.524us 1.203us 96
4094
- aten::__and__ 1.20% 268.284us 4.94% 1.105ms 13.160us 0.000us 0.00% 99.070us 1.179us 84
4095
- aten::bitwise_and 2.22% 496.516us 3.74% 837.149us 9.966us 99.070us 6.68% 99.070us 1.179us 84
4096
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 99.070us 6.68% 99.070us 1.179us 84
4097
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 86.210us 5.81% 86.210us 1.197us 72
4098
- aten::sub 2.17% 485.693us 3.77% 844.019us 11.722us 79.300us 5.35% 79.300us 1.101us 72
4099
- aten::add 1.71% 382.016us 2.87% 642.388us 10.706us 74.367us 5.02% 74.367us 1.239us 60
4100
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4101
- Self CPU time total: 22.377ms
4102
- Self CUDA time total: 1.483ms
4103
 
4104
 
4105
 
@@ -4109,29 +4109,29 @@ PROFILE TRACE: torch_eager | cuda_B1_Q300_H8_E256_L4_P4
4109
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4110
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4111
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4112
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 18.901ms 1183.82% 18.901ms 18.901ms 1
4113
- torch_eager 19.58% 4.093ms 99.97% 20.894ms 20.894ms 0.000us 0.00% 1.598ms 1.598ms 1
4114
- aten::index 4.47% 934.204us 16.39% 3.425ms 71.358us 251.679us 15.76% 384.126us 8.003us 48
4115
- aten::copy_ 4.82% 1.008ms 11.62% 2.429ms 11.090us 366.752us 22.97% 366.752us 1.675us 219
4116
- aten::mul 6.02% 1.258ms 10.56% 2.208ms 11.499us 358.660us 22.46% 358.660us 1.868us 192
4117
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 266.913us 16.72% 266.913us 2.224us 120
4118
- void at::native::index_elementwise_kernel&lt;128, 4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 251.679us 15.76% 251.679us 5.243us 48
4119
- aten::to 0.53% 111.534us 10.80% 2.257ms 13.199us 0.000us 0.00% 234.305us 1.370us 171
4120
- aten::_to_copy 1.86% 389.526us 10.27% 2.146ms 17.443us 0.000us 0.00% 234.305us 1.905us 123
4121
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 169.699us 10.63% 169.699us 2.020us 84
4122
- aten::contiguous 0.36% 76.248us 8.65% 1.808ms 18.835us 0.000us 0.00% 132.447us 1.380us 96
4123
- aten::clone 0.75% 157.022us 8.29% 1.732ms 18.040us 0.000us 0.00% 132.447us 1.380us 96
4124
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 132.447us 8.30% 132.447us 1.380us 96
4125
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 117.700us 7.37% 117.700us 1.226us 96
4126
- aten::__and__ 0.39% 80.574us 4.34% 907.528us 10.804us 0.000us 0.00% 104.931us 1.249us 84
4127
- aten::bitwise_and 2.39% 499.734us 3.96% 826.954us 9.845us 104.931us 6.57% 104.931us 1.249us 84
4128
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 104.931us 6.57% 104.931us 1.249us 84
4129
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 104.254us 6.53% 104.254us 1.448us 72
4130
- aten::add 1.76% 366.940us 2.98% 622.302us 10.372us 91.679us 5.74% 91.679us 1.528us 60
4131
- aten::sub 2.26% 472.751us 3.91% 817.040us 11.348us 80.412us 5.04% 80.412us 1.117us 72
4132
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4133
- Self CPU time total: 20.900ms
4134
- Self CUDA time total: 1.597ms
4135
 
4136
 
4137
 
@@ -4141,29 +4141,29 @@ PROFILE TRACE: torch_eager | cuda_B2_Q100_H8_E256_L4_P4
4141
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4142
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4143
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4144
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 19.237ms 1248.03% 19.237ms 19.237ms 1
4145
- torch_eager 19.69% 4.158ms 99.97% 21.112ms 21.112ms 0.000us 0.00% 1.542ms 1.542ms 1
4146
- aten::index 4.41% 930.777us 16.28% 3.439ms 71.641us 244.707us 15.88% 379.074us 7.897us 48
4147
- aten::copy_ 4.79% 1.012ms 11.88% 2.509ms 11.455us 367.613us 23.85% 367.613us 1.679us 219
4148
- aten::mul 6.03% 1.274ms 10.79% 2.279ms 11.869us 324.897us 21.08% 324.897us 1.692us 192
4149
- void at::native::index_elementwise_kernel&lt;128, 4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 244.707us 15.88% 244.707us 5.098us 48
4150
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 233.822us 15.17% 233.822us 1.949us 120
4151
- aten::to 0.53% 111.710us 11.01% 2.324ms 13.591us 0.000us 0.00% 233.246us 1.364us 171
4152
- aten::_to_copy 1.89% 399.701us 10.48% 2.212ms 17.986us 0.000us 0.00% 233.246us 1.896us 123
4153
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 168.798us 10.95% 168.798us 2.010us 84
4154
- aten::contiguous 0.36% 76.215us 8.56% 1.808ms 18.834us 0.000us 0.00% 134.367us 1.400us 96
4155
- aten::clone 0.70% 147.727us 8.20% 1.732ms 18.040us 0.000us 0.00% 134.367us 1.400us 96
4156
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 134.367us 8.72% 134.367us 1.400us 96
4157
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 116.097us 7.53% 116.097us 1.209us 96
4158
- aten::__and__ 0.38% 80.351us 4.40% 929.654us 11.067us 0.000us 0.00% 104.257us 1.241us 84
4159
- aten::bitwise_and 2.34% 493.964us 4.02% 849.303us 10.111us 104.257us 6.76% 104.257us 1.241us 84
4160
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 104.257us 6.76% 104.257us 1.241us 84
4161
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 96.124us 6.24% 96.124us 1.335us 72
4162
- aten::add 1.63% 344.862us 2.97% 627.717us 10.462us 83.898us 5.44% 83.898us 1.398us 60
4163
- aten::sub 2.25% 476.045us 3.91% 826.060us 11.473us 79.295us 5.14% 79.295us 1.101us 72
4164
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4165
- Self CPU time total: 21.118ms
4166
- Self CUDA time total: 1.541ms
4167
 
4168
 
4169
 
@@ -4173,37 +4173,43 @@ PROFILE TRACE: torch_eager | cuda_B2_Q300_H8_E256_L4_P4
4173
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4174
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4175
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4176
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 19.519ms 1100.37% 19.519ms 19.519ms 1
4177
- torch_eager 20.47% 4.142ms 99.97% 20.229ms 20.229ms 0.000us 0.00% 1.775ms 1.775ms 1
4178
- aten::mul 6.23% 1.261ms 11.26% 2.279ms 11.871us 452.223us 25.49% 452.223us 2.355us 192
4179
- aten::index 5.19% 1.050ms 17.90% 3.622ms 75.460us 284.479us 16.04% 422.205us 8.796us 48
4180
- aten::copy_ 4.94% 1.000ms 12.35% 2.500ms 11.414us 371.807us 20.96% 371.807us 1.698us 219
4181
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 357.379us 20.15% 357.379us 2.978us 120
4182
- void at::native::index_elementwise_kernel&lt;128, 4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 284.479us 16.04% 284.479us 5.927us 48
4183
- aten::to 0.55% 111.602us 11.50% 2.327ms 13.611us 0.000us 0.00% 234.081us 1.369us 171
4184
- aten::_to_copy 2.05% 415.176us 10.95% 2.216ms 18.015us 0.000us 0.00% 234.081us 1.903us 123
4185
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 168.127us 9.48% 168.127us 2.002us 84
4186
- aten::contiguous 0.39% 79.104us 9.03% 1.827ms 19.029us 0.000us 0.00% 137.726us 1.435us 96
4187
- aten::clone 0.75% 151.809us 8.64% 1.748ms 18.205us 0.000us 0.00% 137.726us 1.435us 96
4188
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 137.726us 7.76% 137.726us 1.435us 96
4189
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 129.254us 7.29% 129.254us 1.795us 72
4190
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 120.034us 6.77% 120.034us 1.250us 96
4191
- aten::add 1.70% 344.853us 3.02% 611.127us 10.185us 113.603us 6.40% 113.603us 1.893us 60
4192
- aten::__and__ 0.42% 84.251us 4.73% 957.185us 11.395us 0.000us 0.00% 108.833us 1.296us 84
4193
- aten::bitwise_and 2.53% 511.745us 4.31% 872.934us 10.392us 108.833us 6.14% 108.833us 1.296us 84
4194
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 108.833us 6.14% 108.833us 1.296us 84
4195
- aten::sub 2.33% 472.119us 4.10% 828.789us 11.511us 84.547us 4.77% 84.547us 1.174us 72
4196
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4197
- Self CPU time total: 20.235ms
4198
- Self CUDA time total: 1.774ms
4199
 
4200
 
4201
  impl wl p50(ms) ok
4202
- torch_eager cuda_B1_Q100_H8_E256_L4_P4 3.28 True
4203
- torch_eager cuda_B1_Q300_H8_E256_L4_P4 4.01 True
4204
- torch_eager cuda_B2_Q100_H8_E256_L4_P4 4.03 True
4205
- torch_eager cuda_B2_Q300_H8_E256_L4_P4 4.14 True
4206
  </pre></div>
 
 
 
 
 
 
4207
  <div class="cell-artifacts">
4208
  <h4>Artifacts:</h4>
4209
  <a href="artifacts/benchmark/deformable_detr.jsonl" class="artifact" target="_blank">deformable_detr.jsonl</a>
 
3874
  <div class="system-info">
3875
  <div class="system-info-header">Generated on:</div>
3876
  <div class="system-info-content">
3877
+ Linux x86_64 | Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35
3878
  </div>
3879
  </div>
3880
 
 
3888
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3890
  </span> |
3891
+ Cell: nv | 0.25s
3892
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3894
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
 
3904
  </div>
3905
  </div>
3906
  <div id="output-nv" class="cell-output">
3907
+ <div class="cell-stdout"><pre class="stdout-text">Fri Dec 19 18:55:49 2025
3908
  +-----------------------------------------------------------------------------------------+
3909
+ | NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 |
3910
  +-----------------------------------------+------------------------+----------------------+
3911
  | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
3912
  | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
3913
  | | | MIG M. |
3914
  |=========================================+========================+======================|
3915
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3916
+ | N/A 30C P0 77W / 350W | 0MiB / 46068MiB | 11% Default |
3917
  | | | N/A |
3918
  +-----------------------------------------+------------------------+----------------------+
3919
 
 
3935
  <span class="collapse-indicators">
3936
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3937
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3938
+ <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3939
  </span> |
3940
+ Cell: benchmark | 9.12s
3941
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3942
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3943
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
4077
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4078
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4079
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4080
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 19.928ms 1345.17% 19.928ms 19.928ms 1
4081
+ torch_eager 20.03% 4.383ms 99.97% 21.877ms 21.877ms 0.000us 0.00% 1.483ms 1.483ms 1
4082
+ aten::index 4.57% 999.946us 16.87% 3.693ms 76.930us 235.999us 15.93% 369.535us 7.699us 48
4083
+ aten::copy_ 4.70% 1.029ms 11.50% 2.517ms 11.491us 366.142us 24.72% 366.142us 1.672us 219
4084
+ aten::mul 5.86% 1.283ms 10.10% 2.209ms 11.507us 293.927us 19.84% 293.927us 1.531us 192
4085
+ void at::native::index_elementwise_kernel&lt;128, 4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 235.999us 15.93% 235.999us 4.917us 48
4086
+ aten::to 0.58% 126.416us 10.88% 2.380ms 13.921us 0.000us 0.00% 232.606us 1.360us 171
4087
+ aten::_to_copy 1.91% 417.236us 10.30% 2.254ms 18.325us 0.000us 0.00% 232.606us 1.891us 123
4088
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 202.308us 13.66% 202.308us 1.686us 120
4089
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 167.963us 11.34% 167.963us 2.000us 84
4090
+ aten::contiguous 0.37% 80.417us 8.79% 1.925ms 20.049us 0.000us 0.00% 133.536us 1.391us 96
4091
+ aten::clone 0.80% 175.766us 8.43% 1.844ms 19.211us 0.000us 0.00% 133.536us 1.391us 96
4092
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 133.536us 9.01% 133.536us 1.391us 96
4093
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 115.352us 7.79% 115.352us 1.202us 96
4094
+ aten::__and__ 0.45% 97.450us 4.50% 984.021us 11.715us 0.000us 0.00% 98.725us 1.175us 84
4095
+ aten::bitwise_and 2.51% 548.975us 4.05% 886.571us 10.554us 98.725us 6.66% 98.725us 1.175us 84
4096
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 98.725us 6.66% 98.725us 1.175us 84
4097
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 86.111us 5.81% 86.111us 1.196us 72
4098
+ aten::sub 2.21% 483.704us 3.73% 817.012us 11.347us 79.134us 5.34% 79.134us 1.099us 72
4099
+ aten::add 1.64% 359.872us 2.73% 597.608us 9.960us 74.367us 5.02% 74.367us 1.239us 60
4100
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4101
+ Self CPU time total: 21.884ms
4102
+ Self CUDA time total: 1.481ms
4103
 
4104
 
4105
 
 
4109
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4110
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4111
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4112
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 18.701ms 1173.82% 18.701ms 18.701ms 1
4113
+ torch_eager 19.86% 4.084ms 99.94% 20.549ms 20.549ms 0.000us 0.00% 1.594ms 1.594ms 1
4114
+ aten::index 4.47% 919.982us 16.50% 3.393ms 70.681us 250.075us 15.70% 381.947us 7.957us 48
4115
+ aten::copy_ 4.90% 1.007ms 11.73% 2.411ms 11.009us 365.571us 22.95% 365.571us 1.669us 219
4116
+ aten::mul 5.89% 1.211ms 10.29% 2.116ms 11.019us 357.953us 22.47% 357.953us 1.864us 192
4117
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 266.175us 16.71% 266.175us 2.218us 120
4118
+ void at::native::index_elementwise_kernel&lt;128, 4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 250.075us 15.70% 250.075us 5.210us 48
4119
+ aten::to 0.56% 115.808us 10.96% 2.254ms 13.183us 0.000us 0.00% 233.699us 1.367us 171
4120
+ aten::_to_copy 1.83% 375.992us 10.40% 2.138ms 17.386us 0.000us 0.00% 233.699us 1.900us 123
4121
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 169.346us 10.63% 169.346us 2.016us 84
4122
+ aten::contiguous 0.37% 76.815us 8.72% 1.793ms 18.680us 0.000us 0.00% 131.872us 1.374us 96
4123
+ aten::clone 0.79% 162.290us 8.35% 1.716ms 17.880us 0.000us 0.00% 131.872us 1.374us 96
4124
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 131.872us 8.28% 131.872us 1.374us 96
4125
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 117.600us 7.38% 117.600us 1.225us 96
4126
+ aten::__and__ 0.42% 86.722us 4.57% 939.170us 11.181us 0.000us 0.00% 105.348us 1.254us 84
4127
+ aten::bitwise_and 2.53% 520.363us 4.15% 852.448us 10.148us 105.348us 6.61% 105.348us 1.254us 84
4128
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 105.348us 6.61% 105.348us 1.254us 84
4129
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 104.196us 6.54% 104.196us 1.447us 72
4130
+ aten::add 1.65% 339.069us 2.79% 573.170us 9.553us 91.619us 5.75% 91.619us 1.527us 60
4131
+ aten::sub 2.16% 443.591us 3.72% 765.420us 10.631us 80.447us 5.05% 80.447us 1.117us 72
4132
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4133
+ Self CPU time total: 20.561ms
4134
+ Self CUDA time total: 1.593ms
4135
 
4136
 
4137
 
 
4141
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4142
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4143
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4144
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 19.345ms 1257.82% 19.345ms 19.345ms 1
4145
+ torch_eager 19.37% 4.137ms 99.97% 21.351ms 21.351ms 0.000us 0.00% 1.539ms 1.539ms 1
4146
+ aten::index 4.47% 955.266us 16.53% 3.530ms 73.551us 242.625us 15.78% 377.060us 7.855us 48
4147
+ aten::copy_ 4.74% 1.012ms 11.59% 2.476ms 11.307us 367.943us 23.92% 367.943us 1.680us 219
4148
+ aten::mul 5.81% 1.241ms 10.15% 2.167ms 11.287us 324.158us 21.08% 324.158us 1.688us 192
4149
+ void at::native::index_elementwise_kernel&lt;128, 4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 242.625us 15.78% 242.625us 5.055us 48
4150
+ aten::to 0.53% 113.722us 11.14% 2.380ms 13.916us 0.000us 0.00% 233.508us 1.366us 171
4151
+ aten::_to_copy 2.07% 441.682us 10.61% 2.266ms 18.422us 0.000us 0.00% 233.508us 1.898us 123
4152
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 233.472us 15.18% 233.472us 1.946us 120
4153
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 168.769us 10.97% 168.769us 2.009us 84
4154
+ aten::contiguous 0.38% 81.343us 8.57% 1.831ms 19.072us 0.000us 0.00% 134.435us 1.400us 96
4155
+ aten::clone 0.71% 151.394us 8.19% 1.750ms 18.225us 0.000us 0.00% 134.435us 1.400us 96
4156
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 134.435us 8.74% 134.435us 1.400us 96
4157
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 116.161us 7.55% 116.161us 1.210us 96
4158
+ aten::__and__ 0.37% 78.366us 4.26% 910.569us 10.840us 0.000us 0.00% 104.128us 1.240us 84
4159
+ aten::bitwise_and 2.32% 495.587us 3.90% 832.203us 9.907us 104.128us 6.77% 104.128us 1.240us 84
4160
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 104.128us 6.77% 104.128us 1.240us 84
4161
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 95.391us 6.20% 95.391us 1.325us 72
4162
+ aten::add 1.65% 352.101us 2.82% 602.659us 10.044us 83.522us 5.43% 83.522us 1.392us 60
4163
+ aten::sub 2.19% 467.179us 3.78% 806.853us 11.206us 79.169us 5.15% 79.169us 1.100us 72
4164
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4165
+ Self CPU time total: 21.357ms
4166
+ Self CUDA time total: 1.538ms
4167
 
4168
 
4169
 
 
4173
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4174
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4175
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4176
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 18.928ms 1070.52% 18.928ms 18.928ms 1
4177
+ torch_eager 19.00% 4.018ms 99.97% 21.144ms 21.144ms 0.000us 0.00% 1.769ms 1.769ms 1
4178
+ aten::mul 5.84% 1.234ms 10.44% 2.209ms 11.503us 449.959us 25.45% 449.959us 2.344us 192
4179
+ aten::index 4.43% 937.219us 16.19% 3.424ms 71.339us 281.246us 15.91% 418.466us 8.718us 48
4180
+ aten::copy_ 4.75% 1.005ms 11.71% 2.477ms 11.312us 370.923us 20.98% 370.923us 1.694us 219
4181
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 355.583us 20.11% 355.583us 2.963us 120
4182
+ void at::native::index_elementwise_kernel&lt;128, 4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 281.246us 15.91% 281.246us 5.859us 48
4183
+ aten::to 0.52% 110.789us 10.89% 2.302ms 13.465us 0.000us 0.00% 233.703us 1.367us 171
4184
+ aten::_to_copy 1.88% 398.545us 10.36% 2.192ms 17.819us 0.000us 0.00% 233.703us 1.900us 123
4185
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 168.070us 9.51% 168.070us 2.001us 84
4186
+ aten::contiguous 0.38% 80.073us 8.57% 1.813ms 18.880us 0.000us 0.00% 137.220us 1.429us 96
4187
+ aten::clone 0.71% 149.477us 8.19% 1.732ms 18.046us 0.000us 0.00% 137.220us 1.429us 96
4188
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 137.220us 7.76% 137.220us 1.429us 96
4189
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 128.960us 7.29% 128.960us 1.791us 72
4190
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 120.326us 6.81% 120.326us 1.253us 96
4191
+ aten::add 1.60% 338.443us 2.84% 599.957us 9.999us 113.407us 6.41% 113.407us 1.890us 60
4192
+ aten::__and__ 0.34% 72.039us 4.35% 919.096us 10.942us 0.000us 0.00% 109.028us 1.298us 84
4193
+ aten::bitwise_and 2.36% 498.512us 4.00% 847.057us 10.084us 109.028us 6.17% 109.028us 1.298us 84
4194
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 109.028us 6.17% 109.028us 1.298us 84
4195
+ aten::sub 2.14% 452.695us 3.86% 815.589us 11.328us 84.674us 4.79% 84.674us 1.176us 72
4196
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4197
+ Self CPU time total: 21.151ms
4198
+ Self CUDA time total: 1.768ms
4199
 
4200
 
4201
  impl wl p50(ms) ok
4202
+ torch_eager cuda_B1_Q100_H8_E256_L4_P4 3.29 True
4203
+ torch_eager cuda_B1_Q300_H8_E256_L4_P4 4.07 True
4204
+ torch_eager cuda_B2_Q100_H8_E256_L4_P4 4.07 True
4205
+ torch_eager cuda_B2_Q300_H8_E256_L4_P4 4.11 True
4206
  </pre></div>
4207
+ <div class="uv-install-logs" id="uv-logs-benchmark">
4208
+ <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4209
+ <div class="uv-logs-content" style="display: none;">
4210
+ Installed 37 packages in 286ms
4211
+ </div>
4212
+ </div>
4213
  <div class="cell-artifacts">
4214
  <h4>Artifacts:</h4>
4215
  <a href="artifacts/benchmark/deformable_detr.jsonl" class="artifact" target="_blank">deformable_detr.jsonl</a>
deformable_detr/results/artifacts/combine/latency.svg CHANGED

Git LFS Details

  • SHA256: 695c4932ff6f8af1541de47b9136d11cce1234f978728ac618aa9e13e86e4875
  • Pointer size: 130 Bytes
  • Size of remote file: 14.9 kB

Git LFS Details

  • SHA256: 8ba4297c6f7aa344148bb08f308ffa1b2639ebb4a639b03f7139f745563a6d78
  • Pointer size: 130 Bytes
  • Size of remote file: 17.8 kB
deformable_detr/results/combined_results.html CHANGED
@@ -3874,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
3874
  <div class="system-info">
3875
  <div class="system-info-header">Generated on:</div>
3876
  <div class="system-info-content">
3877
- Linux x86_64 | Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35
3878
  </div>
3879
  </div>
3880
 
@@ -3889,11 +3889,11 @@ body[data-tool="eraser"] .main-content {
3889
  <rdf:RDF>
3890
  <ns2:Work>
3891
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
3892
- <dc:date>2025-11-10T22:11:56.387573</dc:date>
3893
  <dc:format>image/svg+xml</dc:format>
3894
  <dc:creator>
3895
  <ns2:Agent>
3896
- <dc:title>Matplotlib v3.10.7, https://matplotlib.org/</dc:title>
3897
  </ns2:Agent>
3898
  </dc:creator>
3899
  </ns2:Work>
@@ -3908,208 +3908,260 @@ body[data-tool="eraser"] .main-content {
3908
  </g>
3909
  <g id="axes--1" class="axes">
3910
  <g id="patch_2">
3911
- <path d="M 39.870649 425.105974 L 824.19299 425.105974 L 824.19299 26.88 L 39.870649 26.88 L 39.870649 425.105974 z " style="fill: none" />
3912
  </g>
3913
  <g id="matplotlib.axis_1">
3914
  <g id="xtick_1">
3915
  <g id="grid-x--1" class="grid grid-x">
3916
- <path d="M 75.521665 425.105974 L 75.521665 26.88 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3917
  </g>
3918
  <g id="line2d_1">
3919
  <defs>
3920
  <path id="mafb3703e5b" d="M 0 0 L 0 3.5 " style="stroke: #000000; stroke-width: 0.8" />
3921
  </defs>
3922
  <g>
3923
- <use ns4:href="#mafb3703e5b" x="75.521665" y="425.105974" style="stroke: #000000; stroke-width: 0.8" />
3924
  </g>
3925
  </g>
3926
  <g id="text_1">
3927
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(21.473848 549.280197) rotate(-45)">cuda_B1_Q100_H8_E256_L4_P4</text>
3928
  </g>
3929
  </g>
3930
  <g id="xtick_2">
3931
  <g id="grid-x--2" class="grid grid-x">
3932
- <path d="M 313.195102 425.105974 L 313.195102 26.88 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3933
  </g>
3934
  <g id="line2d_2">
3935
  <g>
3936
- <use ns4:href="#mafb3703e5b" x="313.195102" y="425.105974" style="stroke: #000000; stroke-width: 0.8" />
3937
  </g>
3938
  </g>
3939
  <g id="text_2">
3940
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(259.147284 549.280197) rotate(-45)">cuda_B1_Q300_H8_E256_L4_P4</text>
3941
  </g>
3942
  </g>
3943
  <g id="xtick_3">
3944
  <g id="grid-x--3" class="grid grid-x">
3945
- <path d="M 550.868538 425.105974 L 550.868538 26.88 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3946
  </g>
3947
  <g id="line2d_3">
3948
  <g>
3949
- <use ns4:href="#mafb3703e5b" x="550.868538" y="425.105974" style="stroke: #000000; stroke-width: 0.8" />
3950
  </g>
3951
  </g>
3952
  <g id="text_3">
3953
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(496.820721 549.280197) rotate(-45)">cuda_B2_Q100_H8_E256_L4_P4</text>
3954
  </g>
3955
  </g>
3956
  <g id="xtick_4">
3957
  <g id="grid-x--4" class="grid grid-x">
3958
- <path d="M 788.541975 425.105974 L 788.541975 26.88 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3959
  </g>
3960
  <g id="line2d_4">
3961
  <g>
3962
- <use ns4:href="#mafb3703e5b" x="788.541975" y="425.105974" style="stroke: #000000; stroke-width: 0.8" />
3963
  </g>
3964
  </g>
3965
  <g id="text_4">
3966
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(734.494157 549.280197) rotate(-45)">cuda_B2_Q300_H8_E256_L4_P4</text>
3967
  </g>
3968
  </g>
3969
  <g id="label--x" class="xlabel">
3970
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="432.03182" y="562.545859" transform="rotate(-0 432.03182 562.545859)">Workload</text>
3971
  </g>
3972
  </g>
3973
  <g id="matplotlib.axis_2">
3974
  <g id="ytick_1">
3975
  <g id="grid-y--2" class="grid grid-y">
3976
- <path d="M 39.870649 410.033467 L 824.19299 410.033467 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3977
  </g>
3978
  <g id="line2d_5">
3979
  <defs>
3980
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
3981
  </defs>
3982
  <g>
3983
- <use ns4:href="#m0fca2865ba" x="39.870649" y="410.033467" style="stroke: #000000; stroke-width: 0.8" />
3984
  </g>
3985
  </g>
3986
  <g id="text_5">
3987
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="32.870649" y="413.832686" transform="rotate(-0 32.870649 413.832686)">0</text>
3988
  </g>
3989
  </g>
3990
  <g id="ytick_2">
3991
  <g id="grid-y--3" class="grid grid-y">
3992
- <path d="M 39.870649 321.862464 L 824.19299 321.862464 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3993
  </g>
3994
  <g id="line2d_6">
3995
  <g>
3996
- <use ns4:href="#m0fca2865ba" x="39.870649" y="321.862464" style="stroke: #000000; stroke-width: 0.8" />
3997
  </g>
3998
  </g>
3999
  <g id="text_6">
4000
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="32.870649" y="325.661683" transform="rotate(-0 32.870649 325.661683)">1</text>
4001
  </g>
4002
  </g>
4003
  <g id="ytick_3">
4004
  <g id="grid-y--4" class="grid grid-y">
4005
- <path d="M 39.870649 233.691462 L 824.19299 233.691462 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4006
  </g>
4007
  <g id="line2d_7">
4008
  <g>
4009
- <use ns4:href="#m0fca2865ba" x="39.870649" y="233.691462" style="stroke: #000000; stroke-width: 0.8" />
4010
  </g>
4011
  </g>
4012
  <g id="text_7">
4013
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="32.870649" y="237.49068" transform="rotate(-0 32.870649 237.49068)">2</text>
4014
  </g>
4015
  </g>
4016
  <g id="ytick_4">
4017
  <g id="grid-y--5" class="grid grid-y">
4018
- <path d="M 39.870649 145.520459 L 824.19299 145.520459 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4019
  </g>
4020
  <g id="line2d_8">
4021
  <g>
4022
- <use ns4:href="#m0fca2865ba" x="39.870649" y="145.520459" style="stroke: #000000; stroke-width: 0.8" />
4023
  </g>
4024
  </g>
4025
  <g id="text_8">
4026
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="32.870649" y="149.319678" transform="rotate(-0 32.870649 149.319678)">3</text>
4027
  </g>
4028
  </g>
4029
  <g id="ytick_5">
4030
  <g id="grid-y--6" class="grid grid-y">
4031
- <path d="M 39.870649 57.349456 L 824.19299 57.349456 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4032
  </g>
4033
  <g id="line2d_9">
4034
  <g>
4035
- <use ns4:href="#m0fca2865ba" x="39.870649" y="57.349456" style="stroke: #000000; stroke-width: 0.8" />
4036
  </g>
4037
  </g>
4038
  <g id="text_9">
4039
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="32.870649" y="61.148675" transform="rotate(-0 32.870649 61.148675)">4</text>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4040
  </g>
4041
  </g>
4042
  <g id="label--y" class="ylabel">
4043
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="20.428462" y="225.992987" transform="rotate(-90 20.428462 225.992987)">Latency P50 (ms)</text>
4044
  </g>
4045
  </g>
4046
  <g id="series--hf-kernels-deformable-detr" class="series">
4047
- <path d="M 75.521665 407.004793 L 313.195102 406.116823 L 550.868538 406.08517 L 788.541975 405.961642 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4048
  <defs>
4049
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4050
  </defs>
4051
- <g clip-path="url(#pbac879f81a)">
4052
- <use ns4:href="#md7efaf3aec" x="75.521665" y="407.004793" style="fill: #1f77b4; stroke: #1f77b4" />
4053
- <use ns4:href="#md7efaf3aec" x="313.195102" y="406.116823" style="fill: #1f77b4; stroke: #1f77b4" />
4054
- <use ns4:href="#md7efaf3aec" x="550.868538" y="406.08517" style="fill: #1f77b4; stroke: #1f77b4" />
4055
- <use ns4:href="#md7efaf3aec" x="788.541975" y="405.961642" style="fill: #1f77b4; stroke: #1f77b4" />
4056
  </g>
4057
  </g>
4058
  <g id="series--torch-eager" class="series">
4059
- <path d="M 75.521665 120.81997 L 313.195102 56.405586 L 550.868538 54.693305 L 788.541975 44.981181 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4060
  <defs>
4061
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4062
  </defs>
4063
- <g clip-path="url(#pbac879f81a)">
4064
- <use ns4:href="#m9b8c54d372" x="75.521665" y="120.81997" style="fill: #ff7f0e; stroke: #ff7f0e" />
4065
- <use ns4:href="#m9b8c54d372" x="313.195102" y="56.405586" style="fill: #ff7f0e; stroke: #ff7f0e" />
4066
- <use ns4:href="#m9b8c54d372" x="550.868538" y="54.693305" style="fill: #ff7f0e; stroke: #ff7f0e" />
4067
- <use ns4:href="#m9b8c54d372" x="788.541975" y="44.981181" style="fill: #ff7f0e; stroke: #ff7f0e" />
4068
  </g>
4069
  </g>
4070
  <g id="patch_3">
4071
- <path d="M 39.870649 425.105974 L 39.870649 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4072
  </g>
4073
  <g id="patch_4">
4074
  <path d="M 824.19299 425.105974 L 824.19299 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4075
  </g>
4076
  <g id="patch_5">
4077
- <path d="M 39.870649 425.105974 L 824.19299 425.105974 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4078
  </g>
4079
  <g id="patch_6">
4080
- <path d="M 39.870649 26.88 L 824.19299 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4081
  </g>
4082
- <g id="text_10">
4083
- <text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="432.03182" y="20.88" transform="rotate(-0 432.03182 20.88)">Attention Implementation Latency</text>
4084
  </g>
4085
  <g id="legend" class="legend">
4086
  <g id="patch_7">
4087
- <path d="M 46.870649 64.7925 L 217.481587 64.7925 Q 219.481587 64.7925 219.481587 62.7925 L 219.481587 33.88 Q 219.481587 31.88 217.481587 31.88 L 46.870649 31.88 Q 44.870649 31.88 44.870649 33.88 L 44.870649 62.7925 Q 44.870649 64.7925 46.870649 64.7925 L 46.870649 64.7925 z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
4088
  </g>
4089
- <g id="line2d_10">
4090
- <path d="M 48.870649 39.978438 L 58.870649 39.978438 L 68.870649 39.978438 " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4091
  <g>
4092
- <use ns4:href="#md7efaf3aec" x="58.870649" y="39.978438" style="fill: #1f77b4; stroke: #1f77b4" />
4093
  </g>
4094
  </g>
4095
  <g id="legend-label--hf-kernels-deformable-detr" class="legend">
4096
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="76.870649" y="43.478438" transform="rotate(-0 76.870649 43.478438)">hf_kernels_deformable_detr</text>
4097
  </g>
4098
- <g id="line2d_11">
4099
- <path d="M 48.870649 54.934687 L 58.870649 54.934687 L 68.870649 54.934687 " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4100
  <g>
4101
- <use ns4:href="#m9b8c54d372" x="58.870649" y="54.934687" style="fill: #ff7f0e; stroke: #ff7f0e" />
4102
  </g>
4103
  </g>
4104
  <g id="legend-label--torch-eager" class="legend">
4105
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="76.870649" y="58.434687" transform="rotate(-0 76.870649 58.434687)">torch_eager</text>
4106
  </g>
4107
  </g>
4108
  </g>
4109
  </g>
4110
  <defs>
4111
- <clipPath id="pbac879f81a">
4112
- <rect x="39.870649" y="26.88" width="784.322341" height="398.225974" />
4113
  </clipPath>
4114
  </defs>
4115
  </svg>
@@ -4122,7 +4174,7 @@ body[data-tool="eraser"] .main-content {
4122
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
4123
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
4124
  </span> |
4125
- Cell: combine | 4.45s
4126
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
4127
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
4128
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
@@ -4209,14 +4261,14 @@ Summary: 2 found, 0 skipped, 0 missing
4209
  COMBINED BENCHMARK SUMMARY
4210
 
4211
  impl wl p50(ms) ok
4212
- hf_kernels_deformable_detr cuda_B1_Q100_H8_E256_L4_P4 0.03 True
4213
  hf_kernels_deformable_detr cuda_B1_Q300_H8_E256_L4_P4 0.04 True
4214
  hf_kernels_deformable_detr cuda_B2_Q100_H8_E256_L4_P4 0.04 True
4215
  hf_kernels_deformable_detr cuda_B2_Q300_H8_E256_L4_P4 0.05 True
4216
- torch_eager cuda_B1_Q100_H8_E256_L4_P4 3.28 True
4217
- torch_eager cuda_B1_Q300_H8_E256_L4_P4 4.01 True
4218
- torch_eager cuda_B2_Q100_H8_E256_L4_P4 4.03 True
4219
- torch_eager cuda_B2_Q300_H8_E256_L4_P4 4.14 True
4220
 
4221
  GENERATING COMBINED VISUALIZATION
4222
 
@@ -4236,7 +4288,7 @@ Implementations included:
4236
  <div class="uv-install-logs" id="uv-logs-combine">
4237
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4238
  <div class="uv-logs-content" style="display: none;">
4239
- Installed 37 packages in 288ms
4240
  </div>
4241
  </div>
4242
  <div class="cell-artifacts">
@@ -4249,11 +4301,11 @@ Installed 37 packages in 288ms
4249
  <rdf:RDF>
4250
  <ns2:Work>
4251
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4252
- <dc:date>2025-11-10T22:11:56.387573</dc:date>
4253
  <dc:format>image/svg+xml</dc:format>
4254
  <dc:creator>
4255
  <ns2:Agent>
4256
- <dc:title>Matplotlib v3.10.7, https://matplotlib.org/</dc:title>
4257
  </ns2:Agent>
4258
  </dc:creator>
4259
  </ns2:Work>
@@ -4268,208 +4320,260 @@ Installed 37 packages in 288ms
4268
  </g>
4269
  <g id="axes--1" class="axes">
4270
  <g id="patch_2">
4271
- <path d="M 39.870649 425.105974 L 824.19299 425.105974 L 824.19299 26.88 L 39.870649 26.88 L 39.870649 425.105974 z " style="fill: none" />
4272
  </g>
4273
  <g id="matplotlib.axis_1">
4274
  <g id="xtick_1">
4275
  <g id="grid-x--1" class="grid grid-x">
4276
- <path d="M 75.521665 425.105974 L 75.521665 26.88 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4277
  </g>
4278
  <g id="line2d_1">
4279
  <defs>
4280
  <path id="mafb3703e5b" d="M 0 0 L 0 3.5 " style="stroke: #000000; stroke-width: 0.8" />
4281
  </defs>
4282
  <g>
4283
- <use ns4:href="#mafb3703e5b" x="75.521665" y="425.105974" style="stroke: #000000; stroke-width: 0.8" />
4284
  </g>
4285
  </g>
4286
  <g id="text_1">
4287
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(21.473848 549.280197) rotate(-45)">cuda_B1_Q100_H8_E256_L4_P4</text>
4288
  </g>
4289
  </g>
4290
  <g id="xtick_2">
4291
  <g id="grid-x--2" class="grid grid-x">
4292
- <path d="M 313.195102 425.105974 L 313.195102 26.88 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4293
  </g>
4294
  <g id="line2d_2">
4295
  <g>
4296
- <use ns4:href="#mafb3703e5b" x="313.195102" y="425.105974" style="stroke: #000000; stroke-width: 0.8" />
4297
  </g>
4298
  </g>
4299
  <g id="text_2">
4300
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(259.147284 549.280197) rotate(-45)">cuda_B1_Q300_H8_E256_L4_P4</text>
4301
  </g>
4302
  </g>
4303
  <g id="xtick_3">
4304
  <g id="grid-x--3" class="grid grid-x">
4305
- <path d="M 550.868538 425.105974 L 550.868538 26.88 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4306
  </g>
4307
  <g id="line2d_3">
4308
  <g>
4309
- <use ns4:href="#mafb3703e5b" x="550.868538" y="425.105974" style="stroke: #000000; stroke-width: 0.8" />
4310
  </g>
4311
  </g>
4312
  <g id="text_3">
4313
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(496.820721 549.280197) rotate(-45)">cuda_B2_Q100_H8_E256_L4_P4</text>
4314
  </g>
4315
  </g>
4316
  <g id="xtick_4">
4317
  <g id="grid-x--4" class="grid grid-x">
4318
- <path d="M 788.541975 425.105974 L 788.541975 26.88 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4319
  </g>
4320
  <g id="line2d_4">
4321
  <g>
4322
- <use ns4:href="#mafb3703e5b" x="788.541975" y="425.105974" style="stroke: #000000; stroke-width: 0.8" />
4323
  </g>
4324
  </g>
4325
  <g id="text_4">
4326
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(734.494157 549.280197) rotate(-45)">cuda_B2_Q300_H8_E256_L4_P4</text>
4327
  </g>
4328
  </g>
4329
  <g id="label--x" class="xlabel">
4330
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="432.03182" y="562.545859" transform="rotate(-0 432.03182 562.545859)">Workload</text>
4331
  </g>
4332
  </g>
4333
  <g id="matplotlib.axis_2">
4334
  <g id="ytick_1">
4335
  <g id="grid-y--2" class="grid grid-y">
4336
- <path d="M 39.870649 410.033467 L 824.19299 410.033467 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4337
  </g>
4338
  <g id="line2d_5">
4339
  <defs>
4340
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4341
  </defs>
4342
  <g>
4343
- <use ns4:href="#m0fca2865ba" x="39.870649" y="410.033467" style="stroke: #000000; stroke-width: 0.8" />
4344
  </g>
4345
  </g>
4346
  <g id="text_5">
4347
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="32.870649" y="413.832686" transform="rotate(-0 32.870649 413.832686)">0</text>
4348
  </g>
4349
  </g>
4350
  <g id="ytick_2">
4351
  <g id="grid-y--3" class="grid grid-y">
4352
- <path d="M 39.870649 321.862464 L 824.19299 321.862464 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4353
  </g>
4354
  <g id="line2d_6">
4355
  <g>
4356
- <use ns4:href="#m0fca2865ba" x="39.870649" y="321.862464" style="stroke: #000000; stroke-width: 0.8" />
4357
  </g>
4358
  </g>
4359
  <g id="text_6">
4360
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="32.870649" y="325.661683" transform="rotate(-0 32.870649 325.661683)">1</text>
4361
  </g>
4362
  </g>
4363
  <g id="ytick_3">
4364
  <g id="grid-y--4" class="grid grid-y">
4365
- <path d="M 39.870649 233.691462 L 824.19299 233.691462 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4366
  </g>
4367
  <g id="line2d_7">
4368
  <g>
4369
- <use ns4:href="#m0fca2865ba" x="39.870649" y="233.691462" style="stroke: #000000; stroke-width: 0.8" />
4370
  </g>
4371
  </g>
4372
  <g id="text_7">
4373
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="32.870649" y="237.49068" transform="rotate(-0 32.870649 237.49068)">2</text>
4374
  </g>
4375
  </g>
4376
  <g id="ytick_4">
4377
  <g id="grid-y--5" class="grid grid-y">
4378
- <path d="M 39.870649 145.520459 L 824.19299 145.520459 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4379
  </g>
4380
  <g id="line2d_8">
4381
  <g>
4382
- <use ns4:href="#m0fca2865ba" x="39.870649" y="145.520459" style="stroke: #000000; stroke-width: 0.8" />
4383
  </g>
4384
  </g>
4385
  <g id="text_8">
4386
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="32.870649" y="149.319678" transform="rotate(-0 32.870649 149.319678)">3</text>
4387
  </g>
4388
  </g>
4389
  <g id="ytick_5">
4390
  <g id="grid-y--6" class="grid grid-y">
4391
- <path d="M 39.870649 57.349456 L 824.19299 57.349456 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4392
  </g>
4393
  <g id="line2d_9">
4394
  <g>
4395
- <use ns4:href="#m0fca2865ba" x="39.870649" y="57.349456" style="stroke: #000000; stroke-width: 0.8" />
4396
  </g>
4397
  </g>
4398
  <g id="text_9">
4399
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="32.870649" y="61.148675" transform="rotate(-0 32.870649 61.148675)">4</text>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4400
  </g>
4401
  </g>
4402
  <g id="label--y" class="ylabel">
4403
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="20.428462" y="225.992987" transform="rotate(-90 20.428462 225.992987)">Latency P50 (ms)</text>
4404
  </g>
4405
  </g>
4406
  <g id="series--hf-kernels-deformable-detr" class="series">
4407
- <path d="M 75.521665 407.004793 L 313.195102 406.116823 L 550.868538 406.08517 L 788.541975 405.961642 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4408
  <defs>
4409
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4410
  </defs>
4411
- <g clip-path="url(#pbac879f81a)">
4412
- <use ns4:href="#md7efaf3aec" x="75.521665" y="407.004793" style="fill: #1f77b4; stroke: #1f77b4" />
4413
- <use ns4:href="#md7efaf3aec" x="313.195102" y="406.116823" style="fill: #1f77b4; stroke: #1f77b4" />
4414
- <use ns4:href="#md7efaf3aec" x="550.868538" y="406.08517" style="fill: #1f77b4; stroke: #1f77b4" />
4415
- <use ns4:href="#md7efaf3aec" x="788.541975" y="405.961642" style="fill: #1f77b4; stroke: #1f77b4" />
4416
  </g>
4417
  </g>
4418
  <g id="series--torch-eager" class="series">
4419
- <path d="M 75.521665 120.81997 L 313.195102 56.405586 L 550.868538 54.693305 L 788.541975 44.981181 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4420
  <defs>
4421
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4422
  </defs>
4423
- <g clip-path="url(#pbac879f81a)">
4424
- <use ns4:href="#m9b8c54d372" x="75.521665" y="120.81997" style="fill: #ff7f0e; stroke: #ff7f0e" />
4425
- <use ns4:href="#m9b8c54d372" x="313.195102" y="56.405586" style="fill: #ff7f0e; stroke: #ff7f0e" />
4426
- <use ns4:href="#m9b8c54d372" x="550.868538" y="54.693305" style="fill: #ff7f0e; stroke: #ff7f0e" />
4427
- <use ns4:href="#m9b8c54d372" x="788.541975" y="44.981181" style="fill: #ff7f0e; stroke: #ff7f0e" />
4428
  </g>
4429
  </g>
4430
  <g id="patch_3">
4431
- <path d="M 39.870649 425.105974 L 39.870649 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4432
  </g>
4433
  <g id="patch_4">
4434
  <path d="M 824.19299 425.105974 L 824.19299 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4435
  </g>
4436
  <g id="patch_5">
4437
- <path d="M 39.870649 425.105974 L 824.19299 425.105974 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4438
  </g>
4439
  <g id="patch_6">
4440
- <path d="M 39.870649 26.88 L 824.19299 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4441
  </g>
4442
- <g id="text_10">
4443
- <text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="432.03182" y="20.88" transform="rotate(-0 432.03182 20.88)">Attention Implementation Latency</text>
4444
  </g>
4445
  <g id="legend" class="legend">
4446
  <g id="patch_7">
4447
- <path d="M 46.870649 64.7925 L 217.481587 64.7925 Q 219.481587 64.7925 219.481587 62.7925 L 219.481587 33.88 Q 219.481587 31.88 217.481587 31.88 L 46.870649 31.88 Q 44.870649 31.88 44.870649 33.88 L 44.870649 62.7925 Q 44.870649 64.7925 46.870649 64.7925 L 46.870649 64.7925 z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
4448
  </g>
4449
- <g id="line2d_10">
4450
- <path d="M 48.870649 39.978438 L 58.870649 39.978438 L 68.870649 39.978438 " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4451
  <g>
4452
- <use ns4:href="#md7efaf3aec" x="58.870649" y="39.978438" style="fill: #1f77b4; stroke: #1f77b4" />
4453
  </g>
4454
  </g>
4455
  <g id="legend-label--hf-kernels-deformable-detr" class="legend">
4456
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="76.870649" y="43.478438" transform="rotate(-0 76.870649 43.478438)">hf_kernels_deformable_detr</text>
4457
  </g>
4458
- <g id="line2d_11">
4459
- <path d="M 48.870649 54.934687 L 58.870649 54.934687 L 68.870649 54.934687 " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4460
  <g>
4461
- <use ns4:href="#m9b8c54d372" x="58.870649" y="54.934687" style="fill: #ff7f0e; stroke: #ff7f0e" />
4462
  </g>
4463
  </g>
4464
  <g id="legend-label--torch-eager" class="legend">
4465
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="76.870649" y="58.434687" transform="rotate(-0 76.870649 58.434687)">torch_eager</text>
4466
  </g>
4467
  </g>
4468
  </g>
4469
  </g>
4470
  <defs>
4471
- <clipPath id="pbac879f81a">
4472
- <rect x="39.870649" y="26.88" width="784.322341" height="398.225974" />
4473
  </clipPath>
4474
  </defs>
4475
  </svg>
 
3874
  <div class="system-info">
3875
  <div class="system-info-header">Generated on:</div>
3876
  <div class="system-info-content">
3877
+ Linux x86_64 | Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35
3878
  </div>
3879
  </div>
3880
 
 
3889
  <rdf:RDF>
3890
  <ns2:Work>
3891
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
3892
+ <dc:date>2025-12-19T19:10:04.668129</dc:date>
3893
  <dc:format>image/svg+xml</dc:format>
3894
  <dc:creator>
3895
  <ns2:Agent>
3896
+ <dc:title>Matplotlib v3.10.8, https://matplotlib.org/</dc:title>
3897
  </ns2:Agent>
3898
  </dc:creator>
3899
  </ns2:Work>
 
3908
  </g>
3909
  <g id="axes--1" class="axes">
3910
  <g id="patch_2">
3911
+ <path d="M 47.72 425.105974 L 824.19299 425.105974 L 824.19299 26.88 L 47.72 26.88 L 47.72 425.105974 z " style="fill: none" />
3912
  </g>
3913
  <g id="matplotlib.axis_1">
3914
  <g id="xtick_1">
3915
  <g id="grid-x--1" class="grid grid-x">
3916
+ <path d="M 83.014227 425.105974 L 83.014227 26.88 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3917
  </g>
3918
  <g id="line2d_1">
3919
  <defs>
3920
  <path id="mafb3703e5b" d="M 0 0 L 0 3.5 " style="stroke: #000000; stroke-width: 0.8" />
3921
  </defs>
3922
  <g>
3923
+ <use ns4:href="#mafb3703e5b" x="83.014227" y="425.105974" style="stroke: #000000; stroke-width: 0.8" />
3924
  </g>
3925
  </g>
3926
  <g id="text_1">
3927
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(28.96641 549.280197) rotate(-45)">cuda_B1_Q100_H8_E256_L4_P4</text>
3928
  </g>
3929
  </g>
3930
  <g id="xtick_2">
3931
  <g id="grid-x--2" class="grid grid-x">
3932
+ <path d="M 318.309072 425.105974 L 318.309072 26.88 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3933
  </g>
3934
  <g id="line2d_2">
3935
  <g>
3936
+ <use ns4:href="#mafb3703e5b" x="318.309072" y="425.105974" style="stroke: #000000; stroke-width: 0.8" />
3937
  </g>
3938
  </g>
3939
  <g id="text_2">
3940
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(264.261255 549.280197) rotate(-45)">cuda_B1_Q300_H8_E256_L4_P4</text>
3941
  </g>
3942
  </g>
3943
  <g id="xtick_3">
3944
  <g id="grid-x--3" class="grid grid-x">
3945
+ <path d="M 553.603918 425.105974 L 553.603918 26.88 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3946
  </g>
3947
  <g id="line2d_3">
3948
  <g>
3949
+ <use ns4:href="#mafb3703e5b" x="553.603918" y="425.105974" style="stroke: #000000; stroke-width: 0.8" />
3950
  </g>
3951
  </g>
3952
  <g id="text_3">
3953
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(499.556101 549.280197) rotate(-45)">cuda_B2_Q100_H8_E256_L4_P4</text>
3954
  </g>
3955
  </g>
3956
  <g id="xtick_4">
3957
  <g id="grid-x--4" class="grid grid-x">
3958
+ <path d="M 788.898763 425.105974 L 788.898763 26.88 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3959
  </g>
3960
  <g id="line2d_4">
3961
  <g>
3962
+ <use ns4:href="#mafb3703e5b" x="788.898763" y="425.105974" style="stroke: #000000; stroke-width: 0.8" />
3963
  </g>
3964
  </g>
3965
  <g id="text_4">
3966
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(734.850946 549.280197) rotate(-45)">cuda_B2_Q300_H8_E256_L4_P4</text>
3967
  </g>
3968
  </g>
3969
  <g id="label--x" class="xlabel">
3970
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="435.956495" y="562.545859" transform="rotate(-0 435.956495 562.545859)">Workload</text>
3971
  </g>
3972
  </g>
3973
  <g id="matplotlib.axis_2">
3974
  <g id="ytick_1">
3975
  <g id="grid-y--2" class="grid grid-y">
3976
+ <path d="M 47.72 410.313695 L 824.19299 410.313695 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3977
  </g>
3978
  <g id="line2d_5">
3979
  <defs>
3980
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
3981
  </defs>
3982
  <g>
3983
+ <use ns4:href="#m0fca2865ba" x="47.72" y="410.313695" style="stroke: #000000; stroke-width: 0.8" />
3984
  </g>
3985
  </g>
3986
  <g id="text_5">
3987
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="414.112914" transform="rotate(-0 40.72 414.112914)">0.0</text>
3988
  </g>
3989
  </g>
3990
  <g id="ytick_2">
3991
  <g id="grid-y--3" class="grid grid-y">
3992
+ <path d="M 47.72 365.88698 L 824.19299 365.88698 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3993
  </g>
3994
  <g id="line2d_6">
3995
  <g>
3996
+ <use ns4:href="#m0fca2865ba" x="47.72" y="365.88698" style="stroke: #000000; stroke-width: 0.8" />
3997
  </g>
3998
  </g>
3999
  <g id="text_6">
4000
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="369.686199" transform="rotate(-0 40.72 369.686199)">0.5</text>
4001
  </g>
4002
  </g>
4003
  <g id="ytick_3">
4004
  <g id="grid-y--4" class="grid grid-y">
4005
+ <path d="M 47.72 321.460266 L 824.19299 321.460266 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4006
  </g>
4007
  <g id="line2d_7">
4008
  <g>
4009
+ <use ns4:href="#m0fca2865ba" x="47.72" y="321.460266" style="stroke: #000000; stroke-width: 0.8" />
4010
  </g>
4011
  </g>
4012
  <g id="text_7">
4013
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="325.259484" transform="rotate(-0 40.72 325.259484)">1.0</text>
4014
  </g>
4015
  </g>
4016
  <g id="ytick_4">
4017
  <g id="grid-y--5" class="grid grid-y">
4018
+ <path d="M 47.72 277.033551 L 824.19299 277.033551 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4019
  </g>
4020
  <g id="line2d_8">
4021
  <g>
4022
+ <use ns4:href="#m0fca2865ba" x="47.72" y="277.033551" style="stroke: #000000; stroke-width: 0.8" />
4023
  </g>
4024
  </g>
4025
  <g id="text_8">
4026
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="280.83277" transform="rotate(-0 40.72 280.83277)">1.5</text>
4027
  </g>
4028
  </g>
4029
  <g id="ytick_5">
4030
  <g id="grid-y--6" class="grid grid-y">
4031
+ <path d="M 47.72 232.606836 L 824.19299 232.606836 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4032
  </g>
4033
  <g id="line2d_9">
4034
  <g>
4035
+ <use ns4:href="#m0fca2865ba" x="47.72" y="232.606836" style="stroke: #000000; stroke-width: 0.8" />
4036
  </g>
4037
  </g>
4038
  <g id="text_9">
4039
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="236.406055" transform="rotate(-0 40.72 236.406055)">2.0</text>
4040
+ </g>
4041
+ </g>
4042
+ <g id="ytick_6">
4043
+ <g id="grid-y--7" class="grid grid-y">
4044
+ <path d="M 47.72 188.180122 L 824.19299 188.180122 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4045
+ </g>
4046
+ <g id="line2d_10">
4047
+ <g>
4048
+ <use ns4:href="#m0fca2865ba" x="47.72" y="188.180122" style="stroke: #000000; stroke-width: 0.8" />
4049
+ </g>
4050
+ </g>
4051
+ <g id="text_10">
4052
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="191.97934" transform="rotate(-0 40.72 191.97934)">2.5</text>
4053
+ </g>
4054
+ </g>
4055
+ <g id="ytick_7">
4056
+ <g id="grid-y--8" class="grid grid-y">
4057
+ <path d="M 47.72 143.753407 L 824.19299 143.753407 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4058
+ </g>
4059
+ <g id="line2d_11">
4060
+ <g>
4061
+ <use ns4:href="#m0fca2865ba" x="47.72" y="143.753407" style="stroke: #000000; stroke-width: 0.8" />
4062
+ </g>
4063
+ </g>
4064
+ <g id="text_11">
4065
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="147.552626" transform="rotate(-0 40.72 147.552626)">3.0</text>
4066
+ </g>
4067
+ </g>
4068
+ <g id="ytick_8">
4069
+ <g id="grid-y--9" class="grid grid-y">
4070
+ <path d="M 47.72 99.326692 L 824.19299 99.326692 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4071
+ </g>
4072
+ <g id="line2d_12">
4073
+ <g>
4074
+ <use ns4:href="#m0fca2865ba" x="47.72" y="99.326692" style="stroke: #000000; stroke-width: 0.8" />
4075
+ </g>
4076
+ </g>
4077
+ <g id="text_12">
4078
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="103.125911" transform="rotate(-0 40.72 103.125911)">3.5</text>
4079
+ </g>
4080
+ </g>
4081
+ <g id="ytick_9">
4082
+ <g id="grid-y--10" class="grid grid-y">
4083
+ <path d="M 47.72 54.899978 L 824.19299 54.899978 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4084
+ </g>
4085
+ <g id="line2d_13">
4086
+ <g>
4087
+ <use ns4:href="#m0fca2865ba" x="47.72" y="54.899978" style="stroke: #000000; stroke-width: 0.8" />
4088
+ </g>
4089
+ </g>
4090
+ <g id="text_13">
4091
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="58.699197" transform="rotate(-0 40.72 58.699197)">4.0</text>
4092
  </g>
4093
  </g>
4094
  <g id="label--y" class="ylabel">
4095
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="18.737188" y="225.992987" transform="rotate(-90 18.737188 225.992987)">Latency P50 (ms)</text>
4096
  </g>
4097
  </g>
4098
  <g id="series--hf-kernels-deformable-detr" class="series">
4099
+ <path d="M 83.014227 407.004793 L 318.309072 406.541778 L 553.603918 406.347278 L 788.898763 406.283214 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4100
  <defs>
4101
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4102
  </defs>
4103
+ <g clip-path="url(#pb5c8282ea4)">
4104
+ <use ns4:href="#md7efaf3aec" x="83.014227" y="407.004793" style="fill: #1f77b4; stroke: #1f77b4" />
4105
+ <use ns4:href="#md7efaf3aec" x="318.309072" y="406.541778" style="fill: #1f77b4; stroke: #1f77b4" />
4106
+ <use ns4:href="#md7efaf3aec" x="553.603918" y="406.347278" style="fill: #1f77b4; stroke: #1f77b4" />
4107
+ <use ns4:href="#md7efaf3aec" x="788.898763" y="406.283214" style="fill: #1f77b4; stroke: #1f77b4" />
4108
  </g>
4109
  </g>
4110
  <g id="series--torch-eager" class="series">
4111
+ <path d="M 83.014227 118.130211 L 318.309072 48.708671 L 553.603918 48.49098 L 788.898763 44.981181 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4112
  <defs>
4113
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4114
  </defs>
4115
+ <g clip-path="url(#pb5c8282ea4)">
4116
+ <use ns4:href="#m9b8c54d372" x="83.014227" y="118.130211" style="fill: #ff7f0e; stroke: #ff7f0e" />
4117
+ <use ns4:href="#m9b8c54d372" x="318.309072" y="48.708671" style="fill: #ff7f0e; stroke: #ff7f0e" />
4118
+ <use ns4:href="#m9b8c54d372" x="553.603918" y="48.49098" style="fill: #ff7f0e; stroke: #ff7f0e" />
4119
+ <use ns4:href="#m9b8c54d372" x="788.898763" y="44.981181" style="fill: #ff7f0e; stroke: #ff7f0e" />
4120
  </g>
4121
  </g>
4122
  <g id="patch_3">
4123
+ <path d="M 47.72 425.105974 L 47.72 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4124
  </g>
4125
  <g id="patch_4">
4126
  <path d="M 824.19299 425.105974 L 824.19299 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4127
  </g>
4128
  <g id="patch_5">
4129
+ <path d="M 47.72 425.105974 L 824.19299 425.105974 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4130
  </g>
4131
  <g id="patch_6">
4132
+ <path d="M 47.72 26.88 L 824.19299 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4133
  </g>
4134
+ <g id="text_14">
4135
+ <text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="435.956495" y="20.88" transform="rotate(-0 435.956495 20.88)">Attention Implementation Latency</text>
4136
  </g>
4137
  <g id="legend" class="legend">
4138
  <g id="patch_7">
4139
+ <path d="M 54.72 64.7925 L 225.330938 64.7925 Q 227.330938 64.7925 227.330938 62.7925 L 227.330938 33.88 Q 227.330938 31.88 225.330938 31.88 L 54.72 31.88 Q 52.72 31.88 52.72 33.88 L 52.72 62.7925 Q 52.72 64.7925 54.72 64.7925 L 54.72 64.7925 z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
4140
  </g>
4141
+ <g id="line2d_14">
4142
+ <path d="M 56.72 39.978438 L 66.72 39.978438 L 76.72 39.978438 " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4143
  <g>
4144
+ <use ns4:href="#md7efaf3aec" x="66.72" y="39.978438" style="fill: #1f77b4; stroke: #1f77b4" />
4145
  </g>
4146
  </g>
4147
  <g id="legend-label--hf-kernels-deformable-detr" class="legend">
4148
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="84.72" y="43.478438" transform="rotate(-0 84.72 43.478438)">hf_kernels_deformable_detr</text>
4149
  </g>
4150
+ <g id="line2d_15">
4151
+ <path d="M 56.72 54.934687 L 66.72 54.934687 L 76.72 54.934687 " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4152
  <g>
4153
+ <use ns4:href="#m9b8c54d372" x="66.72" y="54.934687" style="fill: #ff7f0e; stroke: #ff7f0e" />
4154
  </g>
4155
  </g>
4156
  <g id="legend-label--torch-eager" class="legend">
4157
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="84.72" y="58.434687" transform="rotate(-0 84.72 58.434687)">torch_eager</text>
4158
  </g>
4159
  </g>
4160
  </g>
4161
  </g>
4162
  <defs>
4163
+ <clipPath id="pb5c8282ea4">
4164
+ <rect x="47.72" y="26.88" width="776.47299" height="398.225974" />
4165
  </clipPath>
4166
  </defs>
4167
  </svg>
 
4174
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
4175
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
4176
  </span> |
4177
+ Cell: combine | 4.43s
4178
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
4179
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
4180
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
 
4261
  COMBINED BENCHMARK SUMMARY
4262
 
4263
  impl wl p50(ms) ok
4264
+ hf_kernels_deformable_detr cuda_B1_Q100_H8_E256_L4_P4 0.04 True
4265
  hf_kernels_deformable_detr cuda_B1_Q300_H8_E256_L4_P4 0.04 True
4266
  hf_kernels_deformable_detr cuda_B2_Q100_H8_E256_L4_P4 0.04 True
4267
  hf_kernels_deformable_detr cuda_B2_Q300_H8_E256_L4_P4 0.05 True
4268
+ torch_eager cuda_B1_Q100_H8_E256_L4_P4 3.29 True
4269
+ torch_eager cuda_B1_Q300_H8_E256_L4_P4 4.07 True
4270
+ torch_eager cuda_B2_Q100_H8_E256_L4_P4 4.07 True
4271
+ torch_eager cuda_B2_Q300_H8_E256_L4_P4 4.11 True
4272
 
4273
  GENERATING COMBINED VISUALIZATION
4274
 
 
4288
  <div class="uv-install-logs" id="uv-logs-combine">
4289
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4290
  <div class="uv-logs-content" style="display: none;">
4291
+ Installed 37 packages in 282ms
4292
  </div>
4293
  </div>
4294
  <div class="cell-artifacts">
 
4301
  <rdf:RDF>
4302
  <ns2:Work>
4303
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4304
+ <dc:date>2025-12-19T19:10:04.668129</dc:date>
4305
  <dc:format>image/svg+xml</dc:format>
4306
  <dc:creator>
4307
  <ns2:Agent>
4308
+ <dc:title>Matplotlib v3.10.8, https://matplotlib.org/</dc:title>
4309
  </ns2:Agent>
4310
  </dc:creator>
4311
  </ns2:Work>
 
4320
  </g>
4321
  <g id="axes--1" class="axes">
4322
  <g id="patch_2">
4323
+ <path d="M 47.72 425.105974 L 824.19299 425.105974 L 824.19299 26.88 L 47.72 26.88 L 47.72 425.105974 z " style="fill: none" />
4324
  </g>
4325
  <g id="matplotlib.axis_1">
4326
  <g id="xtick_1">
4327
  <g id="grid-x--1" class="grid grid-x">
4328
+ <path d="M 83.014227 425.105974 L 83.014227 26.88 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4329
  </g>
4330
  <g id="line2d_1">
4331
  <defs>
4332
  <path id="mafb3703e5b" d="M 0 0 L 0 3.5 " style="stroke: #000000; stroke-width: 0.8" />
4333
  </defs>
4334
  <g>
4335
+ <use ns4:href="#mafb3703e5b" x="83.014227" y="425.105974" style="stroke: #000000; stroke-width: 0.8" />
4336
  </g>
4337
  </g>
4338
  <g id="text_1">
4339
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(28.96641 549.280197) rotate(-45)">cuda_B1_Q100_H8_E256_L4_P4</text>
4340
  </g>
4341
  </g>
4342
  <g id="xtick_2">
4343
  <g id="grid-x--2" class="grid grid-x">
4344
+ <path d="M 318.309072 425.105974 L 318.309072 26.88 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4345
  </g>
4346
  <g id="line2d_2">
4347
  <g>
4348
+ <use ns4:href="#mafb3703e5b" x="318.309072" y="425.105974" style="stroke: #000000; stroke-width: 0.8" />
4349
  </g>
4350
  </g>
4351
  <g id="text_2">
4352
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(264.261255 549.280197) rotate(-45)">cuda_B1_Q300_H8_E256_L4_P4</text>
4353
  </g>
4354
  </g>
4355
  <g id="xtick_3">
4356
  <g id="grid-x--3" class="grid grid-x">
4357
+ <path d="M 553.603918 425.105974 L 553.603918 26.88 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4358
  </g>
4359
  <g id="line2d_3">
4360
  <g>
4361
+ <use ns4:href="#mafb3703e5b" x="553.603918" y="425.105974" style="stroke: #000000; stroke-width: 0.8" />
4362
  </g>
4363
  </g>
4364
  <g id="text_3">
4365
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(499.556101 549.280197) rotate(-45)">cuda_B2_Q100_H8_E256_L4_P4</text>
4366
  </g>
4367
  </g>
4368
  <g id="xtick_4">
4369
  <g id="grid-x--4" class="grid grid-x">
4370
+ <path d="M 788.898763 425.105974 L 788.898763 26.88 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4371
  </g>
4372
  <g id="line2d_4">
4373
  <g>
4374
+ <use ns4:href="#mafb3703e5b" x="788.898763" y="425.105974" style="stroke: #000000; stroke-width: 0.8" />
4375
  </g>
4376
  </g>
4377
  <g id="text_4">
4378
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(734.850946 549.280197) rotate(-45)">cuda_B2_Q300_H8_E256_L4_P4</text>
4379
  </g>
4380
  </g>
4381
  <g id="label--x" class="xlabel">
4382
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="435.956495" y="562.545859" transform="rotate(-0 435.956495 562.545859)">Workload</text>
4383
  </g>
4384
  </g>
4385
  <g id="matplotlib.axis_2">
4386
  <g id="ytick_1">
4387
  <g id="grid-y--2" class="grid grid-y">
4388
+ <path d="M 47.72 410.313695 L 824.19299 410.313695 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4389
  </g>
4390
  <g id="line2d_5">
4391
  <defs>
4392
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4393
  </defs>
4394
  <g>
4395
+ <use ns4:href="#m0fca2865ba" x="47.72" y="410.313695" style="stroke: #000000; stroke-width: 0.8" />
4396
  </g>
4397
  </g>
4398
  <g id="text_5">
4399
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="414.112914" transform="rotate(-0 40.72 414.112914)">0.0</text>
4400
  </g>
4401
  </g>
4402
  <g id="ytick_2">
4403
  <g id="grid-y--3" class="grid grid-y">
4404
+ <path d="M 47.72 365.88698 L 824.19299 365.88698 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4405
  </g>
4406
  <g id="line2d_6">
4407
  <g>
4408
+ <use ns4:href="#m0fca2865ba" x="47.72" y="365.88698" style="stroke: #000000; stroke-width: 0.8" />
4409
  </g>
4410
  </g>
4411
  <g id="text_6">
4412
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="369.686199" transform="rotate(-0 40.72 369.686199)">0.5</text>
4413
  </g>
4414
  </g>
4415
  <g id="ytick_3">
4416
  <g id="grid-y--4" class="grid grid-y">
4417
+ <path d="M 47.72 321.460266 L 824.19299 321.460266 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4418
  </g>
4419
  <g id="line2d_7">
4420
  <g>
4421
+ <use ns4:href="#m0fca2865ba" x="47.72" y="321.460266" style="stroke: #000000; stroke-width: 0.8" />
4422
  </g>
4423
  </g>
4424
  <g id="text_7">
4425
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="325.259484" transform="rotate(-0 40.72 325.259484)">1.0</text>
4426
  </g>
4427
  </g>
4428
  <g id="ytick_4">
4429
  <g id="grid-y--5" class="grid grid-y">
4430
+ <path d="M 47.72 277.033551 L 824.19299 277.033551 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4431
  </g>
4432
  <g id="line2d_8">
4433
  <g>
4434
+ <use ns4:href="#m0fca2865ba" x="47.72" y="277.033551" style="stroke: #000000; stroke-width: 0.8" />
4435
  </g>
4436
  </g>
4437
  <g id="text_8">
4438
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="280.83277" transform="rotate(-0 40.72 280.83277)">1.5</text>
4439
  </g>
4440
  </g>
4441
  <g id="ytick_5">
4442
  <g id="grid-y--6" class="grid grid-y">
4443
+ <path d="M 47.72 232.606836 L 824.19299 232.606836 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4444
  </g>
4445
  <g id="line2d_9">
4446
  <g>
4447
+ <use ns4:href="#m0fca2865ba" x="47.72" y="232.606836" style="stroke: #000000; stroke-width: 0.8" />
4448
  </g>
4449
  </g>
4450
  <g id="text_9">
4451
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="236.406055" transform="rotate(-0 40.72 236.406055)">2.0</text>
4452
+ </g>
4453
+ </g>
4454
+ <g id="ytick_6">
4455
+ <g id="grid-y--7" class="grid grid-y">
4456
+ <path d="M 47.72 188.180122 L 824.19299 188.180122 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4457
+ </g>
4458
+ <g id="line2d_10">
4459
+ <g>
4460
+ <use ns4:href="#m0fca2865ba" x="47.72" y="188.180122" style="stroke: #000000; stroke-width: 0.8" />
4461
+ </g>
4462
+ </g>
4463
+ <g id="text_10">
4464
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="191.97934" transform="rotate(-0 40.72 191.97934)">2.5</text>
4465
+ </g>
4466
+ </g>
4467
+ <g id="ytick_7">
4468
+ <g id="grid-y--8" class="grid grid-y">
4469
+ <path d="M 47.72 143.753407 L 824.19299 143.753407 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4470
+ </g>
4471
+ <g id="line2d_11">
4472
+ <g>
4473
+ <use ns4:href="#m0fca2865ba" x="47.72" y="143.753407" style="stroke: #000000; stroke-width: 0.8" />
4474
+ </g>
4475
+ </g>
4476
+ <g id="text_11">
4477
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="147.552626" transform="rotate(-0 40.72 147.552626)">3.0</text>
4478
+ </g>
4479
+ </g>
4480
+ <g id="ytick_8">
4481
+ <g id="grid-y--9" class="grid grid-y">
4482
+ <path d="M 47.72 99.326692 L 824.19299 99.326692 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4483
+ </g>
4484
+ <g id="line2d_12">
4485
+ <g>
4486
+ <use ns4:href="#m0fca2865ba" x="47.72" y="99.326692" style="stroke: #000000; stroke-width: 0.8" />
4487
+ </g>
4488
+ </g>
4489
+ <g id="text_12">
4490
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="103.125911" transform="rotate(-0 40.72 103.125911)">3.5</text>
4491
+ </g>
4492
+ </g>
4493
+ <g id="ytick_9">
4494
+ <g id="grid-y--10" class="grid grid-y">
4495
+ <path d="M 47.72 54.899978 L 824.19299 54.899978 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4496
+ </g>
4497
+ <g id="line2d_13">
4498
+ <g>
4499
+ <use ns4:href="#m0fca2865ba" x="47.72" y="54.899978" style="stroke: #000000; stroke-width: 0.8" />
4500
+ </g>
4501
+ </g>
4502
+ <g id="text_13">
4503
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="58.699197" transform="rotate(-0 40.72 58.699197)">4.0</text>
4504
  </g>
4505
  </g>
4506
  <g id="label--y" class="ylabel">
4507
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="18.737188" y="225.992987" transform="rotate(-90 18.737188 225.992987)">Latency P50 (ms)</text>
4508
  </g>
4509
  </g>
4510
  <g id="series--hf-kernels-deformable-detr" class="series">
4511
+ <path d="M 83.014227 407.004793 L 318.309072 406.541778 L 553.603918 406.347278 L 788.898763 406.283214 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4512
  <defs>
4513
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4514
  </defs>
4515
+ <g clip-path="url(#pb5c8282ea4)">
4516
+ <use ns4:href="#md7efaf3aec" x="83.014227" y="407.004793" style="fill: #1f77b4; stroke: #1f77b4" />
4517
+ <use ns4:href="#md7efaf3aec" x="318.309072" y="406.541778" style="fill: #1f77b4; stroke: #1f77b4" />
4518
+ <use ns4:href="#md7efaf3aec" x="553.603918" y="406.347278" style="fill: #1f77b4; stroke: #1f77b4" />
4519
+ <use ns4:href="#md7efaf3aec" x="788.898763" y="406.283214" style="fill: #1f77b4; stroke: #1f77b4" />
4520
  </g>
4521
  </g>
4522
  <g id="series--torch-eager" class="series">
4523
+ <path d="M 83.014227 118.130211 L 318.309072 48.708671 L 553.603918 48.49098 L 788.898763 44.981181 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4524
  <defs>
4525
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4526
  </defs>
4527
+ <g clip-path="url(#pb5c8282ea4)">
4528
+ <use ns4:href="#m9b8c54d372" x="83.014227" y="118.130211" style="fill: #ff7f0e; stroke: #ff7f0e" />
4529
+ <use ns4:href="#m9b8c54d372" x="318.309072" y="48.708671" style="fill: #ff7f0e; stroke: #ff7f0e" />
4530
+ <use ns4:href="#m9b8c54d372" x="553.603918" y="48.49098" style="fill: #ff7f0e; stroke: #ff7f0e" />
4531
+ <use ns4:href="#m9b8c54d372" x="788.898763" y="44.981181" style="fill: #ff7f0e; stroke: #ff7f0e" />
4532
  </g>
4533
  </g>
4534
  <g id="patch_3">
4535
+ <path d="M 47.72 425.105974 L 47.72 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4536
  </g>
4537
  <g id="patch_4">
4538
  <path d="M 824.19299 425.105974 L 824.19299 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4539
  </g>
4540
  <g id="patch_5">
4541
+ <path d="M 47.72 425.105974 L 824.19299 425.105974 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4542
  </g>
4543
  <g id="patch_6">
4544
+ <path d="M 47.72 26.88 L 824.19299 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4545
  </g>
4546
+ <g id="text_14">
4547
+ <text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="435.956495" y="20.88" transform="rotate(-0 435.956495 20.88)">Attention Implementation Latency</text>
4548
  </g>
4549
  <g id="legend" class="legend">
4550
  <g id="patch_7">
4551
+ <path d="M 54.72 64.7925 L 225.330938 64.7925 Q 227.330938 64.7925 227.330938 62.7925 L 227.330938 33.88 Q 227.330938 31.88 225.330938 31.88 L 54.72 31.88 Q 52.72 31.88 52.72 33.88 L 52.72 62.7925 Q 52.72 64.7925 54.72 64.7925 L 54.72 64.7925 z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
4552
  </g>
4553
+ <g id="line2d_14">
4554
+ <path d="M 56.72 39.978438 L 66.72 39.978438 L 76.72 39.978438 " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4555
  <g>
4556
+ <use ns4:href="#md7efaf3aec" x="66.72" y="39.978438" style="fill: #1f77b4; stroke: #1f77b4" />
4557
  </g>
4558
  </g>
4559
  <g id="legend-label--hf-kernels-deformable-detr" class="legend">
4560
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="84.72" y="43.478438" transform="rotate(-0 84.72 43.478438)">hf_kernels_deformable_detr</text>
4561
  </g>
4562
+ <g id="line2d_15">
4563
+ <path d="M 56.72 54.934687 L 66.72 54.934687 L 76.72 54.934687 " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4564
  <g>
4565
+ <use ns4:href="#m9b8c54d372" x="66.72" y="54.934687" style="fill: #ff7f0e; stroke: #ff7f0e" />
4566
  </g>
4567
  </g>
4568
  <g id="legend-label--torch-eager" class="legend">
4569
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="84.72" y="58.434687" transform="rotate(-0 84.72 58.434687)">torch_eager</text>
4570
  </g>
4571
  </g>
4572
  </g>
4573
  </g>
4574
  <defs>
4575
+ <clipPath id="pb5c8282ea4">
4576
+ <rect x="47.72" y="26.88" width="776.47299" height="398.225974" />
4577
  </clipPath>
4578
  </defs>
4579
  </svg>
flash_attn/impls/artifacts/benchmark/attention.jsonl CHANGED
@@ -1,6 +1,6 @@
1
- {"ts": "2025-11-10T22:11:46Z", "run": "8d69ef94c7594eb581f8f1e4fd6b3eef", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L128_bfloat16", "batch": 1, "seq_len": 4224, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.9336540001640969, "p50": 0.938484000016615, "p90": 0.9400730000379554, "mean": 0.9383200000229408, "iqr": 0.00204800016945228, "raw_times": [0.9413640000275336, 0.938484000016615, 0.9380249998685031, 0.9400730000379554, 0.9336540001640969], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.9436739999273414, "peak_bytes": 295567360, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003604888916015625, "mse": 2.8014183044433594e-06, "ref": "sdpa_math_fp32"}, "err": null}
2
- {"ts": "2025-11-10T22:11:46Z", "run": "8d69ef94c7594eb581f8f1e4fd6b3eef", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L256_bfloat16", "batch": 1, "seq_len": 4352, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.9720749999360123, "p50": 0.9796540000479581, "p90": 0.9886739999274141, "mean": 0.9813904000111506, "iqr": 0.011588999768719077, "raw_times": [0.9894639999856736, 0.9720749999360123, 0.9886739999274141, 0.977085000158695, 0.9796540000479581], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.9749249998094456, "peak_bytes": 304742400, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
3
- {"ts": "2025-11-10T22:11:46Z", "run": "8d69ef94c7594eb581f8f1e4fd6b3eef", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L320_bfloat16", "batch": 1, "seq_len": 4416, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.026366000132839, "p50": 1.0466650001035305, "p90": 1.048316000151317, "mean": 1.0439156000302319, "iqr": 0.012310000329307513, "raw_times": [1.0622249999414635, 1.0466650001035305, 1.026366000132839, 1.048316000151317, 1.0360059998220095], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.0340549999909854, "peak_bytes": 307494912, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
4
- {"ts": "2025-11-10T22:11:46Z", "run": "8d69ef94c7594eb581f8f1e4fd6b3eef", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L384_bfloat16", "batch": 1, "seq_len": 4480, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.031215000011798, "p50": 1.0415550000288931, "p90": 1.0416950001399528, "mean": 1.0391672000423569, "iqr": 0.009410000075149583, "raw_times": [1.031215000011798, 1.0322850000648032, 1.0416950001399528, 1.0415550000288931, 1.0490859999663371], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.0389750000285858, "peak_bytes": 311296000, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
5
- {"ts": "2025-11-10T22:11:46Z", "run": "8d69ef94c7594eb581f8f1e4fd6b3eef", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L448_bfloat16", "batch": 1, "seq_len": 4544, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.2092779998056358, "p50": 1.2178079998648172, "p90": 1.2180080000234739, "mean": 1.21775799993884, "iqr": 0.002989999984492897, "raw_times": [1.2286779999612918, 1.2180080000234739, 1.2178079998648172, 1.215018000038981, 1.2092779998056358], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.2385289999201632, "peak_bytes": 315621376, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
6
- {"ts": "2025-11-10T22:11:47Z", "run": "8d69ef94c7594eb581f8f1e4fd6b3eef", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L512_bfloat16", "batch": 1, "seq_len": 4608, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.203338000095755, "p50": 1.2106680001124914, "p90": 1.218707999896651, "mean": 1.2165860000095563, "iqr": 0.014340000006995979, "raw_times": [1.204367999889655, 1.218707999896651, 1.203338000095755, 1.2106680001124914, 1.245848000053229], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.2345879999884346, "peak_bytes": 319946752, "ok": true, "absmax": 0.125, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.125, "mae": 0.000362396240234375, "mse": 2.8014183044433594e-06, "ref": "sdpa_math_fp32"}, "err": null}
 
1
+ {"ts": "2025-12-19T18:57:16Z", "run": "acbd7f3686fd441a96acd6946b221ed9", "impl": "sage_int8_fp16", "tags": {"family": "sageattention", "backend": "int8_fp16_cuda", "compile": "none"}, "wl": {"name": "cuda_attn_L128_bfloat16", "batch": 1, "seq_len": 4224, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": null, "compile_ms": null, "peak_bytes": null, "ok": false, "absmax": null, "corr": {}, "err": {"type": "AttributeError", "msg": "module 'sage_attention_e8dcde4226fe38e6' has no attribute 'fwd'"}}
2
+ {"ts": "2025-12-19T18:57:16Z", "run": "acbd7f3686fd441a96acd6946b221ed9", "impl": "sage_int8_fp16", "tags": {"family": "sageattention", "backend": "int8_fp16_cuda", "compile": "none"}, "wl": {"name": "cuda_attn_L256_bfloat16", "batch": 1, "seq_len": 4352, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": null, "compile_ms": null, "peak_bytes": null, "ok": false, "absmax": null, "corr": {}, "err": {"type": "AttributeError", "msg": "module 'sage_attention_e8dcde4226fe38e6' has no attribute 'fwd'"}}
3
+ {"ts": "2025-12-19T18:57:16Z", "run": "acbd7f3686fd441a96acd6946b221ed9", "impl": "sage_int8_fp16", "tags": {"family": "sageattention", "backend": "int8_fp16_cuda", "compile": "none"}, "wl": {"name": "cuda_attn_L320_bfloat16", "batch": 1, "seq_len": 4416, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": null, "compile_ms": null, "peak_bytes": null, "ok": false, "absmax": null, "corr": {}, "err": {"type": "AttributeError", "msg": "module 'sage_attention_e8dcde4226fe38e6' has no attribute 'fwd'"}}
4
+ {"ts": "2025-12-19T18:57:16Z", "run": "acbd7f3686fd441a96acd6946b221ed9", "impl": "sage_int8_fp16", "tags": {"family": "sageattention", "backend": "int8_fp16_cuda", "compile": "none"}, "wl": {"name": "cuda_attn_L384_bfloat16", "batch": 1, "seq_len": 4480, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": null, "compile_ms": null, "peak_bytes": null, "ok": false, "absmax": null, "corr": {}, "err": {"type": "AttributeError", "msg": "module 'sage_attention_e8dcde4226fe38e6' has no attribute 'fwd'"}}
5
+ {"ts": "2025-12-19T18:57:16Z", "run": "acbd7f3686fd441a96acd6946b221ed9", "impl": "sage_int8_fp16", "tags": {"family": "sageattention", "backend": "int8_fp16_cuda", "compile": "none"}, "wl": {"name": "cuda_attn_L448_bfloat16", "batch": 1, "seq_len": 4544, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": null, "compile_ms": null, "peak_bytes": null, "ok": false, "absmax": null, "corr": {}, "err": {"type": "AttributeError", "msg": "module 'sage_attention_e8dcde4226fe38e6' has no attribute 'fwd'"}}
6
+ {"ts": "2025-12-19T18:57:16Z", "run": "acbd7f3686fd441a96acd6946b221ed9", "impl": "sage_int8_fp16", "tags": {"family": "sageattention", "backend": "int8_fp16_cuda", "compile": "none"}, "wl": {"name": "cuda_attn_L512_bfloat16", "batch": 1, "seq_len": 4608, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": null, "compile_ms": null, "peak_bytes": null, "ok": false, "absmax": null, "corr": {}, "err": {"type": "AttributeError", "msg": "module 'sage_attention_e8dcde4226fe38e6' has no attribute 'fwd'"}}
flash_attn/impls/cells/benchmark.py CHANGED
@@ -3,8 +3,8 @@
3
  # dependencies = [
4
  # "numpy",
5
  # "torch==2.8.0",
6
- # "kernels-benchmark-tools",
7
  # "kernels",
 
8
  # ]
9
  #
10
  # [tool.uv.sources]
@@ -15,17 +15,18 @@ import sys
15
  from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
16
  from kernels import get_kernel
17
 
18
- # Load the flash attention 3 kernel
19
- hf_kernels_flash_attn3 = get_kernel("kernels-community/flash-attn3")
20
 
21
 
22
- def hf_flash_attention3(query, key, value):
23
- return hf_kernels_flash_attn3.flash_attn_func(query, key, value, causal=False)[0]
 
24
 
25
 
26
  run_benchmark(
27
  kernel_type=KernelTypeEnum.ATTENTION,
28
- impl_name="hf_kernels_flash_attn3",
29
- impl_tags={"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"},
30
- impl_func=hf_flash_attention3,
31
  )
 
3
  # dependencies = [
4
  # "numpy",
5
  # "torch==2.8.0",
 
6
  # "kernels",
7
+ # "kernels-benchmark-tools",
8
  # ]
9
  #
10
  # [tool.uv.sources]
 
15
  from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
16
  from kernels import get_kernel
17
 
18
+ # Load the sage attention kernel
19
+ hf_kernels_sage_attn = get_kernel("kernels-community/sage_attention")
20
 
21
 
22
+ def sage_attention(query, key, value):
23
+ """SageAttention with INT8 Q/K quantization and FP16 P/V"""
24
+ return hf_kernels_sage_attn.fwd(query, key, value, is_causal=False)[0]
25
 
26
 
27
  run_benchmark(
28
  kernel_type=KernelTypeEnum.ATTENTION,
29
+ impl_name="sage_int8_fp16",
30
+ impl_tags={"family": "sageattention", "backend": "int8_fp16_cuda", "compile": "none"},
31
+ impl_func=sage_attention,
32
  )
flash_attn/impls/flash_attention.html CHANGED
@@ -3874,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
3874
  <div class="system-info">
3875
  <div class="system-info-header">Generated on:</div>
3876
  <div class="system-info-content">
3877
- Linux x86_64 | Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35
3878
  </div>
3879
  </div>
3880
 
@@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content {
3888
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3890
  </span> |
3891
- Cell: nv | 0.26s
3892
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3894
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3905,16 +3905,16 @@ Cell: nv | 0.26s
3905
  </div>
3906
  </div>
3907
  <div id="output-nv" class="cell-output">
3908
- <div class="cell-stdout"><pre class="stdout-text">Mon Nov 10 21:58:51 2025
3909
  +-----------------------------------------------------------------------------------------+
3910
- | NVIDIA-SMI 580.95.05 Driver Version: 580.95.05 CUDA Version: 13.0 |
3911
  +-----------------------------------------+------------------------+----------------------+
3912
  | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
3913
  | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
3914
  | | | MIG M. |
3915
  |=========================================+========================+======================|
3916
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3917
- | N/A 32C P0 139W / 350W | 0MiB / 46068MiB | 83% Default |
3918
  | | | N/A |
3919
  +-----------------------------------------+------------------------+----------------------+
3920
 
@@ -3938,7 +3938,7 @@ Cell: nv | 0.26s
3938
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3939
  <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3940
  </span> |
3941
- Cell: benchmark | 4.03s
3942
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3943
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3944
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3989,29 +3989,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L128_bfloat16
3989
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3990
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3991
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3992
- torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.628ms 101.57% 3.628ms 3.628ms 1
3993
- torch_flash_ma 5.67% 314.697us 48.49% 2.689ms 2.689ms 0.000us 0.00% 3.612ms 3.612ms 1
3994
- aten::scaled_dot_product_attention 0.72% 39.870us 3.84% 213.234us 71.078us 0.000us 0.00% 2.845ms 948.416us 3
3995
- aten::_scaled_dot_product_flash_attention 0.43% 24.020us 3.13% 173.364us 57.788us 0.000us 0.00% 2.845ms 948.416us 3
3996
- aten::_flash_attention_forward 0.70% 39.034us 2.33% 129.042us 43.014us 2.845ms 79.65% 2.845ms 948.416us 3
3997
- void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.845ms 79.65% 2.845ms 948.416us 3
3998
- aten::contiguous 0.22% 12.191us 37.88% 2.101ms 175.086us 0.000us 0.00% 766.879us 63.907us 12
3999
- aten::clone 0.59% 32.480us 37.66% 2.089ms 174.070us 0.000us 0.00% 766.879us 63.907us 12
4000
- aten::copy_ 1.56% 86.776us 35.66% 1.978ms 164.799us 726.879us 20.35% 766.879us 63.907us 12
4001
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 726.879us 20.35% 726.879us 60.573us 12
4002
- Activity Buffer Request 32.26% 1.789ms 32.26% 1.789ms 1.789ms 40.000us 1.12% 40.000us 40.000us 1
4003
- aten::transpose 1.07% 59.612us 1.46% 80.772us 3.365us 0.000us 0.00% 0.000us 0.000us 24
4004
- aten::as_strided 0.38% 21.160us 0.38% 21.160us 0.882us 0.000us 0.00% 0.000us 0.000us 24
4005
- aten::empty_like 0.40% 22.459us 1.80% 99.659us 6.644us 0.000us 0.00% 0.000us 0.000us 15
4006
- aten::empty 1.66% 92.037us 1.66% 92.037us 3.835us 0.000us 0.00% 0.000us 0.000us 24
4007
- cudaLaunchKernel 2.29% 126.900us 2.29% 126.900us 8.460us 0.000us 0.00% 0.000us 0.000us 15
4008
- aten::empty_strided 0.28% 15.620us 0.28% 15.620us 5.207us 0.000us 0.00% 0.000us 0.000us 3
4009
- cudaDeviceGetAttribute 0.04% 2.280us 0.04% 2.280us 0.380us 0.000us 0.00% 0.000us 0.000us 6
4010
- cudaFuncSetAttribute 0.20% 11.200us 0.20% 11.200us 3.733us 0.000us 0.00% 0.000us 0.000us 3
4011
- cudaDeviceSynchronize 51.51% 2.857ms 51.51% 2.857ms 2.857ms 0.000us 0.00% 0.000us 0.000us 1
4012
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4013
- Self CPU time total: 5.546ms
4014
- Self CUDA time total: 3.572ms
4015
 
4016
 
4017
 
@@ -4021,29 +4021,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L256_bfloat16
4021
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4022
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4023
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4024
- torch_flash_ma 4.57% 259.472us 46.25% 2.626ms 2.626ms 0.000us 0.00% 3.786ms 3.786ms 1
4025
- torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.742ms 100.27% 3.742ms 3.742ms 1
4026
- aten::scaled_dot_product_attention 0.42% 24.011us 3.41% 193.713us 64.571us 0.000us 0.00% 2.968ms 989.492us 3
4027
- aten::_scaled_dot_product_flash_attention 0.33% 18.660us 2.99% 169.702us 56.567us 0.000us 0.00% 2.968ms 989.492us 3
4028
- aten::_flash_attention_forward 0.83% 47.240us 2.21% 125.672us 41.891us 2.968ms 79.55% 2.968ms 989.492us 3
4029
- void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.968ms 79.55% 2.968ms 989.492us 3
4030
- aten::contiguous 0.19% 10.613us 37.48% 2.128ms 177.333us 0.000us 0.00% 817.342us 68.112us 12
4031
- aten::clone 0.52% 29.369us 37.29% 2.117ms 176.448us 0.000us 0.00% 817.342us 68.112us 12
4032
- aten::copy_ 1.41% 80.272us 35.64% 2.023ms 168.619us 762.942us 20.45% 817.342us 68.112us 12
4033
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 762.942us 20.45% 762.942us 63.579us 12
4034
- Activity Buffer Request 32.67% 1.855ms 32.67% 1.855ms 1.855ms 54.400us 1.46% 54.400us 54.400us 1
4035
- aten::transpose 0.90% 51.353us 1.23% 69.912us 2.913us 0.000us 0.00% 0.000us 0.000us 24
4036
- aten::as_strided 0.33% 18.559us 0.33% 18.559us 0.773us 0.000us 0.00% 0.000us 0.000us 24
4037
- aten::empty_like 0.37% 20.909us 1.47% 83.391us 5.559us 0.000us 0.00% 0.000us 0.000us 15
4038
- aten::empty 1.39% 78.982us 1.39% 78.982us 3.291us 0.000us 0.00% 0.000us 0.000us 24
4039
- cudaLaunchKernel 1.94% 110.382us 1.94% 110.382us 7.359us 0.000us 0.00% 0.000us 0.000us 15
4040
- aten::empty_strided 0.24% 13.461us 0.24% 13.461us 4.487us 0.000us 0.00% 0.000us 0.000us 3
4041
- cudaDeviceGetAttribute 0.05% 2.710us 0.05% 2.710us 0.452us 0.000us 0.00% 0.000us 0.000us 6
4042
- cudaFuncSetAttribute 0.09% 4.940us 0.09% 4.940us 1.647us 0.000us 0.00% 0.000us 0.000us 3
4043
- cudaDeviceSynchronize 53.75% 3.052ms 53.75% 3.052ms 3.052ms 0.000us 0.00% 0.000us 0.000us 1
4044
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4045
- Self CPU time total: 5.678ms
4046
- Self CUDA time total: 3.731ms
4047
 
4048
 
4049
 
@@ -4053,29 +4053,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L320_bfloat16
4053
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4054
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4055
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4056
- torch_flash_ma 4.60% 260.065us 44.20% 2.500ms 2.500ms 0.000us 0.00% 3.871ms 3.871ms 1
4057
- torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.823ms 100.27% 3.823ms 3.823ms 1
4058
- aten::scaled_dot_product_attention 0.46% 25.840us 3.28% 185.632us 61.877us 0.000us 0.00% 3.035ms 1.012ms 3
4059
- aten::_scaled_dot_product_flash_attention 0.32% 17.999us 2.82% 159.792us 53.264us 0.000us 0.00% 3.035ms 1.012ms 3
4060
- aten::_flash_attention_forward 0.73% 41.121us 2.09% 118.472us 39.491us 3.035ms 79.59% 3.035ms 1.012ms 3
4061
- void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.035ms 79.59% 3.035ms 1.012ms 3
4062
- aten::contiguous 0.19% 10.499us 35.53% 2.010ms 167.521us 0.000us 0.00% 836.093us 69.674us 12
4063
- aten::clone 0.50% 28.109us 35.35% 2.000ms 166.646us 0.000us 0.00% 836.093us 69.674us 12
4064
- aten::copy_ 1.42% 80.472us 33.72% 1.908ms 158.959us 778.333us 20.41% 836.093us 69.674us 12
4065
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 778.333us 20.41% 778.333us 64.861us 12
4066
- Activity Buffer Request 30.89% 1.747ms 30.89% 1.747ms 1.747ms 57.760us 1.51% 57.760us 57.760us 1
4067
- aten::transpose 0.88% 49.936us 1.20% 67.813us 2.826us 0.000us 0.00% 0.000us 0.000us 24
4068
- aten::as_strided 0.32% 17.877us 0.32% 17.877us 0.745us 0.000us 0.00% 0.000us 0.000us 24
4069
- aten::empty_like 0.36% 20.321us 1.47% 83.262us 5.551us 0.000us 0.00% 0.000us 0.000us 15
4070
- aten::empty 1.37% 77.333us 1.37% 77.333us 3.222us 0.000us 0.00% 0.000us 0.000us 24
4071
- cudaLaunchKernel 1.81% 102.481us 1.81% 102.481us 6.832us 0.000us 0.00% 0.000us 0.000us 15
4072
- aten::empty_strided 0.25% 14.120us 0.25% 14.120us 4.707us 0.000us 0.00% 0.000us 0.000us 3
4073
- cudaDeviceGetAttribute 0.03% 1.688us 0.03% 1.688us 0.281us 0.000us 0.00% 0.000us 0.000us 6
4074
- cudaFuncSetAttribute 0.09% 5.331us 0.09% 5.331us 1.777us 0.000us 0.00% 0.000us 0.000us 3
4075
- cudaDeviceSynchronize 55.80% 3.157ms 55.80% 3.157ms 3.157ms 0.000us 0.00% 0.000us 0.000us 1
4076
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4077
- Self CPU time total: 5.657ms
4078
- Self CUDA time total: 3.813ms
4079
 
4080
 
4081
 
@@ -4085,29 +4085,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L384_bfloat16
4085
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4086
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4087
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4088
- torch_flash_ma 4.36% 258.876us 46.43% 2.758ms 2.758ms 0.000us 0.00% 3.960ms 3.960ms 1
4089
- torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.911ms 100.27% 3.911ms 3.911ms 1
4090
- aten::scaled_dot_product_attention 0.42% 24.860us 4.02% 238.593us 79.531us 0.000us 0.00% 3.109ms 1.036ms 3
4091
- aten::_scaled_dot_product_flash_attention 0.32% 19.211us 3.60% 213.733us 71.244us 0.000us 0.00% 3.109ms 1.036ms 3
4092
- aten::_flash_attention_forward 0.74% 43.768us 2.88% 170.772us 56.924us 3.109ms 79.70% 3.109ms 1.036ms 3
4093
- void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.109ms 79.70% 3.109ms 1.036ms 3
4094
- aten::contiguous 0.17% 10.099us 37.27% 2.213ms 184.454us 0.000us 0.00% 850.560us 70.880us 12
4095
- aten::clone 0.48% 28.250us 37.10% 2.203ms 183.613us 0.000us 0.00% 850.560us 70.880us 12
4096
- aten::copy_ 1.36% 80.903us 35.54% 2.111ms 175.896us 791.680us 20.30% 850.560us 70.880us 12
4097
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 791.680us 20.30% 791.680us 65.973us 12
4098
- Activity Buffer Request 29.13% 1.730ms 29.13% 1.730ms 1.730ms 58.880us 1.51% 58.880us 58.880us 1
4099
- aten::transpose 0.86% 50.781us 1.18% 70.362us 2.932us 0.000us 0.00% 0.000us 0.000us 24
4100
- aten::as_strided 0.33% 19.581us 0.33% 19.581us 0.816us 0.000us 0.00% 0.000us 0.000us 24
4101
- aten::empty_like 0.35% 20.589us 1.40% 83.331us 5.555us 0.000us 0.00% 0.000us 0.000us 15
4102
- aten::empty 1.32% 78.663us 1.32% 78.663us 3.278us 0.000us 0.00% 0.000us 0.000us 24
4103
- cudaLaunchKernel 5.47% 324.743us 5.47% 324.743us 21.650us 0.000us 0.00% 0.000us 0.000us 15
4104
- aten::empty_strided 0.23% 13.800us 0.23% 13.800us 4.600us 0.000us 0.00% 0.000us 0.000us 3
4105
- cudaDeviceGetAttribute 0.80% 47.662us 0.80% 47.662us 7.944us 0.000us 0.00% 0.000us 0.000us 6
4106
- cudaFuncSetAttribute 0.10% 5.930us 0.10% 5.930us 1.977us 0.000us 0.00% 0.000us 0.000us 3
4107
- cudaDeviceSynchronize 53.57% 3.181ms 53.57% 3.181ms 3.181ms 0.000us 0.00% 0.000us 0.000us 1
4108
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4109
- Self CPU time total: 5.939ms
4110
- Self CUDA time total: 3.901ms
4111
 
4112
 
4113
 
@@ -4117,29 +4117,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L448_bfloat16
4117
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4118
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4119
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4120
- torch_flash_ma 4.85% 313.852us 44.01% 2.846ms 2.846ms 0.000us 0.00% 4.405ms 4.405ms 1
4121
- torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.356ms 100.24% 4.356ms 4.356ms 1
4122
- aten::scaled_dot_product_attention 0.40% 25.602us 2.92% 188.673us 62.891us 0.000us 0.00% 3.542ms 1.181ms 3
4123
- aten::_scaled_dot_product_flash_attention 0.29% 18.450us 2.52% 163.071us 54.357us 0.000us 0.00% 3.542ms 1.181ms 3
4124
- aten::_flash_attention_forward 0.66% 42.791us 1.88% 121.422us 40.474us 3.542ms 81.52% 3.542ms 1.181ms 3
4125
- void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.542ms 81.52% 3.542ms 1.181ms 3
4126
- aten::contiguous 0.15% 9.702us 35.55% 2.299ms 191.596us 0.000us 0.00% 862.461us 71.872us 12
4127
- aten::clone 0.45% 28.857us 35.40% 2.289ms 190.788us 0.000us 0.00% 862.461us 71.872us 12
4128
- aten::copy_ 1.23% 79.423us 33.92% 2.194ms 182.809us 803.166us 18.48% 862.461us 71.872us 12
4129
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 803.166us 18.48% 803.166us 66.930us 12
4130
- Activity Buffer Request 28.18% 1.822ms 28.18% 1.822ms 1.822ms 59.295us 1.36% 59.295us 59.295us 1
4131
- aten::transpose 0.77% 49.902us 1.04% 67.461us 2.811us 0.000us 0.00% 0.000us 0.000us 24
4132
- aten::as_strided 0.27% 17.559us 0.27% 17.559us 0.732us 0.000us 0.00% 0.000us 0.000us 24
4133
- aten::empty_like 0.33% 21.611us 1.34% 86.704us 5.780us 0.000us 0.00% 0.000us 0.000us 15
4134
- aten::empty 1.24% 80.042us 1.24% 80.042us 3.335us 0.000us 0.00% 0.000us 0.000us 24
4135
- cudaLaunchKernel 4.86% 314.554us 4.86% 314.554us 20.970us 0.000us 0.00% 0.000us 0.000us 15
4136
- aten::empty_strided 0.23% 14.691us 0.23% 14.691us 4.897us 0.000us 0.00% 0.000us 0.000us 3
4137
- cudaDeviceGetAttribute 0.03% 1.700us 0.03% 1.700us 0.283us 0.000us 0.00% 0.000us 0.000us 6
4138
- cudaFuncSetAttribute 0.08% 4.940us 0.08% 4.940us 1.647us 0.000us 0.00% 0.000us 0.000us 3
4139
- cudaDeviceSynchronize 55.99% 3.621ms 55.99% 3.621ms 3.621ms 0.000us 0.00% 0.000us 0.000us 1
4140
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4141
- Self CPU time total: 6.467ms
4142
- Self CUDA time total: 4.345ms
4143
 
4144
 
4145
 
@@ -4149,38 +4149,38 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L512_bfloat16
4149
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4150
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4151
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4152
- torch_flash_ma 3.49% 226.744us 41.30% 2.682ms 2.682ms 0.000us 0.00% 4.507ms 4.507ms 1
4153
- torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.456ms 100.23% 4.456ms 4.456ms 1
4154
- aten::scaled_dot_product_attention 0.39% 25.000us 2.68% 173.753us 57.918us 0.000us 0.00% 3.635ms 1.212ms 3
4155
- aten::_scaled_dot_product_flash_attention 0.28% 18.340us 2.29% 148.753us 49.584us 0.000us 0.00% 3.635ms 1.212ms 3
4156
- aten::_flash_attention_forward 0.53% 34.164us 1.68% 109.263us 36.421us 3.635ms 81.77% 3.635ms 1.212ms 3
4157
- void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.635ms 81.77% 3.635ms 1.212ms 3
4158
- aten::contiguous 0.14% 8.821us 34.49% 2.240ms 186.626us 0.000us 0.00% 871.422us 72.619us 12
4159
- aten::clone 0.41% 26.612us 34.36% 2.231ms 185.890us 0.000us 0.00% 871.422us 72.619us 12
4160
- aten::copy_ 1.18% 76.909us 32.95% 2.140ms 178.308us 810.270us 18.23% 871.422us 72.619us 12
4161
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 810.270us 18.23% 810.270us 67.523us 12
4162
- Activity Buffer Request 27.48% 1.784ms 27.48% 1.784ms 1.784ms 61.152us 1.38% 61.152us 61.152us 1
4163
- aten::transpose 0.71% 45.940us 0.97% 63.019us 2.626us 0.000us 0.00% 0.000us 0.000us 24
4164
- aten::as_strided 0.26% 17.079us 0.26% 17.079us 0.712us 0.000us 0.00% 0.000us 0.000us 24
4165
- aten::empty_like 0.30% 19.781us 1.27% 82.742us 5.516us 0.000us 0.00% 0.000us 0.000us 15
4166
- aten::empty 1.21% 78.423us 1.21% 78.423us 3.268us 0.000us 0.00% 0.000us 0.000us 24
4167
- cudaLaunchKernel 4.62% 300.294us 4.62% 300.294us 20.020us 0.000us 0.00% 0.000us 0.000us 15
4168
- aten::empty_strided 0.21% 13.430us 0.21% 13.430us 4.477us 0.000us 0.00% 0.000us 0.000us 3
4169
- cudaDeviceGetAttribute 0.02% 1.610us 0.02% 1.610us 0.268us 0.000us 0.00% 0.000us 0.000us 6
4170
- cudaFuncSetAttribute 0.07% 4.648us 0.07% 4.648us 1.549us 0.000us 0.00% 0.000us 0.000us 3
4171
- cudaDeviceSynchronize 58.70% 3.811ms 58.70% 3.811ms 3.811ms 0.000us 0.00% 0.000us 0.000us 1
4172
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4173
- Self CPU time total: 6.493ms
4174
- Self CUDA time total: 4.445ms
4175
 
4176
 
4177
  impl wl p50(ms) ok
4178
- torch_flash_ma cuda_attn_L128_bfloat16 1.23 True
4179
- torch_flash_ma cuda_attn_L256_bfloat16 1.28 True
4180
- torch_flash_ma cuda_attn_L320_bfloat16 1.30 True
4181
- torch_flash_ma cuda_attn_L384_bfloat16 1.33 True
4182
- torch_flash_ma cuda_attn_L448_bfloat16 1.48 True
4183
- torch_flash_ma cuda_attn_L512_bfloat16 1.52 True
4184
  </pre></div>
4185
  <div class="cell-artifacts">
4186
  <h4>Artifacts:</h4>
 
3874
  <div class="system-info">
3875
  <div class="system-info-header">Generated on:</div>
3876
  <div class="system-info-content">
3877
+ Linux x86_64 | Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35
3878
  </div>
3879
  </div>
3880
 
 
3888
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3890
  </span> |
3891
+ Cell: nv | 0.28s
3892
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3894
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
 
3905
  </div>
3906
  </div>
3907
  <div id="output-nv" class="cell-output">
3908
+ <div class="cell-stdout"><pre class="stdout-text">Fri Dec 19 18:57:02 2025
3909
  +-----------------------------------------------------------------------------------------+
3910
+ | NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 |
3911
  +-----------------------------------------+------------------------+----------------------+
3912
  | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
3913
  | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
3914
  | | | MIG M. |
3915
  |=========================================+========================+======================|
3916
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3917
+ | N/A 34C P0 103W / 350W | 0MiB / 46068MiB | 31% Default |
3918
  | | | N/A |
3919
  +-----------------------------------------+------------------------+----------------------+
3920
 
 
3938
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3939
  <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3940
  </span> |
3941
+ Cell: benchmark | 4.28s
3942
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3943
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3944
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3989
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3990
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3991
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3992
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.564ms 102.03% 3.564ms 3.564ms 1
3993
+ torch_flash_ma 5.92% 322.864us 49.31% 2.690ms 2.690ms 0.000us 0.00% 3.533ms 3.533ms 1
3994
+ aten::scaled_dot_product_attention 0.71% 38.601us 3.97% 216.634us 72.211us 0.000us 0.00% 2.778ms 926.157us 3
3995
+ aten::_scaled_dot_product_flash_attention 0.48% 26.049us 3.26% 178.033us 59.344us 0.000us 0.00% 2.778ms 926.157us 3
3996
+ aten::_flash_attention_forward 0.70% 38.244us 2.37% 129.043us 43.014us 2.778ms 79.53% 2.778ms 926.157us 3
3997
+ void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.778ms 79.53% 2.778ms 926.157us 3
3998
+ aten::contiguous 0.25% 13.590us 38.20% 2.084ms 173.652us 0.000us 0.00% 754.825us 62.902us 12
3999
+ aten::clone 0.64% 35.000us 37.95% 2.070ms 172.519us 0.000us 0.00% 754.825us 62.902us 12
4000
+ aten::copy_ 1.68% 91.923us 35.78% 1.952ms 162.645us 715.017us 20.47% 754.825us 62.902us 12
4001
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 715.017us 20.47% 715.017us 59.585us 12
4002
+ Activity Buffer Request 32.25% 1.760ms 32.25% 1.760ms 1.760ms 39.808us 1.14% 39.808us 39.808us 1
4003
+ aten::transpose 1.21% 66.005us 1.65% 89.944us 3.748us 0.000us 0.00% 0.000us 0.000us 24
4004
+ aten::as_strided 0.44% 23.939us 0.44% 23.939us 0.997us 0.000us 0.00% 0.000us 0.000us 24
4005
+ aten::empty_like 0.46% 24.998us 1.93% 105.512us 7.034us 0.000us 0.00% 0.000us 0.000us 15
4006
+ aten::empty 1.74% 94.901us 1.74% 94.901us 3.954us 0.000us 0.00% 0.000us 0.000us 24
4007
+ cudaLaunchKernel 2.30% 125.662us 2.30% 125.662us 8.377us 0.000us 0.00% 0.000us 0.000us 15
4008
+ aten::empty_strided 0.30% 16.192us 0.30% 16.192us 5.397us 0.000us 0.00% 0.000us 0.000us 3
4009
+ cudaDeviceGetAttribute 0.04% 2.360us 0.04% 2.360us 0.393us 0.000us 0.00% 0.000us 0.000us 6
4010
+ cudaFuncSetAttribute 0.19% 10.450us 0.19% 10.450us 3.483us 0.000us 0.00% 0.000us 0.000us 3
4011
+ cudaDeviceSynchronize 50.69% 2.765ms 50.69% 2.765ms 2.765ms 0.000us 0.00% 0.000us 0.000us 1
4012
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4013
+ Self CPU time total: 5.456ms
4014
+ Self CUDA time total: 3.493ms
4015
 
4016
 
4017
 
 
4021
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4022
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4023
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4024
+ torch_flash_ma 4.71% 256.956us 44.29% 2.416ms 2.416ms 0.000us 0.00% 3.774ms 3.774ms 1
4025
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.728ms 100.28% 3.728ms 3.728ms 1
4026
+ aten::scaled_dot_product_attention 0.47% 25.660us 3.51% 191.364us 63.788us 0.000us 0.00% 2.953ms 984.270us 3
4027
+ aten::_scaled_dot_product_flash_attention 0.35% 18.860us 3.04% 165.704us 55.235us 0.000us 0.00% 2.953ms 984.270us 3
4028
+ aten::_flash_attention_forward 0.82% 44.462us 2.27% 123.662us 41.221us 2.953ms 79.43% 2.953ms 984.270us 3
4029
+ void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.953ms 79.43% 2.953ms 984.270us 3
4030
+ aten::contiguous 0.19% 10.628us 35.19% 1.920ms 159.985us 0.000us 0.00% 820.970us 68.414us 12
4031
+ aten::clone 0.57% 30.960us 35.00% 1.909ms 159.100us 0.000us 0.00% 820.970us 68.414us 12
4032
+ aten::copy_ 1.50% 81.693us 33.25% 1.814ms 151.145us 764.809us 20.57% 820.970us 68.414us 12
4033
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 764.809us 20.57% 764.809us 63.734us 12
4034
+ Activity Buffer Request 30.19% 1.647ms 30.19% 1.647ms 1.647ms 56.161us 1.51% 56.161us 56.161us 1
4035
+ aten::transpose 0.93% 50.867us 1.30% 70.984us 2.958us 0.000us 0.00% 0.000us 0.000us 24
4036
+ aten::as_strided 0.37% 20.117us 0.37% 20.117us 0.838us 0.000us 0.00% 0.000us 0.000us 24
4037
+ aten::empty_like 0.39% 21.390us 1.52% 82.920us 5.528us 0.000us 0.00% 0.000us 0.000us 15
4038
+ aten::empty 1.43% 78.110us 1.43% 78.110us 3.255us 0.000us 0.00% 0.000us 0.000us 24
4039
+ cudaLaunchKernel 2.02% 110.102us 2.02% 110.102us 7.340us 0.000us 0.00% 0.000us 0.000us 15
4040
+ aten::empty_strided 0.25% 13.480us 0.25% 13.480us 4.493us 0.000us 0.00% 0.000us 0.000us 3
4041
+ cudaDeviceGetAttribute 0.03% 1.800us 0.03% 1.800us 0.300us 0.000us 0.00% 0.000us 0.000us 6
4042
+ cudaFuncSetAttribute 0.07% 4.010us 0.07% 4.010us 1.337us 0.000us 0.00% 0.000us 0.000us 3
4043
+ cudaDeviceSynchronize 55.71% 3.039ms 55.71% 3.039ms 3.039ms 0.000us 0.00% 0.000us 0.000us 1
4044
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4045
+ Self CPU time total: 5.455ms
4046
+ Self CUDA time total: 3.718ms
4047
 
4048
 
4049
 
 
4053
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4054
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4055
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4056
+ torch_flash_ma 4.83% 269.985us 44.74% 2.500ms 2.500ms 0.000us 0.00% 3.834ms 3.834ms 1
4057
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.786ms 100.29% 3.786ms 3.786ms 1
4058
+ aten::scaled_dot_product_attention 0.43% 24.011us 3.55% 198.294us 66.098us 0.000us 0.00% 2.997ms 999.122us 3
4059
+ aten::_scaled_dot_product_flash_attention 0.34% 19.010us 3.12% 174.283us 58.094us 0.000us 0.00% 2.997ms 999.122us 3
4060
+ aten::_flash_attention_forward 0.79% 43.958us 2.36% 131.713us 43.904us 2.997ms 79.40% 2.997ms 999.122us 3
4061
+ void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.997ms 79.40% 2.997ms 999.122us 3
4062
+ aten::contiguous 0.20% 11.122us 35.53% 1.985ms 165.423us 0.000us 0.00% 837.094us 69.758us 12
4063
+ aten::clone 0.53% 29.350us 35.33% 1.974ms 164.496us 0.000us 0.00% 837.094us 69.758us 12
4064
+ aten::copy_ 1.44% 80.718us 33.66% 1.880ms 156.702us 777.862us 20.60% 837.094us 69.758us 12
4065
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 777.862us 20.60% 777.862us 64.822us 12
4066
+ Activity Buffer Request 30.68% 1.714ms 30.68% 1.714ms 1.714ms 59.232us 1.57% 59.232us 59.232us 1
4067
+ aten::transpose 0.92% 51.150us 1.25% 70.010us 2.917us 0.000us 0.00% 0.000us 0.000us 24
4068
+ aten::as_strided 0.34% 18.860us 0.34% 18.860us 0.786us 0.000us 0.00% 0.000us 0.000us 24
4069
+ aten::empty_like 0.37% 20.561us 1.52% 84.672us 5.645us 0.000us 0.00% 0.000us 0.000us 15
4070
+ aten::empty 1.54% 85.833us 1.54% 85.833us 3.576us 0.000us 0.00% 0.000us 0.000us 24
4071
+ cudaLaunchKernel 1.95% 109.214us 1.95% 109.214us 7.281us 0.000us 0.00% 0.000us 0.000us 15
4072
+ aten::empty_strided 0.27% 15.280us 0.27% 15.280us 5.093us 0.000us 0.00% 0.000us 0.000us 3
4073
+ cudaDeviceGetAttribute 0.04% 2.120us 0.04% 2.120us 0.353us 0.000us 0.00% 0.000us 0.000us 6
4074
+ cudaFuncSetAttribute 0.08% 4.293us 0.08% 4.293us 1.431us 0.000us 0.00% 0.000us 0.000us 3
4075
+ cudaDeviceSynchronize 55.26% 3.087ms 55.26% 3.087ms 3.087ms 0.000us 0.00% 0.000us 0.000us 1
4076
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4077
+ Self CPU time total: 5.587ms
4078
+ Self CUDA time total: 3.775ms
4079
 
4080
 
4081
 
 
4085
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4086
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4087
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4088
+ torch_flash_ma 4.54% 264.303us 45.63% 2.655ms 2.655ms 0.000us 0.00% 3.910ms 3.910ms 1
4089
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.865ms 100.29% 3.865ms 3.865ms 1
4090
+ aten::scaled_dot_product_attention 0.44% 25.860us 3.27% 190.173us 63.391us 0.000us 0.00% 3.076ms 1.025ms 3
4091
+ aten::_scaled_dot_product_flash_attention 0.31% 18.100us 2.82% 164.313us 54.771us 0.000us 0.00% 3.076ms 1.025ms 3
4092
+ aten::_flash_attention_forward 0.70% 40.710us 2.10% 122.383us 40.794us 3.076ms 79.82% 3.076ms 1.025ms 3
4093
+ void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.076ms 79.82% 3.076ms 1.025ms 3
4094
+ aten::contiguous 0.17% 9.789us 37.00% 2.153ms 179.384us 0.000us 0.00% 833.826us 69.486us 12
4095
+ aten::clone 0.51% 29.519us 36.83% 2.143ms 178.569us 0.000us 0.00% 833.826us 69.486us 12
4096
+ aten::copy_ 1.40% 81.625us 35.17% 2.046ms 170.539us 777.953us 20.18% 833.826us 69.486us 12
4097
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 777.953us 20.18% 777.953us 64.829us 12
4098
+ Activity Buffer Request 28.32% 1.648ms 28.32% 1.648ms 1.648ms 55.873us 1.45% 55.873us 55.873us 1
4099
+ aten::transpose 0.90% 52.082us 1.23% 71.483us 2.978us 0.000us 0.00% 0.000us 0.000us 24
4100
+ aten::as_strided 0.33% 19.401us 0.33% 19.401us 0.808us 0.000us 0.00% 0.000us 0.000us 24
4101
+ aten::empty_like 0.38% 21.851us 1.50% 87.141us 5.809us 0.000us 0.00% 0.000us 0.000us 15
4102
+ aten::empty 1.38% 80.371us 1.38% 80.371us 3.349us 0.000us 0.00% 0.000us 0.000us 24
4103
+ cudaLaunchKernel 5.88% 342.407us 5.88% 342.407us 22.827us 0.000us 0.00% 0.000us 0.000us 15
4104
+ aten::empty_strided 0.26% 14.910us 0.26% 14.910us 4.970us 0.000us 0.00% 0.000us 0.000us 3
4105
+ cudaDeviceGetAttribute 0.03% 1.811us 0.03% 1.811us 0.302us 0.000us 0.00% 0.000us 0.000us 6
4106
+ cudaFuncSetAttribute 0.07% 4.181us 0.07% 4.181us 1.394us 0.000us 0.00% 0.000us 0.000us 3
4107
+ cudaDeviceSynchronize 54.37% 3.164ms 54.37% 3.164ms 3.164ms 0.000us 0.00% 0.000us 0.000us 1
4108
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4109
+ Self CPU time total: 5.818ms
4110
+ Self CUDA time total: 3.854ms
4111
 
4112
 
4113
 
 
4117
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4118
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4119
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4120
+ torch_flash_ma 4.87% 306.708us 43.18% 2.718ms 2.718ms 0.000us 0.00% 4.364ms 4.364ms 1
4121
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.314ms 100.24% 4.314ms 4.314ms 1
4122
+ aten::scaled_dot_product_attention 0.42% 26.322us 3.04% 191.625us 63.875us 0.000us 0.00% 3.500ms 1.167ms 3
4123
+ aten::_scaled_dot_product_flash_attention 0.31% 19.398us 2.63% 165.303us 55.101us 0.000us 0.00% 3.500ms 1.167ms 3
4124
+ aten::_flash_attention_forward 0.65% 40.750us 1.93% 121.261us 40.420us 3.500ms 81.33% 3.500ms 1.167ms 3
4125
+ void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.500ms 81.33% 3.500ms 1.167ms 3
4126
+ aten::contiguous 0.18% 11.020us 34.50% 2.172ms 180.965us 0.000us 0.00% 863.467us 71.956us 12
4127
+ aten::clone 0.46% 28.711us 34.33% 2.161ms 180.047us 0.000us 0.00% 863.467us 71.956us 12
4128
+ aten::copy_ 1.29% 81.309us 32.83% 2.066ms 172.192us 803.338us 18.67% 863.467us 71.956us 12
4129
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 803.338us 18.67% 803.338us 66.945us 12
4130
+ Activity Buffer Request 26.76% 1.684ms 26.76% 1.684ms 1.684ms 60.129us 1.40% 60.129us 60.129us 1
4131
+ aten::transpose 0.83% 52.430us 1.15% 72.394us 3.016us 0.000us 0.00% 0.000us 0.000us 24
4132
+ aten::as_strided 0.32% 19.964us 0.32% 19.964us 0.832us 0.000us 0.00% 0.000us 0.000us 24
4133
+ aten::empty_like 0.32% 19.960us 1.35% 84.930us 5.662us 0.000us 0.00% 0.000us 0.000us 15
4134
+ aten::empty 1.27% 80.061us 1.27% 80.061us 3.336us 0.000us 0.00% 0.000us 0.000us 24
4135
+ cudaLaunchKernel 5.16% 325.017us 5.16% 325.017us 21.668us 0.000us 0.00% 0.000us 0.000us 15
4136
+ aten::empty_strided 0.23% 14.460us 0.23% 14.460us 4.820us 0.000us 0.00% 0.000us 0.000us 3
4137
+ cudaDeviceGetAttribute 0.04% 2.690us 0.04% 2.690us 0.448us 0.000us 0.00% 0.000us 0.000us 6
4138
+ cudaFuncSetAttribute 0.07% 4.660us 0.07% 4.660us 1.553us 0.000us 0.00% 0.000us 0.000us 3
4139
+ cudaDeviceSynchronize 56.82% 3.576ms 56.82% 3.576ms 3.576ms 0.000us 0.00% 0.000us 0.000us 1
4140
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4141
+ Self CPU time total: 6.294ms
4142
+ Self CUDA time total: 4.304ms
4143
 
4144
 
4145
 
 
4149
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4150
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4151
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4152
+ torch_flash_ma 3.61% 231.105us 41.57% 2.662ms 2.662ms 0.000us 0.00% 4.461ms 4.461ms 1
4153
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.411ms 100.25% 4.411ms 4.411ms 1
4154
+ aten::scaled_dot_product_attention 0.40% 25.770us 2.78% 178.013us 59.338us 0.000us 0.00% 3.582ms 1.194ms 3
4155
+ aten::_scaled_dot_product_flash_attention 0.28% 17.960us 2.38% 152.243us 50.748us 0.000us 0.00% 3.582ms 1.194ms 3
4156
+ aten::_flash_attention_forward 0.51% 32.421us 1.73% 110.913us 36.971us 3.582ms 81.42% 3.582ms 1.194ms 3
4157
+ void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.582ms 81.42% 3.582ms 1.194ms 3
4158
+ aten::contiguous 0.14% 9.230us 34.45% 2.206ms 183.815us 0.000us 0.00% 878.374us 73.198us 12
4159
+ aten::clone 0.41% 26.011us 34.30% 2.197ms 183.046us 0.000us 0.00% 878.374us 73.198us 12
4160
+ aten::copy_ 1.29% 82.861us 32.91% 2.107ms 175.603us 817.702us 18.58% 878.374us 73.198us 12
4161
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 817.702us 18.58% 817.702us 68.142us 12
4162
+ Activity Buffer Request 26.91% 1.723ms 26.91% 1.723ms 1.723ms 60.672us 1.38% 60.672us 60.672us 1
4163
+ aten::transpose 0.81% 51.890us 1.10% 70.690us 2.945us 0.000us 0.00% 0.000us 0.000us 24
4164
+ aten::as_strided 0.29% 18.800us 0.29% 18.800us 0.783us 0.000us 0.00% 0.000us 0.000us 24
4165
+ aten::empty_like 0.29% 18.829us 1.29% 82.771us 5.518us 0.000us 0.00% 0.000us 0.000us 15
4166
+ aten::empty 1.23% 78.733us 1.23% 78.733us 3.281us 0.000us 0.00% 0.000us 0.000us 24
4167
+ cudaLaunchKernel 5.08% 325.239us 5.08% 325.239us 21.683us 0.000us 0.00% 0.000us 0.000us 15
4168
+ aten::empty_strided 0.23% 14.690us 0.23% 14.690us 4.897us 0.000us 0.00% 0.000us 0.000us 3
4169
+ cudaDeviceGetAttribute 0.03% 1.808us 0.03% 1.808us 0.301us 0.000us 0.00% 0.000us 0.000us 6
4170
+ cudaFuncSetAttribute 0.06% 3.871us 0.06% 3.871us 1.290us 0.000us 0.00% 0.000us 0.000us 3
4171
+ cudaDeviceSynchronize 58.43% 3.741ms 58.43% 3.741ms 3.741ms 0.000us 0.00% 0.000us 0.000us 1
4172
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4173
+ Self CPU time total: 6.404ms
4174
+ Self CUDA time total: 4.400ms
4175
 
4176
 
4177
  impl wl p50(ms) ok
4178
+ torch_flash_ma cuda_attn_L128_bfloat16 1.20 True
4179
+ torch_flash_ma cuda_attn_L256_bfloat16 1.26 True
4180
+ torch_flash_ma cuda_attn_L320_bfloat16 1.29 True
4181
+ torch_flash_ma cuda_attn_L384_bfloat16 1.32 True
4182
+ torch_flash_ma cuda_attn_L448_bfloat16 1.47 True
4183
+ torch_flash_ma cuda_attn_L512_bfloat16 1.50 True
4184
  </pre></div>
4185
  <div class="cell-artifacts">
4186
  <h4>Artifacts:</h4>
flash_attn/impls/hf_kernels_flash_attn.html CHANGED
@@ -3874,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
3874
  <div class="system-info">
3875
  <div class="system-info-header">Generated on:</div>
3876
  <div class="system-info-content">
3877
- Linux x86_64 | Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35
3878
  </div>
3879
  </div>
3880
 
@@ -3886,9 +3886,9 @@ body[data-tool="eraser"] .main-content {
3886
  <span class="collapse-indicators">
3887
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3888
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3889
- <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3890
  </span> |
3891
- Cell: benchmark | 10.24s
3892
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3894
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3943,21 +3943,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L128_bfloat16
3943
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3944
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3945
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3946
- hf_kernels_flash_attn 3.32% 167.194us 47.96% 2.415ms 2.415ms 0.000us 0.00% 3.817ms 3.817ms 1
3947
- _flash_attn_9e27194::fwd 1.37% 69.029us 44.64% 2.247ms 749.145us 2.847ms 100.00% 3.817ms 1.272ms 3
3948
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.849ms 100.05% 2.849ms 2.849ms 1
3949
- void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.847ms 100.00% 2.847ms 949.099us 3
3950
- Activity Buffer Request 39.70% 1.999ms 39.70% 1.999ms 1.999ms 970.081us 34.07% 970.081us 970.081us 1
3951
- cudaDeviceGetAttribute 0.09% 4.410us 0.09% 4.410us 0.294us 0.000us 0.00% 0.000us 0.000us 15
3952
- aten::empty_like 0.38% 19.301us 1.08% 54.311us 18.104us 0.000us 0.00% 0.000us 0.000us 3
3953
- aten::empty_strided 0.70% 35.010us 0.70% 35.010us 11.670us 0.000us 0.00% 0.000us 0.000us 3
3954
- aten::empty 0.51% 25.771us 0.51% 25.771us 2.863us 0.000us 0.00% 0.000us 0.000us 9
3955
- cudaFuncSetAttribute 1.06% 53.231us 1.06% 53.231us 17.744us 0.000us 0.00% 0.000us 0.000us 3
3956
- cudaLaunchKernel 0.83% 41.840us 0.83% 41.840us 13.947us 0.000us 0.00% 0.000us 0.000us 3
3957
- cudaDeviceSynchronize 52.04% 2.620ms 52.04% 2.620ms 2.620ms 0.000us 0.00% 0.000us 0.000us 1
3958
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3959
- Self CPU time total: 5.035ms
3960
- Self CUDA time total: 2.847ms
3961
 
3962
 
3963
 
@@ -3967,21 +3967,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L256_bfloat16
3967
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3968
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3969
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3970
- hf_kernels_flash_attn 1.71% 88.920us 43.78% 2.280ms 2.280ms 0.000us 0.00% 4.110ms 4.110ms 1
3971
- _flash_attn_9e27194::fwd 0.90% 46.653us 42.07% 2.191ms 730.229us 3.068ms 100.00% 4.110ms 1.370ms 3
3972
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.070ms 100.05% 3.070ms 3.070ms 1
3973
- void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.068ms 100.00% 3.068ms 1.023ms 3
3974
- Activity Buffer Request 39.69% 2.067ms 39.69% 2.067ms 2.067ms 1.041ms 33.93% 1.041ms 1.041ms 1
3975
- cudaDeviceGetAttribute 0.07% 3.649us 0.07% 3.649us 0.243us 0.000us 0.00% 0.000us 0.000us 15
3976
- aten::empty_like 0.14% 7.310us 0.43% 22.581us 7.527us 0.000us 0.00% 0.000us 0.000us 3
3977
- aten::empty_strided 0.29% 15.271us 0.29% 15.271us 5.090us 0.000us 0.00% 0.000us 0.000us 3
3978
- aten::empty 0.41% 21.500us 0.41% 21.500us 2.389us 0.000us 0.00% 0.000us 0.000us 9
3979
- cudaFuncSetAttribute 0.07% 3.620us 0.07% 3.620us 1.207us 0.000us 0.00% 0.000us 0.000us 3
3980
- cudaLaunchKernel 0.50% 25.800us 0.50% 25.800us 8.600us 0.000us 0.00% 0.000us 0.000us 3
3981
- cudaDeviceSynchronize 56.22% 2.927ms 56.22% 2.927ms 2.927ms 0.000us 0.00% 0.000us 0.000us 1
3982
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3983
- Self CPU time total: 5.207ms
3984
- Self CUDA time total: 3.068ms
3985
 
3986
 
3987
 
@@ -3991,21 +3991,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L320_bfloat16
3991
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3992
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3993
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3994
- hf_kernels_flash_attn 1.71% 88.010us 40.24% 2.065ms 2.065ms 0.000us 0.00% 4.290ms 4.290ms 1
3995
- _flash_attn_9e27194::fwd 1.03% 52.730us 38.53% 1.977ms 659.108us 3.209ms 100.00% 4.290ms 1.430ms 3
3996
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.211ms 100.05% 3.211ms 3.211ms 1
3997
- void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.209ms 100.00% 3.209ms 1.070ms 3
3998
- Activity Buffer Request 35.96% 1.846ms 35.96% 1.846ms 1.846ms 1.081ms 33.68% 1.081ms 1.081ms 1
3999
- cudaDeviceGetAttribute 0.07% 3.699us 0.07% 3.699us 0.247us 0.000us 0.00% 0.000us 0.000us 15
4000
- aten::empty_like 0.13% 6.760us 0.45% 22.961us 7.654us 0.000us 0.00% 0.000us 0.000us 3
4001
- aten::empty_strided 0.32% 16.201us 0.32% 16.201us 5.400us 0.000us 0.00% 0.000us 0.000us 3
4002
- aten::empty 0.41% 20.833us 0.41% 20.833us 2.315us 0.000us 0.00% 0.000us 0.000us 9
4003
- cudaFuncSetAttribute 0.07% 3.580us 0.07% 3.580us 1.193us 0.000us 0.00% 0.000us 0.000us 3
4004
- cudaLaunchKernel 0.54% 27.851us 0.54% 27.851us 9.284us 0.000us 0.00% 0.000us 0.000us 3
4005
- cudaDeviceSynchronize 59.76% 3.067ms 59.76% 3.067ms 3.067ms 0.000us 0.00% 0.000us 0.000us 1
4006
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4007
- Self CPU time total: 5.132ms
4008
- Self CUDA time total: 3.209ms
4009
 
4010
 
4011
 
@@ -4015,21 +4015,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L384_bfloat16
4015
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4016
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4017
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4018
- hf_kernels_flash_attn 2.41% 90.762us 19.01% 717.141us 717.141us 0.000us 0.00% 4.279ms 4.279ms 1
4019
- _flash_attn_9e27194::fwd 1.23% 46.533us 16.60% 626.379us 208.793us 3.197ms 100.00% 4.279ms 1.426ms 3
4020
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.199ms 100.05% 3.199ms 3.199ms 1
4021
- void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.197ms 100.00% 3.197ms 1.066ms 3
4022
- Activity Buffer Request 7.66% 288.965us 7.66% 288.965us 288.965us 1.082ms 33.83% 1.082ms 1.082ms 1
4023
- cudaDeviceGetAttribute 0.10% 3.648us 0.10% 3.648us 0.243us 0.000us 0.00% 0.000us 0.000us 15
4024
- aten::empty_like 0.18% 6.920us 0.61% 22.930us 7.643us 0.000us 0.00% 0.000us 0.000us 3
4025
- aten::empty_strided 0.42% 16.010us 0.42% 16.010us 5.337us 0.000us 0.00% 0.000us 0.000us 3
4026
- aten::empty 0.56% 21.260us 0.56% 21.260us 2.362us 0.000us 0.00% 0.000us 0.000us 9
4027
- cudaFuncSetAttribute 0.10% 3.650us 0.10% 3.650us 1.217us 0.000us 0.00% 0.000us 0.000us 3
4028
- cudaLaunchKernel 6.35% 239.393us 6.35% 239.393us 79.798us 0.000us 0.00% 0.000us 0.000us 3
4029
- cudaDeviceSynchronize 80.99% 3.055ms 80.99% 3.055ms 3.055ms 0.000us 0.00% 0.000us 0.000us 1
4030
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4031
- Self CPU time total: 3.772ms
4032
- Self CUDA time total: 3.197ms
4033
 
4034
 
4035
 
@@ -4039,21 +4039,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L448_bfloat16
4039
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4040
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4041
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4042
- hf_kernels_flash_attn 1.57% 90.561us 37.72% 2.178ms 2.178ms 0.000us 0.00% 4.999ms 4.999ms 1
4043
- _flash_attn_9e27194::fwd 0.83% 48.040us 36.16% 2.087ms 695.661us 3.741ms 100.00% 4.999ms 1.666ms 3
4044
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.743ms 100.05% 3.743ms 3.743ms 1
4045
- void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.741ms 100.00% 3.741ms 1.247ms 3
4046
- Activity Buffer Request 30.45% 1.758ms 30.45% 1.758ms 1.758ms 1.258ms 33.63% 1.258ms 1.258ms 1
4047
- cudaDeviceGetAttribute 0.06% 3.722us 0.06% 3.722us 0.248us 0.000us 0.00% 0.000us 0.000us 15
4048
- aten::empty_like 0.14% 7.831us 0.41% 23.771us 7.924us 0.000us 0.00% 0.000us 0.000us 3
4049
- aten::empty_strided 0.28% 15.940us 0.28% 15.940us 5.313us 0.000us 0.00% 0.000us 0.000us 3
4050
- aten::empty 0.36% 20.578us 0.36% 20.578us 2.286us 0.000us 0.00% 0.000us 0.000us 9
4051
- cudaFuncSetAttribute 0.06% 3.590us 0.06% 3.590us 1.197us 0.000us 0.00% 0.000us 0.000us 3
4052
- cudaLaunchKernel 3.98% 229.604us 3.98% 229.604us 76.535us 0.000us 0.00% 0.000us 0.000us 3
4053
- cudaDeviceSynchronize 62.28% 3.595ms 62.28% 3.595ms 3.595ms 0.000us 0.00% 0.000us 0.000us 1
4054
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4055
- Self CPU time total: 5.772ms
4056
- Self CUDA time total: 3.741ms
4057
 
4058
 
4059
 
@@ -4063,40 +4063,37 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L512_bfloat16
4063
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4064
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4065
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4066
- hf_kernels_flash_attn 2.13% 89.030us 15.70% 656.370us 656.370us 0.000us 0.00% 4.900ms 4.900ms 1
4067
- _flash_attn_9e27194::fwd 1.15% 48.015us 13.57% 567.340us 189.113us 3.667ms 100.00% 4.900ms 1.633ms 3
4068
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.669ms 100.04% 3.669ms 3.669ms 1
4069
- void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.667ms 100.00% 3.667ms 1.222ms 3
4070
- Activity Buffer Request 5.94% 248.154us 5.94% 248.154us 248.154us 1.233ms 33.62% 1.233ms 1.233ms 1
4071
- cudaDeviceGetAttribute 0.08% 3.539us 0.08% 3.539us 0.236us 0.000us 0.00% 0.000us 0.000us 15
4072
- aten::empty_like 0.16% 6.860us 0.56% 23.209us 7.736us 0.000us 0.00% 0.000us 0.000us 3
4073
- aten::empty_strided 0.39% 16.349us 0.39% 16.349us 5.450us 0.000us 0.00% 0.000us 0.000us 3
4074
- aten::empty 0.49% 20.571us 0.49% 20.571us 2.286us 0.000us 0.00% 0.000us 0.000us 9
4075
- cudaFuncSetAttribute 0.09% 3.630us 0.09% 3.630us 1.210us 0.000us 0.00% 0.000us 0.000us 3
4076
- cudaLaunchKernel 5.27% 220.222us 5.27% 220.222us 73.407us 0.000us 0.00% 0.000us 0.000us 3
4077
- cudaDeviceSynchronize 84.30% 3.524ms 84.30% 3.524ms 3.524ms 0.000us 0.00% 0.000us 0.000us 1
4078
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4079
- Self CPU time total: 4.180ms
4080
- Self CUDA time total: 3.667ms
4081
 
4082
 
4083
  impl wl p50(ms) ok
4084
- hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.99 True
4085
- hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.04 True
4086
- hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.07 True
4087
- hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.08 True
4088
- hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.26 True
4089
- hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.25 True
4090
  </pre></div>
4091
- <div class="uv-install-logs" id="uv-logs-benchmark">
4092
- <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4093
- <div class="uv-logs-content" style="display: none;">
4094
- Installed 52 packages in 274ms
 
4095
  </div>
4096
- </div>
4097
- <div class="cell-stderr">Fetching 20 files: 0%| | 0/20 [00:00&lt;?, ?it/s]
4098
- Fetching 20 files: 10%|█ | 2/20 [00:01&lt;00:17, 1.01it/s]
4099
- Fetching 20 files: 100%|██████████| 20/20 [00:01&lt;00:00, 10.06it/s]</div>
4100
  <div class="cell-artifacts">
4101
  <h4>Artifacts:</h4>
4102
  <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
 
3874
  <div class="system-info">
3875
  <div class="system-info-header">Generated on:</div>
3876
  <div class="system-info-content">
3877
+ Linux x86_64 | Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35
3878
  </div>
3879
  </div>
3880
 
 
3886
  <span class="collapse-indicators">
3887
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3888
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3889
+ <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3890
  </span> |
3891
+ Cell: benchmark | 6.12s
3892
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3894
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3943
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3944
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3945
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3946
+ hf_kernels_flash_attn 3.19% 147.591us 44.59% 2.062ms 2.062ms 0.000us 0.00% 3.719ms 3.719ms 1
3947
+ _flash_attn_9e27194::fwd 1.32% 60.849us 41.40% 1.914ms 638.151us 2.771ms 100.00% 3.719ms 1.240ms 3
3948
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.773ms 100.06% 2.773ms 2.773ms 1
3949
+ void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.771ms 100.00% 2.771ms 923.713us 3
3950
+ Activity Buffer Request 37.16% 1.718ms 37.16% 1.718ms 1.718ms 947.777us 34.20% 947.777us 947.777us 1
3951
+ cudaDeviceGetAttribute 0.09% 4.211us 0.09% 4.211us 0.281us 0.000us 0.00% 0.000us 0.000us 15
3952
+ aten::empty_like 0.37% 16.891us 1.10% 50.702us 16.901us 0.000us 0.00% 0.000us 0.000us 3
3953
+ aten::empty_strided 0.73% 33.811us 0.73% 33.811us 11.270us 0.000us 0.00% 0.000us 0.000us 3
3954
+ aten::empty 0.54% 24.922us 0.54% 24.922us 2.769us 0.000us 0.00% 0.000us 0.000us 9
3955
+ cudaFuncSetAttribute 0.27% 12.349us 0.27% 12.349us 4.116us 0.000us 0.00% 0.000us 0.000us 3
3956
+ cudaLaunchKernel 0.93% 42.971us 0.93% 42.971us 14.324us 0.000us 0.00% 0.000us 0.000us 3
3957
+ cudaDeviceSynchronize 55.41% 2.563ms 55.41% 2.563ms 2.563ms 0.000us 0.00% 0.000us 0.000us 1
3958
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3959
+ Self CPU time total: 4.625ms
3960
+ Self CUDA time total: 2.771ms
3961
 
3962
 
3963
 
 
3967
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3968
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3969
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3970
+ hf_kernels_flash_attn 1.95% 91.420us 40.89% 1.916ms 1.916ms 0.000us 0.00% 3.901ms 3.901ms 1
3971
+ _flash_attn_9e27194::fwd 0.98% 45.792us 38.94% 1.825ms 608.181us 2.914ms 100.00% 3.901ms 1.300ms 3
3972
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.916ms 100.05% 2.916ms 2.916ms 1
3973
+ void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.914ms 100.00% 2.914ms 971.481us 3
3974
+ Activity Buffer Request 36.29% 1.700ms 36.29% 1.700ms 1.700ms 986.884us 33.86% 986.884us 986.884us 1
3975
+ cudaDeviceGetAttribute 0.07% 3.500us 0.07% 3.500us 0.233us 0.000us 0.00% 0.000us 0.000us 15
3976
+ aten::empty_like 0.15% 6.960us 0.52% 24.320us 8.107us 0.000us 0.00% 0.000us 0.000us 3
3977
+ aten::empty_strided 0.37% 17.360us 0.37% 17.360us 5.787us 0.000us 0.00% 0.000us 0.000us 3
3978
+ aten::empty 0.45% 21.021us 0.45% 21.021us 2.336us 0.000us 0.00% 0.000us 0.000us 9
3979
+ cudaFuncSetAttribute 0.08% 3.519us 0.08% 3.519us 1.173us 0.000us 0.00% 0.000us 0.000us 3
3980
+ cudaLaunchKernel 0.55% 25.931us 0.55% 25.931us 8.644us 0.000us 0.00% 0.000us 0.000us 3
3981
+ cudaDeviceSynchronize 59.11% 2.770ms 59.11% 2.770ms 2.770ms 0.000us 0.00% 0.000us 0.000us 1
3982
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3983
+ Self CPU time total: 4.686ms
3984
+ Self CUDA time total: 2.914ms
3985
 
3986
 
3987
 
 
3991
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3992
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3993
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3994
+ hf_kernels_flash_attn 2.13% 103.462us 40.42% 1.967ms 1.967ms 0.000us 0.00% 4.069ms 4.069ms 1
3995
+ _flash_attn_9e27194::fwd 0.94% 45.522us 38.30% 1.863ms 621.134us 3.040ms 100.00% 4.069ms 1.356ms 3
3996
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.041ms 100.05% 3.041ms 3.041ms 1
3997
+ void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.040ms 100.00% 3.040ms 1.013ms 3
3998
+ Activity Buffer Request 35.70% 1.737ms 35.70% 1.737ms 1.737ms 1.029ms 33.84% 1.029ms 1.029ms 1
3999
+ cudaDeviceGetAttribute 0.07% 3.488us 0.07% 3.488us 0.233us 0.000us 0.00% 0.000us 0.000us 15
4000
+ aten::empty_like 0.13% 6.550us 0.49% 24.010us 8.003us 0.000us 0.00% 0.000us 0.000us 3
4001
+ aten::empty_strided 0.36% 17.460us 0.36% 17.460us 5.820us 0.000us 0.00% 0.000us 0.000us 3
4002
+ aten::empty 0.47% 22.651us 0.47% 22.651us 2.517us 0.000us 0.00% 0.000us 0.000us 9
4003
+ cudaFuncSetAttribute 0.07% 3.621us 0.07% 3.621us 1.207us 0.000us 0.00% 0.000us 0.000us 3
4004
+ cudaLaunchKernel 0.55% 26.960us 0.55% 26.960us 8.987us 0.000us 0.00% 0.000us 0.000us 3
4005
+ cudaDeviceSynchronize 59.58% 2.899ms 59.58% 2.899ms 2.899ms 0.000us 0.00% 0.000us 0.000us 1
4006
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4007
+ Self CPU time total: 4.866ms
4008
+ Self CUDA time total: 3.040ms
4009
 
4010
 
4011
 
 
4015
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4016
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4017
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4018
+ hf_kernels_flash_attn 2.03% 100.371us 41.00% 2.032ms 2.032ms 0.000us 0.00% 4.098ms 4.098ms 1
4019
+ _flash_attn_9e27194::fwd 0.92% 45.401us 38.98% 1.931ms 643.821us 3.066ms 100.00% 4.098ms 1.366ms 3
4020
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.068ms 100.05% 3.068ms 3.068ms 1
4021
+ void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.066ms 100.00% 3.066ms 1.022ms 3
4022
+ Activity Buffer Request 32.94% 1.632ms 32.94% 1.632ms 1.632ms 1.032ms 33.68% 1.032ms 1.032ms 1
4023
+ cudaDeviceGetAttribute 0.07% 3.502us 0.07% 3.502us 0.233us 0.000us 0.00% 0.000us 0.000us 15
4024
+ aten::empty_like 0.14% 6.780us 0.47% 23.270us 7.757us 0.000us 0.00% 0.000us 0.000us 3
4025
+ aten::empty_strided 0.33% 16.490us 0.33% 16.490us 5.497us 0.000us 0.00% 0.000us 0.000us 3
4026
+ aten::empty 0.45% 22.299us 0.45% 22.299us 2.478us 0.000us 0.00% 0.000us 0.000us 9
4027
+ cudaFuncSetAttribute 0.09% 4.220us 0.09% 4.220us 1.407us 0.000us 0.00% 0.000us 0.000us 3
4028
+ cudaLaunchKernel 4.04% 200.304us 4.04% 200.304us 66.768us 0.000us 0.00% 0.000us 0.000us 3
4029
+ cudaDeviceSynchronize 59.00% 2.924ms 59.00% 2.924ms 2.924ms 0.000us 0.00% 0.000us 0.000us 1
4030
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4031
+ Self CPU time total: 4.956ms
4032
+ Self CUDA time total: 3.066ms
4033
 
4034
 
4035
 
 
4039
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4040
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4041
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4042
+ hf_kernels_flash_attn 2.01% 110.531us 38.27% 2.104ms 2.104ms 0.000us 0.00% 4.721ms 4.721ms 1
4043
+ _flash_attn_9e27194::fwd 0.85% 46.845us 36.26% 1.993ms 664.435us 3.536ms 100.00% 4.721ms 1.574ms 3
4044
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.537ms 100.04% 3.537ms 3.537ms 1
4045
+ void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.536ms 100.00% 3.536ms 1.179ms 3
4046
+ Activity Buffer Request 31.52% 1.733ms 31.52% 1.733ms 1.733ms 1.186ms 33.53% 1.186ms 1.186ms 1
4047
+ cudaDeviceGetAttribute 0.07% 3.850us 0.07% 3.850us 0.257us 0.000us 0.00% 0.000us 0.000us 15
4048
+ aten::empty_like 0.13% 7.081us 0.42% 23.120us 7.707us 0.000us 0.00% 0.000us 0.000us 3
4049
+ aten::empty_strided 0.29% 16.039us 0.29% 16.039us 5.346us 0.000us 0.00% 0.000us 0.000us 3
4050
+ aten::empty 0.38% 21.099us 0.38% 21.099us 2.344us 0.000us 0.00% 0.000us 0.000us 9
4051
+ cudaFuncSetAttribute 0.07% 3.738us 0.07% 3.738us 1.246us 0.000us 0.00% 0.000us 0.000us 3
4052
+ cudaLaunchKernel 2.95% 161.933us 2.95% 161.933us 53.978us 0.000us 0.00% 0.000us 0.000us 3
4053
+ cudaDeviceSynchronize 61.73% 3.393ms 61.73% 3.393ms 3.393ms 0.000us 0.00% 0.000us 0.000us 1
4054
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4055
+ Self CPU time total: 5.497ms
4056
+ Self CUDA time total: 3.536ms
4057
 
4058
 
4059
 
 
4063
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4064
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4065
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4066
+ hf_kernels_flash_attn 1.92% 105.962us 36.83% 2.036ms 2.036ms 0.000us 0.00% 4.864ms 4.864ms 1
4067
+ _flash_attn_9e27194::fwd 0.86% 47.350us 34.91% 1.930ms 643.481us 3.642ms 100.00% 4.864ms 1.621ms 3
4068
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.643ms 100.04% 3.643ms 3.643ms 1
4069
+ void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.642ms 100.00% 3.642ms 1.214ms 3
4070
+ Activity Buffer Request 30.16% 1.668ms 30.16% 1.668ms 1.668ms 1.222ms 33.55% 1.222ms 1.222ms 1
4071
+ cudaDeviceGetAttribute 0.06% 3.551us 0.06% 3.551us 0.237us 0.000us 0.00% 0.000us 0.000us 15
4072
+ aten::empty_like 0.12% 6.900us 0.42% 23.180us 7.727us 0.000us 0.00% 0.000us 0.000us 3
4073
+ aten::empty_strided 0.29% 16.280us 0.29% 16.280us 5.427us 0.000us 0.00% 0.000us 0.000us 3
4074
+ aten::empty 0.40% 21.939us 0.40% 21.939us 2.438us 0.000us 0.00% 0.000us 0.000us 9
4075
+ cudaFuncSetAttribute 0.07% 3.861us 0.07% 3.861us 1.287us 0.000us 0.00% 0.000us 0.000us 3
4076
+ cudaLaunchKernel 2.95% 163.043us 2.95% 163.043us 54.348us 0.000us 0.00% 0.000us 0.000us 3
4077
+ cudaDeviceSynchronize 63.17% 3.493ms 63.17% 3.493ms 3.493ms 0.000us 0.00% 0.000us 0.000us 1
4078
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4079
+ Self CPU time total: 5.529ms
4080
+ Self CUDA time total: 3.642ms
4081
 
4082
 
4083
  impl wl p50(ms) ok
4084
+ hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.96 True
4085
+ hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.02 True
4086
+ hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.05 True
4087
+ hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.06 True
4088
+ hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.21 True
4089
+ hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.22 True
4090
  </pre></div>
4091
+ <div class="cell-stderr">
4092
+ Fetching 20 files: 0%| | 0/20 [00:00&lt;?, ?it/s]
4093
+ Fetching 20 files: 5%|▌ | 1/20 [00:00&lt;00:03, 6.04it/s]
4094
+ Fetching 20 files: 10%|█ | 2/20 [00:01&lt;00:20, 1.14s/it]
4095
+ Fetching 20 files: 100%|██████████| 20/20 [00:01&lt;00:00, 10.05it/s]
4096
  </div>
 
 
 
 
4097
  <div class="cell-artifacts">
4098
  <h4>Artifacts:</h4>
4099
  <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
flash_attn/impls/hf_kernels_flash_attn3.html CHANGED
@@ -3874,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
3874
  <div class="system-info">
3875
  <div class="system-info-header">Generated on:</div>
3876
  <div class="system-info-content">
3877
- Linux x86_64 | Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35
3878
  </div>
3879
  </div>
3880
 
@@ -3886,9 +3886,9 @@ body[data-tool="eraser"] .main-content {
3886
  <span class="collapse-indicators">
3887
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3888
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3889
- <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3890
  </span> |
3891
- Cell: benchmark | 5.83s
3892
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3894
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3942,19 +3942,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L128_bfloat16
3942
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3943
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3944
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3945
- hf_kernels_flash_attn3 3.53% 162.212us 48.20% 2.217ms 2.217ms 0.000us 0.00% 3.575ms 3.575ms 1
3946
- FlashAttnFunc 2.60% 119.532us 44.67% 2.055ms 684.947us 0.000us 0.00% 3.575ms 1.192ms 3
3947
- _flash_attn3_48fe103_dirty::fwd 1.56% 71.632us 42.08% 1.935ms 645.103us 2.671ms 100.00% 3.575ms 1.192ms 3
3948
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.672ms 100.06% 2.672ms 2.672ms 1
3949
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.671ms 100.00% 2.671ms 890.241us 3
3950
- Activity Buffer Request 38.25% 1.759ms 38.25% 1.759ms 1.759ms 904.001us 33.85% 904.001us 904.001us 1
3951
- aten::empty 0.93% 42.731us 0.93% 42.731us 7.122us 0.000us 0.00% 0.000us 0.000us 6
3952
- cudaFuncSetAttribute 0.32% 14.640us 0.32% 14.640us 4.880us 0.000us 0.00% 0.000us 0.000us 3
3953
- cudaLaunchKernel 1.03% 47.150us 1.03% 47.150us 15.717us 0.000us 0.00% 0.000us 0.000us 3
3954
- cudaDeviceSynchronize 51.80% 2.383ms 51.80% 2.383ms 2.383ms 0.000us 0.00% 0.000us 0.000us 1
3955
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3956
- Self CPU time total: 4.600ms
3957
- Self CUDA time total: 2.671ms
3958
 
3959
 
3960
 
@@ -3964,19 +3964,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L256_bfloat16
3964
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3965
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3966
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3967
- hf_kernels_flash_attn3 2.14% 101.412us 45.76% 2.172ms 2.172ms 0.000us 0.00% 3.747ms 3.747ms 1
3968
- FlashAttnFunc 1.91% 90.691us 43.62% 2.071ms 690.247us 0.000us 0.00% 3.747ms 1.249ms 3
3969
- _flash_attn3_48fe103_dirty::fwd 1.11% 52.911us 41.71% 1.980ms 660.016us 2.794ms 100.00% 3.747ms 1.249ms 3
3970
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.796ms 100.06% 2.796ms 2.796ms 1
3971
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.794ms 100.00% 2.794ms 931.376us 3
3972
- Activity Buffer Request 39.32% 1.866ms 39.32% 1.866ms 1.866ms 953.126us 34.11% 953.126us 953.126us 1
3973
- aten::empty 0.55% 26.341us 0.55% 26.341us 4.390us 0.000us 0.00% 0.000us 0.000us 6
3974
- cudaFuncSetAttribute 0.11% 5.160us 0.11% 5.160us 1.720us 0.000us 0.00% 0.000us 0.000us 3
3975
- cudaLaunchKernel 0.62% 29.260us 0.62% 29.260us 9.753us 0.000us 0.00% 0.000us 0.000us 3
3976
- cudaDeviceSynchronize 54.24% 2.575ms 54.24% 2.575ms 2.575ms 0.000us 0.00% 0.000us 0.000us 1
3977
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3978
- Self CPU time total: 4.747ms
3979
- Self CUDA time total: 2.794ms
3980
 
3981
 
3982
 
@@ -3986,19 +3986,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L320_bfloat16
3986
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3987
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3988
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3989
- hf_kernels_flash_attn3 2.17% 102.652us 42.70% 2.019ms 2.019ms 0.000us 0.00% 3.920ms 3.920ms 1
3990
- FlashAttnFunc 1.91% 90.472us 40.53% 1.916ms 638.683us 0.000us 0.00% 3.920ms 1.307ms 3
3991
- _flash_attn3_48fe103_dirty::fwd 0.99% 47.030us 38.62% 1.826ms 608.525us 2.928ms 100.00% 3.920ms 1.307ms 3
3992
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.930ms 100.05% 2.930ms 2.930ms 1
3993
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.928ms 100.00% 2.928ms 976.037us 3
3994
- Activity Buffer Request 36.27% 1.715ms 36.27% 1.715ms 1.715ms 991.995us 33.88% 991.995us 991.995us 1
3995
- aten::empty 0.57% 26.980us 0.57% 26.980us 4.497us 0.000us 0.00% 0.000us 0.000us 6
3996
- cudaFuncSetAttribute 0.11% 4.990us 0.11% 4.990us 1.663us 0.000us 0.00% 0.000us 0.000us 3
3997
- cudaLaunchKernel 0.68% 32.070us 0.68% 32.070us 10.690us 0.000us 0.00% 0.000us 0.000us 3
3998
- cudaDeviceSynchronize 57.30% 2.709ms 57.30% 2.709ms 2.709ms 0.000us 0.00% 0.000us 0.000us 1
3999
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4000
- Self CPU time total: 4.728ms
4001
- Self CUDA time total: 2.928ms
4002
 
4003
 
4004
 
@@ -4008,19 +4008,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L384_bfloat16
4008
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4009
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4010
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4011
- hf_kernels_flash_attn3 2.33% 117.613us 45.39% 2.290ms 2.290ms 0.000us 0.00% 3.984ms 3.984ms 1
4012
- FlashAttnFunc 1.82% 91.609us 43.06% 2.172ms 724.120us 0.000us 0.00% 3.984ms 1.328ms 3
4013
- _flash_attn3_48fe103_dirty::fwd 0.95% 47.941us 41.24% 2.081ms 693.584us 2.967ms 100.00% 3.984ms 1.328ms 3
4014
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.968ms 100.05% 2.968ms 2.968ms 1
4015
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.967ms 100.00% 2.967ms 988.843us 3
4016
- Activity Buffer Request 35.42% 1.787ms 35.42% 1.787ms 1.787ms 1.017ms 34.30% 1.017ms 1.017ms 1
4017
- aten::empty 0.56% 28.180us 0.56% 28.180us 4.697us 0.000us 0.00% 0.000us 0.000us 6
4018
- cudaFuncSetAttribute 0.10% 5.080us 0.10% 5.080us 1.693us 0.000us 0.00% 0.000us 0.000us 3
4019
- cudaLaunchKernel 4.21% 212.544us 4.21% 212.544us 70.848us 0.000us 0.00% 0.000us 0.000us 3
4020
- cudaDeviceSynchronize 54.61% 2.755ms 54.61% 2.755ms 2.755ms 0.000us 0.00% 0.000us 0.000us 1
4021
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4022
- Self CPU time total: 5.045ms
4023
- Self CUDA time total: 2.967ms
4024
 
4025
 
4026
 
@@ -4030,19 +4030,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L448_bfloat16
4030
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4031
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4032
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4033
- hf_kernels_flash_attn3 2.35% 128.980us 39.64% 2.179ms 2.179ms 0.000us 0.00% 4.722ms 4.722ms 1
4034
- FlashAttnFunc 1.64% 90.214us 37.30% 2.050ms 683.484us 0.000us 0.00% 4.722ms 1.574ms 3
4035
- _flash_attn3_48fe103_dirty::fwd 0.87% 47.980us 35.66% 1.960ms 653.413us 3.530ms 100.00% 4.722ms 1.574ms 3
4036
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.532ms 100.04% 3.532ms 3.532ms 1
4037
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.530ms 100.00% 3.530ms 1.177ms 3
4038
- Activity Buffer Request 31.21% 1.716ms 31.21% 1.716ms 1.716ms 1.192ms 33.75% 1.192ms 1.192ms 1
4039
- aten::empty 0.49% 26.830us 0.49% 26.830us 4.472us 0.000us 0.00% 0.000us 0.000us 6
4040
- cudaFuncSetAttribute 0.09% 5.100us 0.09% 5.100us 1.700us 0.000us 0.00% 0.000us 0.000us 3
4041
- cudaLaunchKernel 2.99% 164.492us 2.99% 164.492us 54.831us 0.000us 0.00% 0.000us 0.000us 3
4042
- cudaDeviceSynchronize 60.36% 3.318ms 60.36% 3.318ms 3.318ms 0.000us 0.00% 0.000us 0.000us 1
4043
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4044
- Self CPU time total: 5.497ms
4045
- Self CUDA time total: 3.530ms
4046
 
4047
 
4048
 
@@ -4052,34 +4052,38 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L512_bfloat16
4052
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4053
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4054
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4055
- hf_kernels_flash_attn3 2.16% 118.543us 39.14% 2.150ms 2.150ms 0.000us 0.00% 4.736ms 4.736ms 1
4056
- FlashAttnFunc 1.66% 91.361us 36.98% 2.032ms 677.186us 0.000us 0.00% 4.736ms 1.579ms 3
4057
- _flash_attn3_48fe103_dirty::fwd 0.85% 46.593us 35.32% 1.940ms 646.733us 3.555ms 100.00% 4.736ms 1.579ms 3
4058
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.556ms 100.04% 3.556ms 3.556ms 1
4059
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.555ms 100.00% 3.555ms 1.185ms 3
4060
- Activity Buffer Request 30.64% 1.683ms 30.64% 1.683ms 1.683ms 1.181ms 33.22% 1.181ms 1.181ms 1
4061
- aten::empty 0.50% 27.560us 0.50% 27.560us 4.593us 0.000us 0.00% 0.000us 0.000us 6
4062
- cudaFuncSetAttribute 0.09% 5.069us 0.09% 5.069us 1.690us 0.000us 0.00% 0.000us 0.000us 3
4063
- cudaLaunchKernel 3.23% 177.672us 3.23% 177.672us 59.224us 0.000us 0.00% 0.000us 0.000us 3
4064
- cudaDeviceSynchronize 60.86% 3.344ms 60.86% 3.344ms 3.344ms 0.000us 0.00% 0.000us 0.000us 1
4065
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4066
- Self CPU time total: 5.494ms
4067
- Self CUDA time total: 3.555ms
4068
 
4069
 
4070
  impl wl p50(ms) ok
4071
- hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.94 True
4072
- hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.98 True
4073
- hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.05 True
4074
- hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.04 True
4075
- hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.22 True
4076
- hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.21 True
4077
  </pre></div>
4078
- <div class="cell-stderr">
4079
- Fetching 4 files: 0%| | 0/4 [00:00&lt;?, ?it/s]
4080
- Fetching 4 files: 50%|█████ | 2/4 [00:01&lt;00:01, 1.28it/s]
4081
- Fetching 4 files: 100%|██████████| 4/4 [00:01&lt;00:00, 2.57it/s]
4082
  </div>
 
 
 
 
4083
  <div class="cell-artifacts">
4084
  <h4>Artifacts:</h4>
4085
  <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
 
3874
  <div class="system-info">
3875
  <div class="system-info-header">Generated on:</div>
3876
  <div class="system-info-content">
3877
+ Linux x86_64 | Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35
3878
  </div>
3879
  </div>
3880
 
 
3886
  <span class="collapse-indicators">
3887
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3888
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3889
+ <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3890
  </span> |
3891
+ Cell: benchmark | 6.42s
3892
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3894
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3942
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3943
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3944
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3945
+ hf_kernels_flash_attn3 3.60% 164.063us 47.53% 2.169ms 2.169ms 0.000us 0.00% 3.577ms 3.577ms 1
3946
+ FlashAttnFunc 2.65% 121.151us 43.94% 2.005ms 668.341us 0.000us 0.00% 3.577ms 1.192ms 3
3947
+ _flash_attn3_1d39a44::fwd 1.62% 73.763us 41.28% 1.884ms 627.958us 2.686ms 100.00% 3.577ms 1.192ms 3
3948
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.688ms 100.07% 2.688ms 2.688ms 1
3949
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.686ms 100.00% 2.686ms 895.374us 3
3950
+ Activity Buffer Request 37.38% 1.706ms 37.38% 1.706ms 1.706ms 891.299us 33.18% 891.299us 891.299us 1
3951
+ aten::empty 0.94% 42.930us 0.94% 42.930us 7.155us 0.000us 0.00% 0.000us 0.000us 6
3952
+ cudaFuncSetAttribute 0.33% 14.999us 0.33% 14.999us 5.000us 0.000us 0.00% 0.000us 0.000us 3
3953
+ cudaLaunchKernel 1.02% 46.432us 1.02% 46.432us 15.477us 0.000us 0.00% 0.000us 0.000us 3
3954
+ cudaDeviceSynchronize 52.47% 2.394ms 52.47% 2.394ms 2.394ms 0.000us 0.00% 0.000us 0.000us 1
3955
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3956
+ Self CPU time total: 4.563ms
3957
+ Self CUDA time total: 2.686ms
3958
 
3959
 
3960
 
 
3964
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3965
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3966
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3967
+ hf_kernels_flash_attn3 2.68% 123.103us 45.27% 2.082ms 2.082ms 0.000us 0.00% 3.670ms 3.670ms 1
3968
+ FlashAttnFunc 2.03% 93.300us 42.60% 1.959ms 653.024us 0.000us 0.00% 3.670ms 1.223ms 3
3969
+ _flash_attn3_1d39a44::fwd 1.05% 48.412us 40.57% 1.866ms 621.924us 2.738ms 100.00% 3.670ms 1.223ms 3
3970
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.739ms 100.06% 2.739ms 2.739ms 1
3971
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.738ms 100.00% 2.738ms 912.629us 3
3972
+ Activity Buffer Request 38.14% 1.754ms 38.14% 1.754ms 1.754ms 932.416us 34.06% 932.416us 932.416us 1
3973
+ aten::empty 0.59% 27.041us 0.59% 27.041us 4.507us 0.000us 0.00% 0.000us 0.000us 6
3974
+ cudaFuncSetAttribute 0.14% 6.480us 0.14% 6.480us 2.160us 0.000us 0.00% 0.000us 0.000us 3
3975
+ cudaLaunchKernel 0.64% 29.621us 0.64% 29.621us 9.874us 0.000us 0.00% 0.000us 0.000us 3
3976
+ cudaDeviceSynchronize 54.73% 2.517ms 54.73% 2.517ms 2.517ms 0.000us 0.00% 0.000us 0.000us 1
3977
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3978
+ Self CPU time total: 4.599ms
3979
+ Self CUDA time total: 2.738ms
3980
 
3981
 
3982
 
 
3986
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3987
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3988
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3989
+ hf_kernels_flash_attn3 2.66% 126.472us 43.74% 2.079ms 2.079ms 0.000us 0.00% 3.863ms 3.863ms 1
3990
+ FlashAttnFunc 1.87% 89.050us 41.07% 1.952ms 650.694us 0.000us 0.00% 3.863ms 1.288ms 3
3991
+ _flash_attn3_1d39a44::fwd 1.00% 47.600us 39.20% 1.863ms 621.011us 2.883ms 100.00% 3.863ms 1.288ms 3
3992
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.885ms 100.05% 2.885ms 2.885ms 1
3993
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.883ms 100.00% 2.883ms 961.034us 3
3994
+ Activity Buffer Request 36.94% 1.756ms 36.94% 1.756ms 1.756ms 979.903us 33.99% 979.903us 979.903us 1
3995
+ aten::empty 0.53% 25.081us 0.53% 25.081us 4.180us 0.000us 0.00% 0.000us 0.000us 6
3996
+ cudaFuncSetAttribute 0.11% 5.050us 0.11% 5.050us 1.683us 0.000us 0.00% 0.000us 0.000us 3
3997
+ cudaLaunchKernel 0.62% 29.612us 0.62% 29.612us 9.871us 0.000us 0.00% 0.000us 0.000us 3
3998
+ cudaDeviceSynchronize 56.26% 2.674ms 56.26% 2.674ms 2.674ms 0.000us 0.00% 0.000us 0.000us 1
3999
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4000
+ Self CPU time total: 4.753ms
4001
+ Self CUDA time total: 2.883ms
4002
 
4003
 
4004
 
 
4008
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4009
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4010
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4011
+ hf_kernels_flash_attn3 2.48% 119.623us 44.91% 2.170ms 2.170ms 0.000us 0.00% 3.846ms 3.846ms 1
4012
+ FlashAttnFunc 1.87% 90.201us 42.43% 2.050ms 683.325us 0.000us 0.00% 3.846ms 1.282ms 3
4013
+ _flash_attn3_1d39a44::fwd 0.98% 47.571us 40.56% 1.960ms 653.258us 2.874ms 100.00% 3.846ms 1.282ms 3
4014
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.876ms 100.05% 2.876ms 2.876ms 1
4015
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.874ms 100.00% 2.874ms 957.983us 3
4016
+ Activity Buffer Request 34.13% 1.649ms 34.13% 1.649ms 1.649ms 972.223us 33.83% 972.223us 972.223us 1
4017
+ aten::empty 0.55% 26.410us 0.55% 26.410us 4.402us 0.000us 0.00% 0.000us 0.000us 6
4018
+ cudaFuncSetAttribute 0.11% 5.420us 0.11% 5.420us 1.807us 0.000us 0.00% 0.000us 0.000us 3
4019
+ cudaLaunchKernel 4.79% 231.213us 4.79% 231.213us 77.071us 0.000us 0.00% 0.000us 0.000us 3
4020
+ cudaDeviceSynchronize 55.09% 2.662ms 55.09% 2.662ms 2.662ms 0.000us 0.00% 0.000us 0.000us 1
4021
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4022
+ Self CPU time total: 4.831ms
4023
+ Self CUDA time total: 2.874ms
4024
 
4025
 
4026
 
 
4030
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4031
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4032
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4033
+ hf_kernels_flash_attn3 2.24% 122.153us 41.67% 2.277ms 2.277ms 0.000us 0.00% 4.541ms 4.541ms 1
4034
+ FlashAttnFunc 1.69% 92.610us 39.43% 2.155ms 718.395us 0.000us 0.00% 4.541ms 1.514ms 3
4035
+ _flash_attn3_1d39a44::fwd 0.86% 47.089us 37.74% 2.063ms 687.525us 3.403ms 100.00% 4.541ms 1.514ms 3
4036
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.404ms 100.05% 3.404ms 3.404ms 1
4037
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.403ms 100.00% 3.403ms 1.134ms 3
4038
+ Activity Buffer Request 32.14% 1.757ms 32.14% 1.757ms 1.757ms 1.138ms 33.45% 1.138ms 1.138ms 1
4039
+ aten::empty 0.49% 26.951us 0.49% 26.951us 4.492us 0.000us 0.00% 0.000us 0.000us 6
4040
+ cudaFuncSetAttribute 0.09% 4.812us 0.09% 4.812us 1.604us 0.000us 0.00% 0.000us 0.000us 3
4041
+ cudaLaunchKernel 4.15% 227.044us 4.15% 227.044us 75.681us 0.000us 0.00% 0.000us 0.000us 3
4042
+ cudaDeviceSynchronize 58.33% 3.188ms 58.33% 3.188ms 3.188ms 0.000us 0.00% 0.000us 0.000us 1
4043
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4044
+ Self CPU time total: 5.465ms
4045
+ Self CUDA time total: 3.403ms
4046
 
4047
 
4048
 
 
4052
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4053
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4054
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4055
+ hf_kernels_flash_attn3 2.06% 111.143us 40.98% 2.214ms 2.214ms 0.000us 0.00% 4.541ms 4.541ms 1
4056
+ FlashAttnFunc 1.64% 88.581us 38.92% 2.103ms 700.975us 0.000us 0.00% 4.541ms 1.514ms 3
4057
+ _flash_attn3_1d39a44::fwd 0.89% 48.319us 37.28% 2.014ms 671.448us 3.401ms 100.00% 4.541ms 1.514ms 3
4058
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.402ms 100.04% 3.402ms 3.402ms 1
4059
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.401ms 100.00% 3.401ms 1.134ms 3
4060
+ Activity Buffer Request 31.65% 1.710ms 31.65% 1.710ms 1.710ms 1.140ms 33.52% 1.140ms 1.140ms 1
4061
+ aten::empty 0.48% 25.892us 0.48% 25.892us 4.315us 0.000us 0.00% 0.000us 0.000us 6
4062
+ cudaFuncSetAttribute 0.09% 4.710us 0.09% 4.710us 1.570us 0.000us 0.00% 0.000us 0.000us 3
4063
+ cudaLaunchKernel 4.17% 225.304us 4.17% 225.304us 75.101us 0.000us 0.00% 0.000us 0.000us 3
4064
+ cudaDeviceSynchronize 59.02% 3.189ms 59.02% 3.189ms 3.189ms 0.000us 0.00% 0.000us 0.000us 1
4065
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4066
+ Self CPU time total: 5.403ms
4067
+ Self CUDA time total: 3.401ms
4068
 
4069
 
4070
  impl wl p50(ms) ok
4071
+ hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.92 True
4072
+ hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.97 True
4073
+ hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.00 True
4074
+ hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.01 True
4075
+ hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.16 True
4076
+ hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.17 True
4077
  </pre></div>
4078
+ <div class="uv-install-logs" id="uv-logs-benchmark">
4079
+ <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4080
+ <div class="uv-logs-content" style="display: none;">
4081
+ Installed 14 packages in 11ms
4082
  </div>
4083
+ </div>
4084
+ <div class="cell-stderr">Fetching 5 files: 0%| | 0/5 [00:00&lt;?, ?it/s]
4085
+ Fetching 5 files: 40%|████ | 2/5 [00:01&lt;00:02, 1.24it/s]
4086
+ Fetching 5 files: 100%|██████████| 5/5 [00:01&lt;00:00, 3.09it/s]</div>
4087
  <div class="cell-artifacts">
4088
  <h4>Artifacts:</h4>
4089
  <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
flash_attn/impls/mem_efficient_attention.html CHANGED
@@ -3874,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
3874
  <div class="system-info">
3875
  <div class="system-info-header">Generated on:</div>
3876
  <div class="system-info-content">
3877
- Linux x86_64 | Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35
3878
  </div>
3879
  </div>
3880
 
@@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content {
3888
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3890
  </span> |
3891
- Cell: benchmark | 4.18s
3892
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3894
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3941,28 +3941,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L128_bfloat16
3941
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3942
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3943
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3944
- torch_mem_eff 4.45% 324.566us 35.26% 2.573ms 2.573ms 0.000us 0.00% 5.439ms 5.439ms 1
3945
- torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.406ms 100.38% 5.406ms 5.406ms 1
3946
- aten::scaled_dot_product_attention 0.42% 30.389us 2.31% 168.211us 56.070us 0.000us 0.00% 4.771ms 1.590ms 3
3947
- aten::_scaled_dot_product_efficient_attention 0.30% 21.751us 1.89% 137.822us 45.941us 0.000us 0.00% 4.771ms 1.590ms 3
3948
- aten::_efficient_attention_forward 0.46% 33.370us 1.30% 95.011us 31.670us 4.771ms 88.58% 4.771ms 1.590ms 3
3949
- fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 4.771ms 88.58% 4.771ms 1.590ms 3
3950
- aten::contiguous 0.14% 10.493us 27.68% 2.020ms 224.395us 0.000us 0.00% 668.482us 74.276us 9
3951
- aten::clone 0.39% 28.130us 27.53% 2.009ms 223.229us 0.000us 0.00% 668.482us 74.276us 9
3952
- aten::copy_ 1.01% 73.701us 26.23% 1.914ms 212.678us 614.946us 11.42% 668.482us 74.276us 9
3953
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 614.946us 11.42% 614.946us 68.327us 9
3954
- Activity Buffer Request 24.11% 1.759ms 24.11% 1.759ms 1.759ms 53.536us 0.99% 53.536us 53.536us 1
3955
- aten::transpose 0.83% 60.400us 1.12% 81.609us 3.400us 0.000us 0.00% 0.000us 0.000us 24
3956
- aten::as_strided 0.29% 21.209us 0.29% 21.209us 0.884us 0.000us 0.00% 0.000us 0.000us 24
3957
- aten::empty_like 0.20% 14.439us 0.92% 66.830us 7.426us 0.000us 0.00% 0.000us 0.000us 9
3958
- aten::empty 1.09% 79.191us 1.09% 79.191us 3.771us 0.000us 0.00% 0.000us 0.000us 21
3959
- cudaLaunchKernel 1.43% 104.332us 1.43% 104.332us 8.694us 0.000us 0.00% 0.000us 0.000us 12
3960
- cudaStreamIsCapturing 0.04% 3.220us 0.04% 3.220us 1.073us 0.000us 0.00% 0.000us 0.000us 3
3961
- cudaFuncSetAttribute 0.12% 8.781us 0.12% 8.781us 2.927us 0.000us 0.00% 0.000us 0.000us 3
3962
- cudaDeviceSynchronize 64.74% 4.724ms 64.74% 4.724ms 4.724ms 0.000us 0.00% 0.000us 0.000us 1
3963
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3964
- Self CPU time total: 7.297ms
3965
- Self CUDA time total: 5.386ms
3966
 
3967
 
3968
 
@@ -3972,28 +3972,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L256_bfloat16
3972
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3973
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3974
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3975
- torch_mem_eff 3.09% 234.954us 31.65% 2.404ms 2.404ms 0.000us 0.00% 5.782ms 5.782ms 1
3976
- torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.735ms 100.14% 5.735ms 5.735ms 1
3977
- aten::scaled_dot_product_attention 0.22% 16.961us 1.81% 137.382us 45.794us 0.000us 0.00% 5.091ms 1.697ms 3
3978
- aten::_scaled_dot_product_efficient_attention 0.25% 19.139us 1.59% 120.421us 40.140us 0.000us 0.00% 5.091ms 1.697ms 3
3979
- aten::_efficient_attention_forward 0.36% 27.009us 1.04% 78.740us 26.247us 5.091ms 88.89% 5.091ms 1.697ms 3
3980
- fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.091ms 88.89% 5.091ms 1.697ms 3
3981
- aten::contiguous 0.11% 8.479us 26.21% 1.991ms 221.170us 0.000us 0.00% 690.720us 76.747us 9
3982
- aten::clone 0.29% 22.002us 26.10% 1.982ms 220.228us 0.000us 0.00% 690.720us 76.747us 9
3983
- aten::copy_ 0.83% 62.671us 25.16% 1.911ms 212.305us 636.032us 11.11% 690.720us 76.747us 9
3984
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 636.032us 11.11% 636.032us 70.670us 9
3985
- Activity Buffer Request 23.48% 1.783ms 23.48% 1.783ms 1.783ms 54.688us 0.95% 54.688us 54.688us 1
3986
- aten::transpose 0.64% 48.410us 0.84% 63.823us 2.659us 0.000us 0.00% 0.000us 0.000us 24
3987
- aten::as_strided 0.20% 15.413us 0.20% 15.413us 0.642us 0.000us 0.00% 0.000us 0.000us 24
3988
- aten::empty_like 0.15% 11.729us 0.65% 49.301us 5.478us 0.000us 0.00% 0.000us 0.000us 9
3989
- aten::empty 0.82% 62.552us 0.82% 62.552us 2.979us 0.000us 0.00% 0.000us 0.000us 21
3990
- cudaLaunchKernel 1.14% 86.431us 1.14% 86.431us 7.203us 0.000us 0.00% 0.000us 0.000us 12
3991
- cudaStreamIsCapturing 0.03% 2.280us 0.03% 2.280us 0.760us 0.000us 0.00% 0.000us 0.000us 3
3992
- cudaFuncSetAttribute 0.04% 2.990us 0.04% 2.990us 0.997us 0.000us 0.00% 0.000us 0.000us 3
3993
- cudaDeviceSynchronize 68.35% 5.191ms 68.35% 5.191ms 5.191ms 0.000us 0.00% 0.000us 0.000us 1
3994
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3995
- Self CPU time total: 7.595ms
3996
- Self CUDA time total: 5.727ms
3997
 
3998
 
3999
 
@@ -4003,28 +4003,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L320_bfloat16
4003
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4004
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4005
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4006
- torch_mem_eff 3.06% 239.384us 30.93% 2.420ms 2.420ms 0.000us 0.00% 5.994ms 5.994ms 1
4007
- torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.947ms 100.14% 5.947ms 5.947ms 1
4008
- aten::scaled_dot_product_attention 0.22% 17.549us 1.74% 135.892us 45.297us 0.000us 0.00% 5.295ms 1.765ms 3
4009
- aten::_scaled_dot_product_efficient_attention 0.23% 18.333us 1.51% 118.343us 39.448us 0.000us 0.00% 5.295ms 1.765ms 3
4010
- aten::_efficient_attention_forward 0.35% 27.055us 1.01% 79.012us 26.337us 5.295ms 89.16% 5.295ms 1.765ms 3
4011
- fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.295ms 89.16% 5.295ms 1.765ms 3
4012
- aten::contiguous 0.10% 7.948us 25.59% 2.002ms 222.464us 0.000us 0.00% 699.457us 77.717us 9
4013
- aten::clone 0.26% 20.152us 25.49% 1.994ms 221.581us 0.000us 0.00% 699.457us 77.717us 9
4014
- aten::copy_ 0.79% 62.172us 24.60% 1.924ms 213.808us 643.713us 10.84% 699.457us 77.717us 9
4015
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 643.713us 10.84% 643.713us 71.524us 9
4016
- Activity Buffer Request 22.96% 1.796ms 22.96% 1.796ms 1.796ms 55.744us 0.94% 55.744us 55.744us 1
4017
- aten::transpose 0.61% 48.091us 0.81% 63.198us 2.633us 0.000us 0.00% 0.000us 0.000us 24
4018
- aten::as_strided 0.19% 15.107us 0.19% 15.107us 0.629us 0.000us 0.00% 0.000us 0.000us 24
4019
- aten::empty_like 0.14% 11.152us 0.64% 49.811us 5.535us 0.000us 0.00% 0.000us 0.000us 9
4020
- aten::empty 0.80% 62.567us 0.80% 62.567us 2.979us 0.000us 0.00% 0.000us 0.000us 21
4021
- cudaLaunchKernel 1.12% 87.709us 1.12% 87.709us 7.309us 0.000us 0.00% 0.000us 0.000us 12
4022
- cudaStreamIsCapturing 0.03% 2.429us 0.03% 2.429us 0.810us 0.000us 0.00% 0.000us 0.000us 3
4023
- cudaFuncSetAttribute 0.05% 3.800us 0.05% 3.800us 1.267us 0.000us 0.00% 0.000us 0.000us 3
4024
- cudaDeviceSynchronize 69.07% 5.404ms 69.07% 5.404ms 5.404ms 0.000us 0.00% 0.000us 0.000us 1
4025
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4026
- Self CPU time total: 7.823ms
4027
- Self CUDA time total: 5.939ms
4028
 
4029
 
4030
 
@@ -4034,28 +4034,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L384_bfloat16
4034
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4035
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4036
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4037
- torch_mem_eff 3.00% 242.264us 30.89% 2.499ms 2.499ms 0.000us 0.00% 6.191ms 6.191ms 1
4038
- torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.141ms 100.14% 6.141ms 6.141ms 1
4039
- aten::scaled_dot_product_attention 0.23% 18.320us 1.69% 136.812us 45.604us 0.000us 0.00% 5.471ms 1.824ms 3
4040
- aten::_scaled_dot_product_efficient_attention 0.23% 18.630us 1.46% 118.492us 39.497us 0.000us 0.00% 5.471ms 1.824ms 3
4041
- aten::_efficient_attention_forward 0.33% 26.674us 0.96% 77.952us 25.984us 5.471ms 89.22% 5.471ms 1.824ms 3
4042
- fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.471ms 89.22% 5.471ms 1.824ms 3
4043
- aten::contiguous 0.10% 8.440us 25.67% 2.076ms 230.653us 0.000us 0.00% 719.363us 79.929us 9
4044
- aten::clone 0.28% 22.639us 25.56% 2.067ms 229.716us 0.000us 0.00% 719.363us 79.929us 9
4045
- aten::copy_ 0.78% 63.183us 24.67% 1.995ms 221.702us 660.931us 10.78% 719.363us 79.929us 9
4046
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 660.931us 10.78% 660.931us 73.437us 9
4047
- Activity Buffer Request 21.08% 1.705ms 21.08% 1.705ms 1.705ms 58.432us 0.95% 58.432us 58.432us 1
4048
- aten::transpose 0.61% 49.449us 0.81% 65.670us 2.736us 0.000us 0.00% 0.000us 0.000us 24
4049
- aten::as_strided 0.20% 16.221us 0.20% 16.221us 0.676us 0.000us 0.00% 0.000us 0.000us 24
4050
- aten::empty_like 0.15% 11.742us 0.61% 49.481us 5.498us 0.000us 0.00% 0.000us 0.000us 9
4051
- aten::empty 0.77% 62.526us 0.77% 62.526us 2.977us 0.000us 0.00% 0.000us 0.000us 21
4052
- cudaLaunchKernel 3.07% 248.624us 3.07% 248.624us 20.719us 0.000us 0.00% 0.000us 0.000us 12
4053
- cudaStreamIsCapturing 0.03% 2.250us 0.03% 2.250us 0.750us 0.000us 0.00% 0.000us 0.000us 3
4054
- cudaFuncSetAttribute 0.04% 3.020us 0.04% 3.020us 1.007us 0.000us 0.00% 0.000us 0.000us 3
4055
- cudaDeviceSynchronize 69.11% 5.590ms 69.11% 5.590ms 5.590ms 0.000us 0.00% 0.000us 0.000us 1
4056
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4057
- Self CPU time total: 8.088ms
4058
- Self CUDA time total: 6.132ms
4059
 
4060
 
4061
 
@@ -4065,28 +4065,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L448_bfloat16
4065
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4066
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4067
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4068
- torch_mem_eff 2.96% 243.644us 31.20% 2.571ms 2.571ms 0.000us 0.00% 6.270ms 6.270ms 1
4069
- torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.220ms 100.13% 6.220ms 6.220ms 1
4070
- aten::scaled_dot_product_attention 0.22% 18.340us 1.66% 136.411us 45.470us 0.000us 0.00% 5.544ms 1.848ms 3
4071
- aten::_scaled_dot_product_efficient_attention 0.23% 18.620us 1.43% 118.071us 39.357us 0.000us 0.00% 5.544ms 1.848ms 3
4072
- aten::_efficient_attention_forward 0.33% 26.920us 0.94% 77.841us 25.947us 5.544ms 89.24% 5.544ms 1.848ms 3
4073
- fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.544ms 89.24% 5.544ms 1.848ms 3
4074
- aten::contiguous 0.10% 8.441us 26.08% 2.149ms 238.754us 0.000us 0.00% 726.626us 80.736us 9
4075
- aten::clone 0.27% 22.559us 25.98% 2.140ms 237.816us 0.000us 0.00% 726.626us 80.736us 9
4076
- aten::copy_ 0.77% 63.181us 25.09% 2.068ms 229.736us 668.130us 10.76% 726.626us 80.736us 9
4077
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 668.130us 10.76% 668.130us 74.237us 9
4078
- Activity Buffer Request 21.61% 1.780ms 21.61% 1.780ms 1.780ms 58.496us 0.94% 58.496us 58.496us 1
4079
- aten::transpose 0.58% 47.889us 0.77% 63.801us 2.658us 0.000us 0.00% 0.000us 0.000us 24
4080
- aten::as_strided 0.19% 15.912us 0.19% 15.912us 0.663us 0.000us 0.00% 0.000us 0.000us 24
4081
- aten::empty_like 0.14% 11.871us 0.61% 50.162us 5.574us 0.000us 0.00% 0.000us 0.000us 9
4082
- aten::empty 0.75% 62.051us 0.75% 62.051us 2.955us 0.000us 0.00% 0.000us 0.000us 21
4083
- cudaLaunchKernel 2.98% 245.563us 2.98% 245.563us 20.464us 0.000us 0.00% 0.000us 0.000us 12
4084
- cudaStreamIsCapturing 0.03% 2.280us 0.03% 2.280us 0.760us 0.000us 0.00% 0.000us 0.000us 3
4085
- cudaFuncSetAttribute 0.04% 3.301us 0.04% 3.301us 1.100us 0.000us 0.00% 0.000us 0.000us 3
4086
- cudaDeviceSynchronize 68.80% 5.669ms 68.80% 5.669ms 5.669ms 0.000us 0.00% 0.000us 0.000us 1
4087
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4088
- Self CPU time total: 8.240ms
4089
- Self CUDA time total: 6.212ms
4090
 
4091
 
4092
 
@@ -4096,37 +4096,37 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L512_bfloat16
4096
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4097
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4098
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4099
- torch_mem_eff 2.78% 238.352us 29.12% 2.495ms 2.495ms 0.000us 0.00% 6.680ms 6.680ms 1
4100
- torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.628ms 100.13% 6.628ms 6.628ms 1
4101
- aten::scaled_dot_product_attention 0.31% 26.242us 1.71% 146.743us 48.914us 0.000us 0.00% 5.945ms 1.982ms 3
4102
- aten::_scaled_dot_product_efficient_attention 0.23% 19.839us 1.41% 120.501us 40.167us 0.000us 0.00% 5.945ms 1.982ms 3
4103
- aten::_efficient_attention_forward 0.31% 26.859us 0.92% 78.900us 26.300us 5.945ms 89.80% 5.945ms 1.982ms 3
4104
- fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.945ms 89.80% 5.945ms 1.982ms 3
4105
- aten::contiguous 0.09% 7.528us 24.13% 2.068ms 229.726us 0.000us 0.00% 735.685us 81.743us 9
4106
- aten::clone 0.24% 20.962us 24.04% 2.060ms 228.889us 0.000us 0.00% 735.685us 81.743us 9
4107
- aten::copy_ 0.75% 64.071us 23.20% 1.988ms 220.897us 675.044us 10.20% 735.685us 81.743us 9
4108
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 675.044us 10.20% 675.044us 75.005us 9
4109
- Activity Buffer Request 19.86% 1.702ms 19.86% 1.702ms 1.702ms 60.641us 0.92% 60.641us 60.641us 1
4110
- aten::transpose 0.56% 47.940us 0.74% 63.783us 2.658us 0.000us 0.00% 0.000us 0.000us 24
4111
- aten::as_strided 0.18% 15.843us 0.18% 15.843us 0.660us 0.000us 0.00% 0.000us 0.000us 24
4112
- aten::empty_like 0.13% 11.513us 0.59% 50.972us 5.664us 0.000us 0.00% 0.000us 0.000us 9
4113
- aten::empty 0.75% 64.430us 0.75% 64.430us 3.068us 0.000us 0.00% 0.000us 0.000us 21
4114
- cudaLaunchKernel 2.85% 243.883us 2.85% 243.883us 20.324us 0.000us 0.00% 0.000us 0.000us 12
4115
- cudaStreamIsCapturing 0.03% 2.530us 0.03% 2.530us 0.843us 0.000us 0.00% 0.000us 0.000us 3
4116
- cudaFuncSetAttribute 0.04% 3.050us 0.04% 3.050us 1.017us 0.000us 0.00% 0.000us 0.000us 3
4117
- cudaDeviceSynchronize 70.88% 6.073ms 70.88% 6.073ms 6.073ms 0.000us 0.00% 0.000us 0.000us 1
4118
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4119
- Self CPU time total: 8.568ms
4120
- Self CUDA time total: 6.620ms
4121
 
4122
 
4123
  impl wl p50(ms) ok
4124
- torch_mem_eff cuda_attn_L128_bfloat16 1.83 True
4125
- torch_mem_eff cuda_attn_L256_bfloat16 1.94 True
4126
- torch_mem_eff cuda_attn_L320_bfloat16 1.96 True
4127
- torch_mem_eff cuda_attn_L384_bfloat16 2.03 True
4128
- torch_mem_eff cuda_attn_L448_bfloat16 2.02 True
4129
- torch_mem_eff cuda_attn_L512_bfloat16 2.23 True
4130
  </pre></div>
4131
  <div class="cell-artifacts">
4132
  <h4>Artifacts:</h4>
 
3874
  <div class="system-info">
3875
  <div class="system-info-header">Generated on:</div>
3876
  <div class="system-info-content">
3877
+ Linux x86_64 | Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35
3878
  </div>
3879
  </div>
3880
 
 
3888
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3890
  </span> |
3891
+ Cell: benchmark | 4.15s
3892
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3894
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3941
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3942
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3943
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3944
+ torch_mem_eff 4.11% 302.695us 35.19% 2.592ms 2.592ms 0.000us 0.00% 5.476ms 5.476ms 1
3945
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.440ms 100.33% 5.440ms 5.440ms 1
3946
+ aten::scaled_dot_product_attention 0.40% 29.210us 2.30% 169.213us 56.404us 0.000us 0.00% 4.805ms 1.602ms 3
3947
+ aten::_scaled_dot_product_efficient_attention 0.29% 21.719us 1.90% 140.003us 46.668us 0.000us 0.00% 4.805ms 1.602ms 3
3948
+ aten::_efficient_attention_forward 0.48% 35.571us 1.32% 97.242us 32.414us 4.805ms 88.62% 4.805ms 1.602ms 3
3949
+ fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 4.805ms 88.62% 4.805ms 1.602ms 3
3950
+ aten::contiguous 0.13% 9.829us 27.98% 2.062ms 229.090us 0.000us 0.00% 670.404us 74.489us 9
3951
+ aten::clone 0.35% 25.869us 27.85% 2.052ms 227.998us 0.000us 0.00% 670.404us 74.489us 9
3952
+ aten::copy_ 0.98% 72.210us 26.54% 1.956ms 217.285us 616.836us 11.38% 670.404us 74.489us 9
3953
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 616.836us 11.38% 616.836us 68.537us 9
3954
+ Activity Buffer Request 24.39% 1.797ms 24.39% 1.797ms 1.797ms 53.568us 0.99% 53.568us 53.568us 1
3955
+ aten::transpose 0.81% 59.530us 1.08% 79.784us 3.324us 0.000us 0.00% 0.000us 0.000us 24
3956
+ aten::as_strided 0.27% 20.254us 0.27% 20.254us 0.844us 0.000us 0.00% 0.000us 0.000us 24
3957
+ aten::empty_like 0.20% 14.892us 0.96% 70.554us 7.839us 0.000us 0.00% 0.000us 0.000us 9
3958
+ aten::empty 1.12% 82.341us 1.12% 82.341us 3.921us 0.000us 0.00% 0.000us 0.000us 21
3959
+ cudaLaunchKernel 1.48% 109.241us 1.48% 109.241us 9.103us 0.000us 0.00% 0.000us 0.000us 12
3960
+ cudaStreamIsCapturing 0.04% 3.240us 0.04% 3.240us 1.080us 0.000us 0.00% 0.000us 0.000us 3
3961
+ cudaFuncSetAttribute 0.12% 9.162us 0.12% 9.162us 3.054us 0.000us 0.00% 0.000us 0.000us 3
3962
+ cudaDeviceSynchronize 64.81% 4.776ms 64.81% 4.776ms 4.776ms 0.000us 0.00% 0.000us 0.000us 1
3963
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3964
+ Self CPU time total: 7.368ms
3965
+ Self CUDA time total: 5.422ms
3966
 
3967
 
3968
 
 
3972
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3973
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3974
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3975
+ torch_mem_eff 3.18% 243.704us 30.16% 2.312ms 2.312ms 0.000us 0.00% 5.946ms 5.946ms 1
3976
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.900ms 100.14% 5.900ms 5.900ms 1
3977
+ aten::scaled_dot_product_attention 0.23% 17.410us 1.83% 139.893us 46.631us 0.000us 0.00% 5.256ms 1.752ms 3
3978
+ aten::_scaled_dot_product_efficient_attention 0.24% 18.330us 1.60% 122.483us 40.828us 0.000us 0.00% 5.256ms 1.752ms 3
3979
+ aten::_efficient_attention_forward 0.36% 27.350us 1.07% 81.803us 27.268us 5.256ms 89.21% 5.256ms 1.752ms 3
3980
+ fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.256ms 89.21% 5.256ms 1.752ms 3
3981
+ aten::contiguous 0.10% 7.470us 24.63% 1.888ms 209.765us 0.000us 0.00% 690.500us 76.722us 9
3982
+ aten::clone 0.27% 20.522us 24.53% 1.880ms 208.935us 0.000us 0.00% 690.500us 76.722us 9
3983
+ aten::copy_ 0.86% 65.740us 23.60% 1.809ms 200.963us 635.844us 10.79% 690.500us 76.722us 9
3984
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 635.844us 10.79% 635.844us 70.649us 9
3985
+ Activity Buffer Request 21.87% 1.676ms 21.87% 1.676ms 1.676ms 54.656us 0.93% 54.656us 54.656us 1
3986
+ aten::transpose 0.62% 47.210us 0.82% 62.900us 2.621us 0.000us 0.00% 0.000us 0.000us 24
3987
+ aten::as_strided 0.20% 15.690us 0.20% 15.690us 0.654us 0.000us 0.00% 0.000us 0.000us 24
3988
+ aten::empty_like 0.16% 11.901us 0.67% 51.221us 5.691us 0.000us 0.00% 0.000us 0.000us 9
3989
+ aten::empty 0.85% 65.201us 0.85% 65.201us 3.105us 0.000us 0.00% 0.000us 0.000us 21
3990
+ cudaLaunchKernel 1.16% 89.161us 1.16% 89.161us 7.430us 0.000us 0.00% 0.000us 0.000us 12
3991
+ cudaStreamIsCapturing 0.03% 2.381us 0.03% 2.381us 0.794us 0.000us 0.00% 0.000us 0.000us 3
3992
+ cudaFuncSetAttribute 0.05% 3.881us 0.05% 3.881us 1.294us 0.000us 0.00% 0.000us 0.000us 3
3993
+ cudaDeviceSynchronize 69.84% 5.353ms 69.84% 5.353ms 5.353ms 0.000us 0.00% 0.000us 0.000us 1
3994
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3995
+ Self CPU time total: 7.665ms
3996
+ Self CUDA time total: 5.891ms
3997
 
3998
 
3999
 
 
4003
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4004
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4005
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4006
+ torch_mem_eff 3.05% 239.816us 30.60% 2.409ms 2.409ms 0.000us 0.00% 6.068ms 6.068ms 1
4007
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.021ms 100.14% 6.021ms 6.021ms 1
4008
+ aten::scaled_dot_product_attention 0.23% 17.959us 1.79% 140.600us 46.867us 0.000us 0.00% 5.365ms 1.788ms 3
4009
+ aten::_scaled_dot_product_efficient_attention 0.23% 18.141us 1.56% 122.641us 40.880us 0.000us 0.00% 5.365ms 1.788ms 3
4010
+ aten::_efficient_attention_forward 0.36% 28.699us 1.04% 81.531us 27.177us 5.365ms 89.24% 5.365ms 1.788ms 3
4011
+ fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.365ms 89.24% 5.365ms 1.788ms 3
4012
+ aten::contiguous 0.10% 7.861us 25.24% 1.987ms 220.773us 0.000us 0.00% 702.468us 78.052us 9
4013
+ aten::clone 0.26% 20.540us 25.14% 1.979ms 219.899us 0.000us 0.00% 702.468us 78.052us 9
4014
+ aten::copy_ 0.92% 72.171us 24.24% 1.908ms 212.002us 646.884us 10.76% 702.468us 78.052us 9
4015
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 646.884us 10.76% 646.884us 71.876us 9
4016
+ Activity Buffer Request 22.46% 1.768ms 22.46% 1.768ms 1.768ms 55.584us 0.92% 55.584us 55.584us 1
4017
+ aten::transpose 0.60% 47.471us 0.81% 64.120us 2.672us 0.000us 0.00% 0.000us 0.000us 24
4018
+ aten::as_strided 0.21% 16.649us 0.21% 16.649us 0.694us 0.000us 0.00% 0.000us 0.000us 24
4019
+ aten::empty_like 0.15% 11.960us 0.64% 50.531us 5.615us 0.000us 0.00% 0.000us 0.000us 9
4020
+ aten::empty 0.81% 63.971us 0.81% 63.971us 3.046us 0.000us 0.00% 0.000us 0.000us 21
4021
+ cudaLaunchKernel 1.13% 89.282us 1.13% 89.282us 7.440us 0.000us 0.00% 0.000us 0.000us 12
4022
+ cudaStreamIsCapturing 0.03% 2.660us 0.03% 2.660us 0.887us 0.000us 0.00% 0.000us 0.000us 3
4023
+ cudaFuncSetAttribute 0.04% 3.150us 0.04% 3.150us 1.050us 0.000us 0.00% 0.000us 0.000us 3
4024
+ cudaDeviceSynchronize 69.40% 5.462ms 69.40% 5.462ms 5.462ms 0.000us 0.00% 0.000us 0.000us 1
4025
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4026
+ Self CPU time total: 7.871ms
4027
+ Self CUDA time total: 6.012ms
4028
 
4029
 
4030
 
 
4034
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4035
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4036
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4037
+ torch_mem_eff 2.93% 240.625us 31.13% 2.555ms 2.555ms 0.000us 0.00% 6.259ms 6.259ms 1
4038
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.208ms 100.13% 6.208ms 6.208ms 1
4039
+ aten::scaled_dot_product_attention 0.21% 17.361us 1.73% 142.203us 47.401us 0.000us 0.00% 5.537ms 1.846ms 3
4040
+ aten::_scaled_dot_product_efficient_attention 0.22% 18.441us 1.52% 124.842us 41.614us 0.000us 0.00% 5.537ms 1.846ms 3
4041
+ aten::_efficient_attention_forward 0.36% 29.601us 1.03% 84.471us 28.157us 5.537ms 89.30% 5.537ms 1.846ms 3
4042
+ fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.537ms 89.30% 5.537ms 1.846ms 3
4043
+ aten::contiguous 0.09% 7.769us 25.95% 2.130ms 236.658us 0.000us 0.00% 721.984us 80.220us 9
4044
+ aten::clone 0.26% 21.609us 25.85% 2.122ms 235.795us 0.000us 0.00% 721.984us 80.220us 9
4045
+ aten::copy_ 0.80% 65.822us 24.94% 2.047ms 227.475us 663.552us 10.70% 721.984us 80.220us 9
4046
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 663.552us 10.70% 663.552us 73.728us 9
4047
+ Activity Buffer Request 21.30% 1.749ms 21.30% 1.749ms 1.749ms 58.432us 0.94% 58.432us 58.432us 1
4048
+ aten::transpose 0.59% 48.680us 0.78% 64.131us 2.672us 0.000us 0.00% 0.000us 0.000us 24
4049
+ aten::as_strided 0.19% 15.451us 0.19% 15.451us 0.644us 0.000us 0.00% 0.000us 0.000us 24
4050
+ aten::empty_like 0.15% 12.591us 0.65% 53.271us 5.919us 0.000us 0.00% 0.000us 0.000us 9
4051
+ aten::empty 0.81% 66.120us 0.81% 66.120us 3.149us 0.000us 0.00% 0.000us 0.000us 21
4052
+ cudaLaunchKernel 3.12% 256.044us 3.12% 256.044us 21.337us 0.000us 0.00% 0.000us 0.000us 12
4053
+ cudaStreamIsCapturing 0.03% 2.670us 0.03% 2.670us 0.890us 0.000us 0.00% 0.000us 0.000us 3
4054
+ cudaFuncSetAttribute 0.04% 3.480us 0.04% 3.480us 1.160us 0.000us 0.00% 0.000us 0.000us 3
4055
+ cudaDeviceSynchronize 68.87% 5.653ms 68.87% 5.653ms 5.653ms 0.000us 0.00% 0.000us 0.000us 1
4056
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4057
+ Self CPU time total: 8.208ms
4058
+ Self CUDA time total: 6.200ms
4059
 
4060
 
4061
 
 
4065
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4066
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4067
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4068
+ torch_mem_eff 2.93% 245.582us 31.52% 2.645ms 2.645ms 0.000us 0.00% 6.354ms 6.354ms 1
4069
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.303ms 100.13% 6.303ms 6.303ms 1
4070
+ aten::scaled_dot_product_attention 0.20% 17.170us 1.68% 140.693us 46.898us 0.000us 0.00% 5.628ms 1.876ms 3
4071
+ aten::_scaled_dot_product_efficient_attention 0.21% 17.520us 1.47% 123.523us 41.174us 0.000us 0.00% 5.628ms 1.876ms 3
4072
+ aten::_efficient_attention_forward 0.35% 29.440us 1.00% 84.263us 28.088us 5.628ms 89.41% 5.628ms 1.876ms 3
4073
+ fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.628ms 89.41% 5.628ms 1.876ms 3
4074
+ aten::contiguous 0.09% 7.259us 26.43% 2.218ms 246.393us 0.000us 0.00% 726.309us 80.701us 9
4075
+ aten::clone 0.25% 21.219us 26.34% 2.210ms 245.587us 0.000us 0.00% 726.309us 80.701us 9
4076
+ aten::copy_ 0.78% 65.083us 25.46% 2.136ms 237.368us 666.948us 10.59% 726.309us 80.701us 9
4077
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 666.948us 10.59% 666.948us 74.105us 9
4078
+ Activity Buffer Request 21.84% 1.833ms 21.84% 1.833ms 1.833ms 59.361us 0.94% 59.361us 59.361us 1
4079
+ aten::transpose 0.56% 46.780us 0.75% 62.730us 2.614us 0.000us 0.00% 0.000us 0.000us 24
4080
+ aten::as_strided 0.19% 15.950us 0.19% 15.950us 0.665us 0.000us 0.00% 0.000us 0.000us 24
4081
+ aten::empty_like 0.14% 11.512us 0.63% 52.753us 5.861us 0.000us 0.00% 0.000us 0.000us 9
4082
+ aten::empty 0.79% 66.642us 0.79% 66.642us 3.173us 0.000us 0.00% 0.000us 0.000us 21
4083
+ cudaLaunchKernel 3.12% 261.945us 3.12% 261.945us 21.829us 0.000us 0.00% 0.000us 0.000us 12
4084
+ cudaStreamIsCapturing 0.03% 2.500us 0.03% 2.500us 0.833us 0.000us 0.00% 0.000us 0.000us 3
4085
+ cudaFuncSetAttribute 0.04% 3.581us 0.04% 3.581us 1.194us 0.000us 0.00% 0.000us 0.000us 3
4086
+ cudaDeviceSynchronize 68.48% 5.745ms 68.48% 5.745ms 5.745ms 0.000us 0.00% 0.000us 0.000us 1
4087
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4088
+ Self CPU time total: 8.390ms
4089
+ Self CUDA time total: 6.295ms
4090
 
4091
 
4092
 
 
4096
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4097
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4098
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4099
+ torch_mem_eff 2.68% 234.298us 28.81% 2.516ms 2.516ms 0.000us 0.00% 6.820ms 6.820ms 1
4100
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.768ms 100.12% 6.768ms 6.768ms 1
4101
+ aten::scaled_dot_product_attention 0.20% 17.618us 1.61% 140.900us 46.967us 0.000us 0.00% 6.087ms 2.029ms 3
4102
+ aten::_scaled_dot_product_efficient_attention 0.21% 18.311us 1.41% 123.282us 41.094us 0.000us 0.00% 6.087ms 2.029ms 3
4103
+ aten::_efficient_attention_forward 0.33% 29.191us 0.95% 82.621us 27.540us 6.087ms 90.04% 6.087ms 2.029ms 3
4104
+ fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 6.087ms 90.04% 6.087ms 2.029ms 3
4105
+ aten::contiguous 0.09% 7.641us 24.06% 2.101ms 233.417us 0.000us 0.00% 733.380us 81.487us 9
4106
+ aten::clone 0.23% 20.279us 23.97% 2.093ms 232.568us 0.000us 0.00% 733.380us 81.487us 9
4107
+ aten::copy_ 0.74% 64.431us 23.10% 2.017ms 224.097us 672.964us 9.96% 733.380us 81.487us 9
4108
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 672.964us 9.96% 672.964us 74.774us 9
4109
+ Activity Buffer Request 19.61% 1.713ms 19.61% 1.713ms 1.713ms 60.416us 0.89% 60.416us 60.416us 1
4110
+ aten::transpose 0.53% 46.410us 0.71% 62.109us 2.588us 0.000us 0.00% 0.000us 0.000us 24
4111
+ aten::as_strided 0.18% 15.699us 0.18% 15.699us 0.654us 0.000us 0.00% 0.000us 0.000us 24
4112
+ aten::empty_like 0.15% 12.751us 0.64% 55.961us 6.218us 0.000us 0.00% 0.000us 0.000us 9
4113
+ aten::empty 0.79% 69.050us 0.79% 69.050us 3.288us 0.000us 0.00% 0.000us 0.000us 21
4114
+ cudaLaunchKernel 2.99% 261.415us 2.99% 261.415us 21.785us 0.000us 0.00% 0.000us 0.000us 12
4115
+ cudaStreamIsCapturing 0.03% 2.920us 0.03% 2.920us 0.973us 0.000us 0.00% 0.000us 0.000us 3
4116
+ cudaFuncSetAttribute 0.03% 2.980us 0.03% 2.980us 0.993us 0.000us 0.00% 0.000us 0.000us 3
4117
+ cudaDeviceSynchronize 71.19% 6.216ms 71.19% 6.216ms 6.216ms 0.000us 0.00% 0.000us 0.000us 1
4118
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4119
+ Self CPU time total: 8.732ms
4120
+ Self CUDA time total: 6.759ms
4121
 
4122
 
4123
  impl wl p50(ms) ok
4124
+ torch_mem_eff cuda_attn_L128_bfloat16 1.84 True
4125
+ torch_mem_eff cuda_attn_L256_bfloat16 1.95 True
4126
+ torch_mem_eff cuda_attn_L320_bfloat16 1.97 True
4127
+ torch_mem_eff cuda_attn_L384_bfloat16 2.08 True
4128
+ torch_mem_eff cuda_attn_L448_bfloat16 2.04 True
4129
+ torch_mem_eff cuda_attn_L512_bfloat16 2.25 True
4130
  </pre></div>
4131
  <div class="cell-artifacts">
4132
  <h4>Artifacts:</h4>
flash_attn/impls/sage_attention.html CHANGED
@@ -3874,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
3874
  <div class="system-info">
3875
  <div class="system-info-header">Generated on:</div>
3876
  <div class="system-info-content">
3877
- Linux x86_64 | Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35
3878
  </div>
3879
  </div>
3880
 
@@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content {
3888
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3890
  </span> |
3891
- Cell: benchmark | 4.59s
3892
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3894
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3938,22 +3938,22 @@ Cell: benchmark | 4.59s
3938
  <div class="cell-stdout"><pre class="stdout-text">Running attention benchmark on cuda with 6 workloads.
3939
  impl wl p50(ms) ok
3940
  sage_int8_fp16 cuda_attn_L128_bfloat16 FAIL False
3941
- Error: module &#x27;sage_attention_d37081df98a5208e&#x27; has no attribute &#x27;fwd&#x27;
3942
  sage_int8_fp16 cuda_attn_L256_bfloat16 FAIL False
3943
- Error: module &#x27;sage_attention_d37081df98a5208e&#x27; has no attribute &#x27;fwd&#x27;
3944
  sage_int8_fp16 cuda_attn_L320_bfloat16 FAIL False
3945
- Error: module &#x27;sage_attention_d37081df98a5208e&#x27; has no attribute &#x27;fwd&#x27;
3946
  sage_int8_fp16 cuda_attn_L384_bfloat16 FAIL False
3947
- Error: module &#x27;sage_attention_d37081df98a5208e&#x27; has no attribute &#x27;fwd&#x27;
3948
  sage_int8_fp16 cuda_attn_L448_bfloat16 FAIL False
3949
- Error: module &#x27;sage_attention_d37081df98a5208e&#x27; has no attribute &#x27;fwd&#x27;
3950
  sage_int8_fp16 cuda_attn_L512_bfloat16 FAIL False
3951
- Error: module &#x27;sage_attention_d37081df98a5208e&#x27; has no attribute &#x27;fwd&#x27;
3952
  </pre></div>
3953
  <div class="cell-stderr">
3954
- Fetching 11 files: 0%| | 0/11 [00:00&lt;?, ?it/s]
3955
- Fetching 11 files: 73%|███████▎ | 8/11 [00:00&lt;00:00, 11.98it/s]
3956
- Fetching 11 files: 100%|██████████| 11/11 [00:00&lt;00:00, 16.47it/s]
3957
  </div>
3958
  <div class="cell-artifacts">
3959
  <h4>Artifacts:</h4>
 
3874
  <div class="system-info">
3875
  <div class="system-info-header">Generated on:</div>
3876
  <div class="system-info-content">
3877
+ Linux x86_64 | Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35
3878
  </div>
3879
  </div>
3880
 
 
3888
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3890
  </span> |
3891
+ Cell: benchmark | 4.58s
3892
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3894
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3938
  <div class="cell-stdout"><pre class="stdout-text">Running attention benchmark on cuda with 6 workloads.
3939
  impl wl p50(ms) ok
3940
  sage_int8_fp16 cuda_attn_L128_bfloat16 FAIL False
3941
+ Error: module &#x27;sage_attention_e8dcde4226fe38e6&#x27; has no attribute &#x27;fwd&#x27;
3942
  sage_int8_fp16 cuda_attn_L256_bfloat16 FAIL False
3943
+ Error: module &#x27;sage_attention_e8dcde4226fe38e6&#x27; has no attribute &#x27;fwd&#x27;
3944
  sage_int8_fp16 cuda_attn_L320_bfloat16 FAIL False
3945
+ Error: module &#x27;sage_attention_e8dcde4226fe38e6&#x27; has no attribute &#x27;fwd&#x27;
3946
  sage_int8_fp16 cuda_attn_L384_bfloat16 FAIL False
3947
+ Error: module &#x27;sage_attention_e8dcde4226fe38e6&#x27; has no attribute &#x27;fwd&#x27;
3948
  sage_int8_fp16 cuda_attn_L448_bfloat16 FAIL False
3949
+ Error: module &#x27;sage_attention_e8dcde4226fe38e6&#x27; has no attribute &#x27;fwd&#x27;
3950
  sage_int8_fp16 cuda_attn_L512_bfloat16 FAIL False
3951
+ Error: module &#x27;sage_attention_e8dcde4226fe38e6&#x27; has no attribute &#x27;fwd&#x27;
3952
  </pre></div>
3953
  <div class="cell-stderr">
3954
+ Fetching 8 files: 0%| | 0/8 [00:00&lt;?, ?it/s]
3955
+ Fetching 8 files: 38%|███▊ | 3/8 [00:00&lt;00:00, 5.88it/s]
3956
+ Fetching 8 files: 100%|██████████| 8/8 [00:00&lt;00:00, 15.67it/s]
3957
  </div>
3958
  <div class="cell-artifacts">
3959
  <h4>Artifacts:</h4>
flash_attn/impls/xformers.html CHANGED
@@ -3874,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
3874
  <div class="system-info">
3875
  <div class="system-info-header">Generated on:</div>
3876
  <div class="system-info-content">
3877
- Linux x86_64 | Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35
3878
  </div>
3879
  </div>
3880
 
@@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content {
3888
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3890
  </span> |
3891
- Cell: benchmark | 5.74s
3892
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3894
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3940,21 +3940,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L128_bfloat16
3940
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3941
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3942
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3943
- xformers_meff 9.51% 455.697us 55.56% 2.663ms 2.663ms 0.000us 0.00% 3.558ms 3.558ms 1
3944
- xformers_flash3::flash_fwd 4.08% 195.443us 45.35% 2.174ms 724.544us 0.000us 0.00% 3.558ms 1.186ms 3
3945
- flash_attn_3::fwd 1.49% 71.640us 41.28% 1.978ms 659.396us 2.651ms 100.00% 3.558ms 1.186ms 3
3946
- xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.653ms 100.06% 2.653ms 2.653ms 1
3947
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.651ms 100.00% 2.651ms 883.711us 3
3948
- Activity Buffer Request 37.88% 1.816ms 37.88% 1.816ms 1.816ms 906.719us 34.20% 906.719us 906.719us 1
3949
- aten::empty 0.75% 35.911us 0.75% 35.911us 5.985us 0.000us 0.00% 0.000us 0.000us 6
3950
- cudaFuncSetAttribute 0.26% 12.331us 0.26% 12.331us 4.110us 0.000us 0.00% 0.000us 0.000us 3
3951
- cudaLaunchKernel 0.89% 42.730us 0.89% 42.730us 14.243us 0.000us 0.00% 0.000us 0.000us 3
3952
- aten::reshape 0.24% 11.531us 0.69% 33.171us 5.529us 0.000us 0.00% 0.000us 0.000us 6
3953
- aten::view 0.45% 21.640us 0.45% 21.640us 3.607us 0.000us 0.00% 0.000us 0.000us 6
3954
- cudaDeviceSynchronize 44.44% 2.130ms 44.44% 2.130ms 2.130ms 0.000us 0.00% 0.000us 0.000us 1
3955
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3956
- Self CPU time total: 4.793ms
3957
- Self CUDA time total: 2.651ms
3958
 
3959
 
3960
 
@@ -3964,21 +3964,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L256_bfloat16
3964
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3965
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3966
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3967
- xformers_meff 6.26% 307.825us 49.96% 2.457ms 2.457ms 0.000us 0.00% 3.857ms 3.857ms 1
3968
- xformers_flash3::flash_fwd 2.96% 145.722us 43.25% 2.127ms 708.950us 0.000us 0.00% 3.857ms 1.286ms 3
3969
- flash_attn_3::fwd 1.03% 50.571us 40.29% 1.981ms 660.376us 2.878ms 100.00% 3.857ms 1.286ms 3
3970
- xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.879ms 100.06% 2.879ms 2.879ms 1
3971
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.878ms 100.00% 2.878ms 959.213us 3
3972
- Activity Buffer Request 37.86% 1.862ms 37.86% 1.862ms 1.862ms 979.202us 34.03% 979.202us 979.202us 1
3973
- aten::empty 0.61% 29.881us 0.61% 29.881us 4.980us 0.000us 0.00% 0.000us 0.000us 6
3974
- cudaFuncSetAttribute 0.11% 5.570us 0.11% 5.570us 1.857us 0.000us 0.00% 0.000us 0.000us 3
3975
- cudaLaunchKernel 0.67% 33.080us 0.67% 33.080us 11.027us 0.000us 0.00% 0.000us 0.000us 3
3976
- aten::reshape 0.18% 8.899us 0.46% 22.400us 3.733us 0.000us 0.00% 0.000us 0.000us 6
3977
- aten::view 0.27% 13.501us 0.27% 13.501us 2.250us 0.000us 0.00% 0.000us 0.000us 6
3978
- cudaDeviceSynchronize 50.04% 2.461ms 50.04% 2.461ms 2.461ms 0.000us 0.00% 0.000us 0.000us 1
3979
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3980
- Self CPU time total: 4.918ms
3981
- Self CUDA time total: 2.878ms
3982
 
3983
 
3984
 
@@ -3988,21 +3988,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L320_bfloat16
3988
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3989
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3990
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3991
- xformers_meff 6.21% 306.054us 48.92% 2.410ms 2.410ms 0.000us 0.00% 3.933ms 3.933ms 1
3992
- xformers_flash3::flash_fwd 2.99% 147.392us 42.27% 2.082ms 693.957us 0.000us 0.00% 3.933ms 1.311ms 3
3993
- flash_attn_3::fwd 1.07% 52.480us 39.27% 1.934ms 644.826us 2.941ms 100.00% 3.933ms 1.311ms 3
3994
- xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.942ms 100.05% 2.942ms 2.942ms 1
3995
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.941ms 100.00% 2.941ms 980.234us 3
3996
- Activity Buffer Request 36.76% 1.811ms 36.76% 1.811ms 1.811ms 991.807us 33.73% 991.807us 991.807us 1
3997
- aten::empty 0.60% 29.531us 0.60% 29.531us 4.922us 0.000us 0.00% 0.000us 0.000us 6
3998
- cudaFuncSetAttribute 0.13% 6.550us 0.13% 6.550us 2.183us 0.000us 0.00% 0.000us 0.000us 3
3999
- cudaLaunchKernel 0.71% 35.120us 0.71% 35.120us 11.707us 0.000us 0.00% 0.000us 0.000us 3
4000
- aten::reshape 0.17% 8.281us 0.44% 21.831us 3.638us 0.000us 0.00% 0.000us 0.000us 6
4001
- aten::view 0.28% 13.550us 0.28% 13.550us 2.258us 0.000us 0.00% 0.000us 0.000us 6
4002
- cudaDeviceSynchronize 51.08% 2.516ms 51.08% 2.516ms 2.516ms 0.000us 0.00% 0.000us 0.000us 1
4003
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4004
- Self CPU time total: 4.926ms
4005
- Self CUDA time total: 2.941ms
4006
 
4007
 
4008
 
@@ -4012,21 +4012,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L384_bfloat16
4012
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4013
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4014
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4015
- xformers_meff 6.17% 315.944us 50.13% 2.567ms 2.567ms 0.000us 0.00% 4.004ms 4.004ms 1
4016
- xformers_flash3::flash_fwd 2.87% 146.993us 43.50% 2.228ms 742.605us 0.000us 0.00% 4.004ms 1.335ms 3
4017
- flash_attn_3::fwd 0.96% 49.370us 40.63% 2.081ms 693.607us 2.988ms 100.00% 4.004ms 1.335ms 3
4018
- xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.990ms 100.05% 2.990ms 2.990ms 1
4019
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.988ms 100.00% 2.988ms 996.112us 3
4020
- Activity Buffer Request 35.27% 1.806ms 35.27% 1.806ms 1.806ms 1.016ms 34.00% 1.016ms 1.016ms 1
4021
- aten::empty 0.59% 30.371us 0.59% 30.371us 5.062us 0.000us 0.00% 0.000us 0.000us 6
4022
- cudaFuncSetAttribute 0.11% 5.580us 0.11% 5.580us 1.860us 0.000us 0.00% 0.000us 0.000us 3
4023
- cudaLaunchKernel 3.69% 189.213us 3.69% 189.213us 63.071us 0.000us 0.00% 0.000us 0.000us 3
4024
- aten::reshape 0.19% 9.850us 0.46% 23.640us 3.940us 0.000us 0.00% 0.000us 0.000us 6
4025
- aten::view 0.27% 13.790us 0.27% 13.790us 2.298us 0.000us 0.00% 0.000us 0.000us 6
4026
- cudaDeviceSynchronize 49.87% 2.554ms 49.87% 2.554ms 2.554ms 0.000us 0.00% 0.000us 0.000us 1
4027
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4028
- Self CPU time total: 5.122ms
4029
- Self CUDA time total: 2.988ms
4030
 
4031
 
4032
 
@@ -4036,21 +4036,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L448_bfloat16
4036
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4037
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4038
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4039
- xformers_meff 5.38% 306.205us 45.35% 2.581ms 2.581ms 0.000us 0.00% 4.704ms 4.704ms 1
4040
- xformers_flash3::flash_fwd 2.54% 144.312us 39.58% 2.253ms 750.894us 0.000us 0.00% 4.704ms 1.568ms 3
4041
- flash_attn_3::fwd 0.92% 52.341us 37.04% 2.108ms 702.790us 3.526ms 100.00% 4.704ms 1.568ms 3
4042
- xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.528ms 100.05% 3.528ms 3.528ms 1
4043
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.526ms 100.00% 3.526ms 1.175ms 3
4044
- Activity Buffer Request 32.26% 1.836ms 32.26% 1.836ms 1.836ms 1.177ms 33.39% 1.177ms 1.177ms 1
4045
- aten::empty 0.52% 29.660us 0.52% 29.660us 4.943us 0.000us 0.00% 0.000us 0.000us 6
4046
- cudaFuncSetAttribute 0.10% 5.499us 0.10% 5.499us 1.833us 0.000us 0.00% 0.000us 0.000us 3
4047
- cudaLaunchKernel 3.24% 184.684us 3.24% 184.684us 61.561us 0.000us 0.00% 0.000us 0.000us 3
4048
- aten::reshape 0.15% 8.640us 0.39% 22.430us 3.738us 0.000us 0.00% 0.000us 0.000us 6
4049
- aten::view 0.24% 13.790us 0.24% 13.790us 2.298us 0.000us 0.00% 0.000us 0.000us 6
4050
- cudaDeviceSynchronize 54.65% 3.111ms 54.65% 3.111ms 3.111ms 0.000us 0.00% 0.000us 0.000us 1
4051
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4052
- Self CPU time total: 5.692ms
4053
- Self CUDA time total: 3.526ms
4054
 
4055
 
4056
 
@@ -4060,37 +4060,37 @@ PROFILE TRACE: xformers_meff | cuda_attn_L512_bfloat16
4060
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4061
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4062
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4063
- xformers_meff 5.52% 307.264us 44.82% 2.494ms 2.494ms 0.000us 0.00% 4.662ms 4.662ms 1
4064
- xformers_flash3::flash_fwd 2.63% 146.303us 38.91% 2.164ms 721.461us 0.000us 0.00% 4.662ms 1.554ms 3
4065
- flash_attn_3::fwd 0.91% 50.371us 36.28% 2.018ms 672.693us 3.490ms 100.00% 4.662ms 1.554ms 3
4066
- xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.491ms 100.04% 3.491ms 3.491ms 1
4067
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.490ms 100.00% 3.490ms 1.163ms 3
4068
- Activity Buffer Request 31.37% 1.745ms 31.37% 1.745ms 1.745ms 1.172ms 33.59% 1.172ms 1.172ms 1
4069
- aten::empty 0.54% 29.920us 0.54% 29.920us 4.987us 0.000us 0.00% 0.000us 0.000us 6
4070
- cudaFuncSetAttribute 0.10% 5.750us 0.10% 5.750us 1.917us 0.000us 0.00% 0.000us 0.000us 3
4071
- cudaLaunchKernel 3.36% 187.102us 3.36% 187.102us 62.367us 0.000us 0.00% 0.000us 0.000us 3
4072
- aten::reshape 0.15% 8.539us 0.39% 21.890us 3.648us 0.000us 0.00% 0.000us 0.000us 6
4073
- aten::view 0.24% 13.351us 0.24% 13.351us 2.225us 0.000us 0.00% 0.000us 0.000us 6
4074
- cudaDeviceSynchronize 55.18% 3.069ms 55.18% 3.069ms 3.069ms 0.000us 0.00% 0.000us 0.000us 1
4075
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4076
- Self CPU time total: 5.563ms
4077
- Self CUDA time total: 3.490ms
4078
 
4079
 
4080
  impl wl p50(ms) ok
4081
- xformers_meff cuda_attn_L128_bfloat16 0.99 True
4082
- xformers_meff cuda_attn_L256_bfloat16 1.05 True
4083
- xformers_meff cuda_attn_L320_bfloat16 1.09 True
4084
- xformers_meff cuda_attn_L384_bfloat16 1.09 True
4085
- xformers_meff cuda_attn_L448_bfloat16 1.27 True
4086
- xformers_meff cuda_attn_L512_bfloat16 1.28 True
4087
  </pre></div>
4088
  <div class="uv-install-logs" id="uv-logs-benchmark">
4089
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4090
  <div class="uv-logs-content" style="display: none;">
4091
  Downloading xformers (111.8MiB)
4092
- Downloading xformers
4093
- Installed 1 package in 11ms
4094
  </div>
4095
  </div>
4096
  <div class="cell-artifacts">
 
3874
  <div class="system-info">
3875
  <div class="system-info-header">Generated on:</div>
3876
  <div class="system-info-content">
3877
+ Linux x86_64 | Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35
3878
  </div>
3879
  </div>
3880
 
 
3888
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3890
  </span> |
3891
+ Cell: benchmark | 8.92s
3892
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3894
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3940
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3941
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3942
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3943
+ xformers_meff 9.64% 463.299us 53.77% 2.584ms 2.584ms 0.000us 0.00% 3.636ms 3.636ms 1
3944
+ xformers_flash3::flash_fwd 3.92% 188.192us 43.38% 2.085ms 694.978us 0.000us 0.00% 3.636ms 1.212ms 3
3945
+ flash_attn_3::fwd 1.40% 67.082us 39.46% 1.897ms 632.248us 2.748ms 100.00% 3.636ms 1.212ms 3
3946
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.749ms 100.05% 2.749ms 2.749ms 1
3947
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.748ms 100.00% 2.748ms 915.935us 3
3948
+ Activity Buffer Request 36.10% 1.735ms 36.10% 1.735ms 1.735ms 887.807us 32.31% 887.807us 887.807us 1
3949
+ aten::empty 0.82% 39.381us 0.82% 39.381us 6.563us 0.000us 0.00% 0.000us 0.000us 6
3950
+ cudaFuncSetAttribute 0.26% 12.540us 0.26% 12.540us 4.180us 0.000us 0.00% 0.000us 0.000us 3
3951
+ cudaLaunchKernel 0.88% 42.510us 0.88% 42.510us 14.170us 0.000us 0.00% 0.000us 0.000us 3
3952
+ aten::reshape 0.25% 12.121us 0.75% 35.870us 5.978us 0.000us 0.00% 0.000us 0.000us 6
3953
+ aten::view 0.49% 23.749us 0.49% 23.749us 3.958us 0.000us 0.00% 0.000us 0.000us 6
3954
+ cudaDeviceSynchronize 46.23% 2.222ms 46.23% 2.222ms 2.222ms 0.000us 0.00% 0.000us 0.000us 1
3955
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3956
+ Self CPU time total: 4.806ms
3957
+ Self CUDA time total: 2.748ms
3958
 
3959
 
3960
 
 
3964
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3965
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3966
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3967
+ xformers_meff 6.94% 327.436us 51.65% 2.436ms 2.436ms 0.000us 0.00% 3.659ms 3.659ms 1
3968
+ xformers_flash3::flash_fwd 3.29% 155.063us 44.22% 2.085ms 695.085us 0.000us 0.00% 3.659ms 1.220ms 3
3969
+ flash_attn_3::fwd 1.15% 54.292us 40.93% 1.930ms 643.398us 2.737ms 100.00% 3.659ms 1.220ms 3
3970
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.738ms 100.05% 2.738ms 2.738ms 1
3971
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.737ms 100.00% 2.737ms 912.235us 3
3972
+ Activity Buffer Request 38.21% 1.802ms 38.21% 1.802ms 1.802ms 922.336us 33.70% 922.336us 922.336us 1
3973
+ aten::empty 0.70% 32.930us 0.70% 32.930us 5.488us 0.000us 0.00% 0.000us 0.000us 6
3974
+ cudaFuncSetAttribute 0.12% 5.760us 0.12% 5.760us 1.920us 0.000us 0.00% 0.000us 0.000us 3
3975
+ cudaLaunchKernel 0.75% 35.410us 0.75% 35.410us 11.803us 0.000us 0.00% 0.000us 0.000us 3
3976
+ aten::reshape 0.20% 9.409us 0.49% 22.989us 3.831us 0.000us 0.00% 0.000us 0.000us 6
3977
+ aten::view 0.29% 13.580us 0.29% 13.580us 2.263us 0.000us 0.00% 0.000us 0.000us 6
3978
+ cudaDeviceSynchronize 48.35% 2.280ms 48.35% 2.280ms 2.280ms 0.000us 0.00% 0.000us 0.000us 1
3979
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3980
+ Self CPU time total: 4.715ms
3981
+ Self CUDA time total: 2.737ms
3982
 
3983
 
3984
 
 
3988
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3989
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3990
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3991
+ xformers_meff 6.35% 296.325us 47.95% 2.238ms 2.238ms 0.000us 0.00% 3.787ms 3.787ms 1
3992
+ xformers_flash3::flash_fwd 2.95% 137.473us 41.12% 1.919ms 639.648us 0.000us 0.00% 3.787ms 1.262ms 3
3993
+ flash_attn_3::fwd 1.09% 50.850us 38.17% 1.781ms 593.823us 2.829ms 100.00% 3.787ms 1.262ms 3
3994
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.831ms 100.05% 2.831ms 2.831ms 1
3995
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.829ms 100.00% 2.829ms 943.127us 3
3996
+ Activity Buffer Request 35.64% 1.663ms 35.64% 1.663ms 1.663ms 957.186us 33.83% 957.186us 957.186us 1
3997
+ aten::empty 0.63% 29.301us 0.63% 29.301us 4.884us 0.000us 0.00% 0.000us 0.000us 6
3998
+ cudaFuncSetAttribute 0.11% 5.090us 0.11% 5.090us 1.697us 0.000us 0.00% 0.000us 0.000us 3
3999
+ cudaLaunchKernel 0.71% 33.151us 0.71% 33.151us 11.050us 0.000us 0.00% 0.000us 0.000us 3
4000
+ aten::reshape 0.18% 8.531us 0.48% 22.580us 3.763us 0.000us 0.00% 0.000us 0.000us 6
4001
+ aten::view 0.30% 14.049us 0.30% 14.049us 2.341us 0.000us 0.00% 0.000us 0.000us 6
4002
+ cudaDeviceSynchronize 52.05% 2.429ms 52.05% 2.429ms 2.429ms 0.000us 0.00% 0.000us 0.000us 1
4003
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4004
+ Self CPU time total: 4.667ms
4005
+ Self CUDA time total: 2.829ms
4006
 
4007
 
4008
 
 
4012
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4013
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4014
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4015
+ xformers_meff 6.11% 304.138us 50.43% 2.511ms 2.511ms 0.000us 0.00% 3.860ms 3.860ms 1
4016
+ xformers_flash3::flash_fwd 3.07% 152.860us 43.87% 2.184ms 727.989us 0.000us 0.00% 3.860ms 1.287ms 3
4017
+ flash_attn_3::fwd 1.07% 53.395us 40.80% 2.031ms 677.035us 2.883ms 100.00% 3.860ms 1.287ms 3
4018
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.885ms 100.05% 2.885ms 2.885ms 1
4019
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.883ms 100.00% 2.883ms 961.001us 3
4020
+ Activity Buffer Request 34.97% 1.741ms 34.97% 1.741ms 1.741ms 977.086us 33.89% 977.086us 977.086us 1
4021
+ aten::empty 0.66% 32.699us 0.66% 32.699us 5.450us 0.000us 0.00% 0.000us 0.000us 6
4022
+ cudaFuncSetAttribute 0.12% 6.109us 0.12% 6.109us 2.036us 0.000us 0.00% 0.000us 0.000us 3
4023
+ cudaLaunchKernel 3.98% 197.963us 3.98% 197.963us 65.988us 0.000us 0.00% 0.000us 0.000us 3
4024
+ aten::reshape 0.17% 8.489us 0.45% 22.539us 3.757us 0.000us 0.00% 0.000us 0.000us 6
4025
+ aten::view 0.28% 14.050us 0.28% 14.050us 2.342us 0.000us 0.00% 0.000us 0.000us 6
4026
+ cudaDeviceSynchronize 49.57% 2.468ms 49.57% 2.468ms 2.468ms 0.000us 0.00% 0.000us 0.000us 1
4027
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4028
+ Self CPU time total: 4.978ms
4029
+ Self CUDA time total: 2.883ms
4030
 
4031
 
4032
 
 
4036
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4037
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4038
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4039
+ xformers_meff 5.45% 299.105us 45.26% 2.482ms 2.482ms 0.000us 0.00% 4.556ms 4.556ms 1
4040
+ xformers_flash3::flash_fwd 2.57% 140.761us 39.42% 2.162ms 720.685us 0.000us 0.00% 4.556ms 1.519ms 3
4041
+ flash_attn_3::fwd 0.92% 50.555us 36.85% 2.021ms 673.765us 3.406ms 100.00% 4.556ms 1.519ms 3
4042
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.408ms 100.05% 3.408ms 3.408ms 1
4043
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.406ms 100.00% 3.406ms 1.135ms 3
4044
+ Activity Buffer Request 31.74% 1.741ms 31.74% 1.741ms 1.741ms 1.150ms 33.76% 1.150ms 1.150ms 1
4045
+ aten::empty 0.52% 28.258us 0.52% 28.258us 4.710us 0.000us 0.00% 0.000us 0.000us 6
4046
+ cudaFuncSetAttribute 0.10% 5.340us 0.10% 5.340us 1.780us 0.000us 0.00% 0.000us 0.000us 3
4047
+ cudaLaunchKernel 3.58% 196.453us 3.58% 196.453us 65.484us 0.000us 0.00% 0.000us 0.000us 3
4048
+ aten::reshape 0.14% 7.863us 0.39% 21.181us 3.530us 0.000us 0.00% 0.000us 0.000us 6
4049
+ aten::view 0.24% 13.318us 0.24% 13.318us 2.220us 0.000us 0.00% 0.000us 0.000us 6
4050
+ cudaDeviceSynchronize 54.74% 3.003ms 54.74% 3.003ms 3.003ms 0.000us 0.00% 0.000us 0.000us 1
4051
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4052
+ Self CPU time total: 5.485ms
4053
+ Self CUDA time total: 3.406ms
4054
 
4055
 
4056
 
 
4060
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4061
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4062
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4063
+ xformers_meff 5.08% 273.484us 44.98% 2.423ms 2.423ms 0.000us 0.00% 4.494ms 4.494ms 1
4064
+ xformers_flash3::flash_fwd 2.55% 137.253us 39.52% 2.129ms 709.536us 0.000us 0.00% 4.494ms 1.498ms 3
4065
+ flash_attn_3::fwd 0.94% 50.440us 36.97% 1.991ms 663.785us 3.366ms 100.00% 4.494ms 1.498ms 3
4066
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.368ms 100.05% 3.368ms 3.368ms 1
4067
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.366ms 100.00% 3.366ms 1.122ms 3
4068
+ Activity Buffer Request 31.81% 1.713ms 31.81% 1.713ms 1.713ms 1.127ms 33.48% 1.127ms 1.127ms 1
4069
+ aten::empty 0.56% 30.302us 0.56% 30.302us 5.050us 0.000us 0.00% 0.000us 0.000us 6
4070
+ cudaFuncSetAttribute 0.10% 5.300us 0.10% 5.300us 1.767us 0.000us 0.00% 0.000us 0.000us 3
4071
+ cudaLaunchKernel 3.56% 191.983us 3.56% 191.983us 63.994us 0.000us 0.00% 0.000us 0.000us 3
4072
+ aten::reshape 0.15% 8.029us 0.39% 20.930us 3.488us 0.000us 0.00% 0.000us 0.000us 6
4073
+ aten::view 0.24% 12.901us 0.24% 12.901us 2.150us 0.000us 0.00% 0.000us 0.000us 6
4074
+ cudaDeviceSynchronize 55.02% 2.964ms 55.02% 2.964ms 2.964ms 0.000us 0.00% 0.000us 0.000us 1
4075
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4076
+ Self CPU time total: 5.387ms
4077
+ Self CUDA time total: 3.366ms
4078
 
4079
 
4080
  impl wl p50(ms) ok
4081
+ xformers_meff cuda_attn_L128_bfloat16 0.98 True
4082
+ xformers_meff cuda_attn_L256_bfloat16 1.03 True
4083
+ xformers_meff cuda_attn_L320_bfloat16 1.06 True
4084
+ xformers_meff cuda_attn_L384_bfloat16 1.06 True
4085
+ xformers_meff cuda_attn_L448_bfloat16 1.25 True
4086
+ xformers_meff cuda_attn_L512_bfloat16 1.23 True
4087
  </pre></div>
4088
  <div class="uv-install-logs" id="uv-logs-benchmark">
4089
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4090
  <div class="uv-logs-content" style="display: none;">
4091
  Downloading xformers (111.8MiB)
4092
+ Downloaded xformers
4093
+ Installed 38 packages in 217ms
4094
  </div>
4095
  </div>
4096
  <div class="cell-artifacts">
flash_attn/results/artifacts/combine/latency.svg CHANGED

Git LFS Details

  • SHA256: d3a72737b2e51b137700eeffb292bcf730686137439c6c61e905fd6d06c1d87d
  • Pointer size: 130 Bytes
  • Size of remote file: 24.8 kB

Git LFS Details

  • SHA256: 42ecd5306fef7c29b246aeecde0a12e51ef4139ff514ca508f9c74968d64ef13
  • Pointer size: 130 Bytes
  • Size of remote file: 24.8 kB
flash_attn/results/cells/combine.py CHANGED
@@ -20,6 +20,7 @@ cache_env_map = {
20
  "HF Kernels Flash Attn": "UVNOTE_FILE_HF_KERNELS_FLASH_ATTN_BENCHMARK",
21
  "HF Kernels Flash Attn3": "UVNOTE_FILE_HF_KERNELS_FLASH_ATTN3_BENCHMARK",
22
  "SageAttention": "UVNOTE_FILE_SAGE_ATTENTION_BENCHMARK",
 
23
  }
24
 
25
  # Generate combined results with visualization
 
20
  "HF Kernels Flash Attn": "UVNOTE_FILE_HF_KERNELS_FLASH_ATTN_BENCHMARK",
21
  "HF Kernels Flash Attn3": "UVNOTE_FILE_HF_KERNELS_FLASH_ATTN3_BENCHMARK",
22
  "SageAttention": "UVNOTE_FILE_SAGE_ATTENTION_BENCHMARK",
23
+ # "Flash Attn CUTE": "UVNOTE_FILE_FLASH_ATTN_CUTE_BENCHMARK",
24
  }
25
 
26
  # Generate combined results with visualization
flash_attn/results/combined_results.html CHANGED
@@ -3874,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
3874
  <div class="system-info">
3875
  <div class="system-info-header">Generated on:</div>
3876
  <div class="system-info-content">
3877
- Linux x86_64 | Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35
3878
  </div>
3879
  </div>
3880
 
@@ -3889,11 +3889,11 @@ body[data-tool="eraser"] .main-content {
3889
  <rdf:RDF>
3890
  <ns2:Work>
3891
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
3892
- <dc:date>2025-11-10T22:12:19.411851</dc:date>
3893
  <dc:format>image/svg+xml</dc:format>
3894
  <dc:creator>
3895
  <ns2:Agent>
3896
- <dc:title>Matplotlib v3.10.7, https://matplotlib.org/</dc:title>
3897
  </ns2:Agent>
3898
  </dc:creator>
3899
  </ns2:Work>
@@ -3999,96 +3999,96 @@ body[data-tool="eraser"] .main-content {
3999
  <g id="matplotlib.axis_2">
4000
  <g id="ytick_1">
4001
  <g id="grid-y--2" class="grid grid-y">
4002
- <path d="M 47.81 410.22293 L 835.361742 410.22293 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4003
  </g>
4004
  <g id="line2d_7">
4005
  <defs>
4006
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4007
  </defs>
4008
  <g>
4009
- <use ns4:href="#m0fca2865ba" x="47.81" y="410.22293" style="stroke: #000000; stroke-width: 0.8" />
4010
  </g>
4011
  </g>
4012
  <g id="text_7">
4013
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="414.022149" transform="rotate(-0 40.81 414.022149)">1.0</text>
4014
  </g>
4015
  </g>
4016
  <g id="ytick_2">
4017
  <g id="grid-y--3" class="grid grid-y">
4018
- <path d="M 47.81 351.165867 L 835.361742 351.165867 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4019
  </g>
4020
  <g id="line2d_8">
4021
  <g>
4022
- <use ns4:href="#m0fca2865ba" x="47.81" y="351.165867" style="stroke: #000000; stroke-width: 0.8" />
4023
  </g>
4024
  </g>
4025
  <g id="text_8">
4026
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="354.965085" transform="rotate(-0 40.81 354.965085)">1.2</text>
4027
  </g>
4028
  </g>
4029
  <g id="ytick_3">
4030
  <g id="grid-y--4" class="grid grid-y">
4031
- <path d="M 47.81 292.108803 L 835.361742 292.108803 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4032
  </g>
4033
  <g id="line2d_9">
4034
  <g>
4035
- <use ns4:href="#m0fca2865ba" x="47.81" y="292.108803" style="stroke: #000000; stroke-width: 0.8" />
4036
  </g>
4037
  </g>
4038
  <g id="text_9">
4039
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="295.908021" transform="rotate(-0 40.81 295.908021)">1.4</text>
4040
  </g>
4041
  </g>
4042
  <g id="ytick_4">
4043
  <g id="grid-y--5" class="grid grid-y">
4044
- <path d="M 47.81 233.051739 L 835.361742 233.051739 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4045
  </g>
4046
  <g id="line2d_10">
4047
  <g>
4048
- <use ns4:href="#m0fca2865ba" x="47.81" y="233.051739" style="stroke: #000000; stroke-width: 0.8" />
4049
  </g>
4050
  </g>
4051
  <g id="text_10">
4052
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="236.850958" transform="rotate(-0 40.81 236.850958)">1.6</text>
4053
  </g>
4054
  </g>
4055
  <g id="ytick_5">
4056
  <g id="grid-y--6" class="grid grid-y">
4057
- <path d="M 47.81 173.994675 L 835.361742 173.994675 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4058
  </g>
4059
  <g id="line2d_11">
4060
  <g>
4061
- <use ns4:href="#m0fca2865ba" x="47.81" y="173.994675" style="stroke: #000000; stroke-width: 0.8" />
4062
  </g>
4063
  </g>
4064
  <g id="text_11">
4065
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="177.793894" transform="rotate(-0 40.81 177.793894)">1.8</text>
4066
  </g>
4067
  </g>
4068
  <g id="ytick_6">
4069
  <g id="grid-y--7" class="grid grid-y">
4070
- <path d="M 47.81 114.937611 L 835.361742 114.937611 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4071
  </g>
4072
  <g id="line2d_12">
4073
  <g>
4074
- <use ns4:href="#m0fca2865ba" x="47.81" y="114.937611" style="stroke: #000000; stroke-width: 0.8" />
4075
  </g>
4076
  </g>
4077
  <g id="text_12">
4078
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="118.73683" transform="rotate(-0 40.81 118.73683)">2.0</text>
4079
  </g>
4080
  </g>
4081
  <g id="ytick_7">
4082
  <g id="grid-y--8" class="grid grid-y">
4083
- <path d="M 47.81 55.880547 L 835.361742 55.880547 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4084
  </g>
4085
  <g id="line2d_13">
4086
  <g>
4087
- <use ns4:href="#m0fca2865ba" x="47.81" y="55.880547" style="stroke: #000000; stroke-width: 0.8" />
4088
  </g>
4089
  </g>
4090
  <g id="text_13">
4091
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="59.679766" transform="rotate(-0 40.81 59.679766)">2.2</text>
4092
  </g>
4093
  </g>
4094
  <g id="label--y" class="ylabel">
@@ -4096,73 +4096,73 @@ body[data-tool="eraser"] .main-content {
4096
  </g>
4097
  </g>
4098
  <g id="series--torch-flash-ma" class="series">
4099
- <path d="M 83.607806 342.171476 L 226.799032 327.838326 L 369.990258 321.335848 L 513.181484 313.422202 L 656.37271 269.362089 L 799.563935 256.968373 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4100
  <defs>
4101
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4102
  </defs>
4103
  <g clip-path="url(#p09feef2583)">
4104
- <use ns4:href="#md7efaf3aec" x="83.607806" y="342.171476" style="fill: #1f77b4; stroke: #1f77b4" />
4105
- <use ns4:href="#md7efaf3aec" x="226.799032" y="327.838326" style="fill: #1f77b4; stroke: #1f77b4" />
4106
- <use ns4:href="#md7efaf3aec" x="369.990258" y="321.335848" style="fill: #1f77b4; stroke: #1f77b4" />
4107
- <use ns4:href="#md7efaf3aec" x="513.181484" y="313.422202" style="fill: #1f77b4; stroke: #1f77b4" />
4108
- <use ns4:href="#md7efaf3aec" x="656.37271" y="269.362089" style="fill: #1f77b4; stroke: #1f77b4" />
4109
- <use ns4:href="#md7efaf3aec" x="799.563935" y="256.968373" style="fill: #1f77b4; stroke: #1f77b4" />
4110
  </g>
4111
  </g>
4112
  <g id="series--torch-mem-eff" class="series">
4113
- <path d="M 83.607806 164.114723 L 226.799032 132.043785 L 369.990258 127.641081 L 513.181484 106.687044 L 656.37271 109.897091 L 799.563935 45.999414 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4114
  <defs>
4115
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4116
  </defs>
4117
  <g clip-path="url(#p09feef2583)">
4118
- <use ns4:href="#m9b8c54d372" x="83.607806" y="164.114723" style="fill: #ff7f0e; stroke: #ff7f0e" />
4119
- <use ns4:href="#m9b8c54d372" x="226.799032" y="132.043785" style="fill: #ff7f0e; stroke: #ff7f0e" />
4120
- <use ns4:href="#m9b8c54d372" x="369.990258" y="127.641081" style="fill: #ff7f0e; stroke: #ff7f0e" />
4121
- <use ns4:href="#m9b8c54d372" x="513.181484" y="106.687044" style="fill: #ff7f0e; stroke: #ff7f0e" />
4122
- <use ns4:href="#m9b8c54d372" x="656.37271" y="109.897091" style="fill: #ff7f0e; stroke: #ff7f0e" />
4123
  <use ns4:href="#m9b8c54d372" x="799.563935" y="45.999414" style="fill: #ff7f0e; stroke: #ff7f0e" />
4124
  </g>
4125
  </g>
4126
  <g id="series--xformers-meff" class="series">
4127
- <path d="M 83.607806 411.919345 L 226.799032 395.817141 L 369.990258 385.039522 L 513.181484 383.051957 L 656.37271 330.00011 L 799.563935 328.916708 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #2ca02c; stroke-width: 1.5; stroke-linecap: square" />
4128
  <defs>
4129
  <path id="mc655281e0b" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #2ca02c" />
4130
  </defs>
4131
  <g clip-path="url(#p09feef2583)">
4132
- <use ns4:href="#mc655281e0b" x="83.607806" y="411.919345" style="fill: #2ca02c; stroke: #2ca02c" />
4133
- <use ns4:href="#mc655281e0b" x="226.799032" y="395.817141" style="fill: #2ca02c; stroke: #2ca02c" />
4134
- <use ns4:href="#mc655281e0b" x="369.990258" y="385.039522" style="fill: #2ca02c; stroke: #2ca02c" />
4135
- <use ns4:href="#mc655281e0b" x="513.181484" y="383.051957" style="fill: #2ca02c; stroke: #2ca02c" />
4136
- <use ns4:href="#mc655281e0b" x="656.37271" y="330.00011" style="fill: #2ca02c; stroke: #2ca02c" />
4137
- <use ns4:href="#mc655281e0b" x="799.563935" y="328.916708" style="fill: #2ca02c; stroke: #2ca02c" />
4138
  </g>
4139
  </g>
4140
  <g id="series--hf-kernels-flash-attn" class="series">
4141
- <path d="M 83.607806 414.272769 L 226.799032 397.812974 L 369.990258 390.52238 L 513.181484 385.260395 L 656.37271 334.695147 L 799.563935 335.332668 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #d62728; stroke-width: 1.5; stroke-linecap: square" />
4142
  <defs>
4143
  <path id="m61c8040d7e" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #d62728" />
4144
  </defs>
4145
  <g clip-path="url(#p09feef2583)">
4146
- <use ns4:href="#m61c8040d7e" x="83.607806" y="414.272769" style="fill: #d62728; stroke: #d62728" />
4147
- <use ns4:href="#m61c8040d7e" x="226.799032" y="397.812974" style="fill: #d62728; stroke: #d62728" />
4148
- <use ns4:href="#m61c8040d7e" x="369.990258" y="390.52238" style="fill: #d62728; stroke: #d62728" />
4149
- <use ns4:href="#m61c8040d7e" x="513.181484" y="385.260395" style="fill: #d62728; stroke: #d62728" />
4150
- <use ns4:href="#m61c8040d7e" x="656.37271" y="334.695147" style="fill: #d62728; stroke: #d62728" />
4151
- <use ns4:href="#m61c8040d7e" x="799.563935" y="335.332668" style="fill: #d62728; stroke: #d62728" />
4152
  </g>
4153
  </g>
4154
  <g id="series--hf-kernels-flash-attn3" class="series">
4155
- <path d="M 83.607806 428.387702 L 226.799032 416.230806 L 369.990258 396.443441 L 513.181484 397.952349 L 656.37271 345.907426 L 799.563935 348.015763 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #9467bd; stroke-width: 1.5; stroke-linecap: square" />
4156
  <defs>
4157
  <path id="m7cd35be9cc" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #9467bd" />
4158
  </defs>
4159
  <g clip-path="url(#p09feef2583)">
4160
  <use ns4:href="#m7cd35be9cc" x="83.607806" y="428.387702" style="fill: #9467bd; stroke: #9467bd" />
4161
- <use ns4:href="#m7cd35be9cc" x="226.799032" y="416.230806" style="fill: #9467bd; stroke: #9467bd" />
4162
- <use ns4:href="#m7cd35be9cc" x="369.990258" y="396.443441" style="fill: #9467bd; stroke: #9467bd" />
4163
- <use ns4:href="#m7cd35be9cc" x="513.181484" y="397.952349" style="fill: #9467bd; stroke: #9467bd" />
4164
- <use ns4:href="#m7cd35be9cc" x="656.37271" y="345.907426" style="fill: #9467bd; stroke: #9467bd" />
4165
- <use ns4:href="#m7cd35be9cc" x="799.563935" y="348.015763" style="fill: #9467bd; stroke: #9467bd" />
4166
  </g>
4167
  </g>
4168
  <g id="patch_3">
@@ -4247,12 +4247,12 @@ body[data-tool="eraser"] .main-content {
4247
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
4248
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
4249
  </span> |
4250
- Cell: combine | 4.53s
4251
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
4252
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
4253
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
4254
  </div>
4255
- <div id="code-combine" class="cell-code collapsed" data-lines="30">
4256
  <div class="highlight-with-lines">
4257
  <div class="line-numbers" id="lines-combine">
4258
  <a class="line-number" data-cell="combine" data-line="1" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 1, true);">1</a>
@@ -4285,6 +4285,7 @@ Cell: combine | 4.53s
4285
  <a class="line-number" data-cell="combine" data-line="28" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 28, true);">28</a>
4286
  <a class="line-number" data-cell="combine" data-line="29" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 29, true);">29</a>
4287
  <a class="line-number" data-cell="combine" data-line="30" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 30, true);">30</a>
 
4288
  </div>
4289
  <div class="code-wrap">
4290
  <div class="highlight"><pre><span></span><span class="c1"># /// script</span>
@@ -4309,6 +4310,7 @@ Cell: combine | 4.53s
4309
  <span class="s2">&quot;HF Kernels Flash Attn&quot;</span><span class="p">:</span> <span class="s2">&quot;UVNOTE_FILE_HF_KERNELS_FLASH_ATTN_BENCHMARK&quot;</span><span class="p">,</span>
4310
  <span class="s2">&quot;HF Kernels Flash Attn3&quot;</span><span class="p">:</span> <span class="s2">&quot;UVNOTE_FILE_HF_KERNELS_FLASH_ATTN3_BENCHMARK&quot;</span><span class="p">,</span>
4311
  <span class="s2">&quot;SageAttention&quot;</span><span class="p">:</span> <span class="s2">&quot;UVNOTE_FILE_SAGE_ATTENTION_BENCHMARK&quot;</span><span class="p">,</span>
 
4312
  <span class="p">}</span>
4313
 
4314
  <span class="c1"># Generate combined results with visualization</span>
@@ -4354,48 +4356,48 @@ Summary: 6 found, 0 skipped, 0 missing
4354
  COMBINED BENCHMARK SUMMARY
4355
 
4356
  impl wl p50(ms) ok
4357
- hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.99 True
4358
- hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.04 True
4359
- hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.07 True
4360
- hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.08 True
4361
- hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.26 True
4362
- hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.25 True
4363
- hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.94 True
4364
- hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.98 True
4365
- hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.05 True
4366
- hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.04 True
4367
- hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.22 True
4368
- hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.21 True
4369
  sage_int8_fp16 cuda_attn_L128_bfloat16 FAIL False
4370
- Error: module &#x27;sage_attention_d37081df98a5208e&#x27; has no attribute &#x27;fwd&#x27;
4371
  sage_int8_fp16 cuda_attn_L256_bfloat16 FAIL False
4372
- Error: module &#x27;sage_attention_d37081df98a5208e&#x27; has no attribute &#x27;fwd&#x27;
4373
  sage_int8_fp16 cuda_attn_L320_bfloat16 FAIL False
4374
- Error: module &#x27;sage_attention_d37081df98a5208e&#x27; has no attribute &#x27;fwd&#x27;
4375
  sage_int8_fp16 cuda_attn_L384_bfloat16 FAIL False
4376
- Error: module &#x27;sage_attention_d37081df98a5208e&#x27; has no attribute &#x27;fwd&#x27;
4377
  sage_int8_fp16 cuda_attn_L448_bfloat16 FAIL False
4378
- Error: module &#x27;sage_attention_d37081df98a5208e&#x27; has no attribute &#x27;fwd&#x27;
4379
  sage_int8_fp16 cuda_attn_L512_bfloat16 FAIL False
4380
- Error: module &#x27;sage_attention_d37081df98a5208e&#x27; has no attribute &#x27;fwd&#x27;
4381
- torch_flash_ma cuda_attn_L128_bfloat16 1.23 True
4382
- torch_flash_ma cuda_attn_L256_bfloat16 1.28 True
4383
- torch_flash_ma cuda_attn_L320_bfloat16 1.30 True
4384
- torch_flash_ma cuda_attn_L384_bfloat16 1.33 True
4385
- torch_flash_ma cuda_attn_L448_bfloat16 1.48 True
4386
- torch_flash_ma cuda_attn_L512_bfloat16 1.52 True
4387
- torch_mem_eff cuda_attn_L128_bfloat16 1.83 True
4388
- torch_mem_eff cuda_attn_L256_bfloat16 1.94 True
4389
- torch_mem_eff cuda_attn_L320_bfloat16 1.96 True
4390
- torch_mem_eff cuda_attn_L384_bfloat16 2.03 True
4391
- torch_mem_eff cuda_attn_L448_bfloat16 2.02 True
4392
- torch_mem_eff cuda_attn_L512_bfloat16 2.23 True
4393
- xformers_meff cuda_attn_L128_bfloat16 0.99 True
4394
- xformers_meff cuda_attn_L256_bfloat16 1.05 True
4395
- xformers_meff cuda_attn_L320_bfloat16 1.09 True
4396
- xformers_meff cuda_attn_L384_bfloat16 1.09 True
4397
- xformers_meff cuda_attn_L448_bfloat16 1.27 True
4398
- xformers_meff cuda_attn_L512_bfloat16 1.28 True
4399
 
4400
  GENERATING COMBINED VISUALIZATION
4401
 
@@ -4419,7 +4421,7 @@ Implementations included:
4419
  <div class="uv-install-logs" id="uv-logs-combine">
4420
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4421
  <div class="uv-logs-content" style="display: none;">
4422
- Installed 37 packages in 327ms
4423
  </div>
4424
  </div>
4425
  <div class="cell-artifacts">
@@ -4432,11 +4434,11 @@ Installed 37 packages in 327ms
4432
  <rdf:RDF>
4433
  <ns2:Work>
4434
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4435
- <dc:date>2025-11-10T22:12:19.411851</dc:date>
4436
  <dc:format>image/svg+xml</dc:format>
4437
  <dc:creator>
4438
  <ns2:Agent>
4439
- <dc:title>Matplotlib v3.10.7, https://matplotlib.org/</dc:title>
4440
  </ns2:Agent>
4441
  </dc:creator>
4442
  </ns2:Work>
@@ -4542,96 +4544,96 @@ Installed 37 packages in 327ms
4542
  <g id="matplotlib.axis_2">
4543
  <g id="ytick_1">
4544
  <g id="grid-y--2" class="grid grid-y">
4545
- <path d="M 47.81 410.22293 L 835.361742 410.22293 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4546
  </g>
4547
  <g id="line2d_7">
4548
  <defs>
4549
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4550
  </defs>
4551
  <g>
4552
- <use ns4:href="#m0fca2865ba" x="47.81" y="410.22293" style="stroke: #000000; stroke-width: 0.8" />
4553
  </g>
4554
  </g>
4555
  <g id="text_7">
4556
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="414.022149" transform="rotate(-0 40.81 414.022149)">1.0</text>
4557
  </g>
4558
  </g>
4559
  <g id="ytick_2">
4560
  <g id="grid-y--3" class="grid grid-y">
4561
- <path d="M 47.81 351.165867 L 835.361742 351.165867 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4562
  </g>
4563
  <g id="line2d_8">
4564
  <g>
4565
- <use ns4:href="#m0fca2865ba" x="47.81" y="351.165867" style="stroke: #000000; stroke-width: 0.8" />
4566
  </g>
4567
  </g>
4568
  <g id="text_8">
4569
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="354.965085" transform="rotate(-0 40.81 354.965085)">1.2</text>
4570
  </g>
4571
  </g>
4572
  <g id="ytick_3">
4573
  <g id="grid-y--4" class="grid grid-y">
4574
- <path d="M 47.81 292.108803 L 835.361742 292.108803 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4575
  </g>
4576
  <g id="line2d_9">
4577
  <g>
4578
- <use ns4:href="#m0fca2865ba" x="47.81" y="292.108803" style="stroke: #000000; stroke-width: 0.8" />
4579
  </g>
4580
  </g>
4581
  <g id="text_9">
4582
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="295.908021" transform="rotate(-0 40.81 295.908021)">1.4</text>
4583
  </g>
4584
  </g>
4585
  <g id="ytick_4">
4586
  <g id="grid-y--5" class="grid grid-y">
4587
- <path d="M 47.81 233.051739 L 835.361742 233.051739 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4588
  </g>
4589
  <g id="line2d_10">
4590
  <g>
4591
- <use ns4:href="#m0fca2865ba" x="47.81" y="233.051739" style="stroke: #000000; stroke-width: 0.8" />
4592
  </g>
4593
  </g>
4594
  <g id="text_10">
4595
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="236.850958" transform="rotate(-0 40.81 236.850958)">1.6</text>
4596
  </g>
4597
  </g>
4598
  <g id="ytick_5">
4599
  <g id="grid-y--6" class="grid grid-y">
4600
- <path d="M 47.81 173.994675 L 835.361742 173.994675 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4601
  </g>
4602
  <g id="line2d_11">
4603
  <g>
4604
- <use ns4:href="#m0fca2865ba" x="47.81" y="173.994675" style="stroke: #000000; stroke-width: 0.8" />
4605
  </g>
4606
  </g>
4607
  <g id="text_11">
4608
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="177.793894" transform="rotate(-0 40.81 177.793894)">1.8</text>
4609
  </g>
4610
  </g>
4611
  <g id="ytick_6">
4612
  <g id="grid-y--7" class="grid grid-y">
4613
- <path d="M 47.81 114.937611 L 835.361742 114.937611 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4614
  </g>
4615
  <g id="line2d_12">
4616
  <g>
4617
- <use ns4:href="#m0fca2865ba" x="47.81" y="114.937611" style="stroke: #000000; stroke-width: 0.8" />
4618
  </g>
4619
  </g>
4620
  <g id="text_12">
4621
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="118.73683" transform="rotate(-0 40.81 118.73683)">2.0</text>
4622
  </g>
4623
  </g>
4624
  <g id="ytick_7">
4625
  <g id="grid-y--8" class="grid grid-y">
4626
- <path d="M 47.81 55.880547 L 835.361742 55.880547 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4627
  </g>
4628
  <g id="line2d_13">
4629
  <g>
4630
- <use ns4:href="#m0fca2865ba" x="47.81" y="55.880547" style="stroke: #000000; stroke-width: 0.8" />
4631
  </g>
4632
  </g>
4633
  <g id="text_13">
4634
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="59.679766" transform="rotate(-0 40.81 59.679766)">2.2</text>
4635
  </g>
4636
  </g>
4637
  <g id="label--y" class="ylabel">
@@ -4639,73 +4641,73 @@ Installed 37 packages in 327ms
4639
  </g>
4640
  </g>
4641
  <g id="series--torch-flash-ma" class="series">
4642
- <path d="M 83.607806 342.171476 L 226.799032 327.838326 L 369.990258 321.335848 L 513.181484 313.422202 L 656.37271 269.362089 L 799.563935 256.968373 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4643
  <defs>
4644
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4645
  </defs>
4646
  <g clip-path="url(#p09feef2583)">
4647
- <use ns4:href="#md7efaf3aec" x="83.607806" y="342.171476" style="fill: #1f77b4; stroke: #1f77b4" />
4648
- <use ns4:href="#md7efaf3aec" x="226.799032" y="327.838326" style="fill: #1f77b4; stroke: #1f77b4" />
4649
- <use ns4:href="#md7efaf3aec" x="369.990258" y="321.335848" style="fill: #1f77b4; stroke: #1f77b4" />
4650
- <use ns4:href="#md7efaf3aec" x="513.181484" y="313.422202" style="fill: #1f77b4; stroke: #1f77b4" />
4651
- <use ns4:href="#md7efaf3aec" x="656.37271" y="269.362089" style="fill: #1f77b4; stroke: #1f77b4" />
4652
- <use ns4:href="#md7efaf3aec" x="799.563935" y="256.968373" style="fill: #1f77b4; stroke: #1f77b4" />
4653
  </g>
4654
  </g>
4655
  <g id="series--torch-mem-eff" class="series">
4656
- <path d="M 83.607806 164.114723 L 226.799032 132.043785 L 369.990258 127.641081 L 513.181484 106.687044 L 656.37271 109.897091 L 799.563935 45.999414 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4657
  <defs>
4658
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4659
  </defs>
4660
  <g clip-path="url(#p09feef2583)">
4661
- <use ns4:href="#m9b8c54d372" x="83.607806" y="164.114723" style="fill: #ff7f0e; stroke: #ff7f0e" />
4662
- <use ns4:href="#m9b8c54d372" x="226.799032" y="132.043785" style="fill: #ff7f0e; stroke: #ff7f0e" />
4663
- <use ns4:href="#m9b8c54d372" x="369.990258" y="127.641081" style="fill: #ff7f0e; stroke: #ff7f0e" />
4664
- <use ns4:href="#m9b8c54d372" x="513.181484" y="106.687044" style="fill: #ff7f0e; stroke: #ff7f0e" />
4665
- <use ns4:href="#m9b8c54d372" x="656.37271" y="109.897091" style="fill: #ff7f0e; stroke: #ff7f0e" />
4666
  <use ns4:href="#m9b8c54d372" x="799.563935" y="45.999414" style="fill: #ff7f0e; stroke: #ff7f0e" />
4667
  </g>
4668
  </g>
4669
  <g id="series--xformers-meff" class="series">
4670
- <path d="M 83.607806 411.919345 L 226.799032 395.817141 L 369.990258 385.039522 L 513.181484 383.051957 L 656.37271 330.00011 L 799.563935 328.916708 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #2ca02c; stroke-width: 1.5; stroke-linecap: square" />
4671
  <defs>
4672
  <path id="mc655281e0b" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #2ca02c" />
4673
  </defs>
4674
  <g clip-path="url(#p09feef2583)">
4675
- <use ns4:href="#mc655281e0b" x="83.607806" y="411.919345" style="fill: #2ca02c; stroke: #2ca02c" />
4676
- <use ns4:href="#mc655281e0b" x="226.799032" y="395.817141" style="fill: #2ca02c; stroke: #2ca02c" />
4677
- <use ns4:href="#mc655281e0b" x="369.990258" y="385.039522" style="fill: #2ca02c; stroke: #2ca02c" />
4678
- <use ns4:href="#mc655281e0b" x="513.181484" y="383.051957" style="fill: #2ca02c; stroke: #2ca02c" />
4679
- <use ns4:href="#mc655281e0b" x="656.37271" y="330.00011" style="fill: #2ca02c; stroke: #2ca02c" />
4680
- <use ns4:href="#mc655281e0b" x="799.563935" y="328.916708" style="fill: #2ca02c; stroke: #2ca02c" />
4681
  </g>
4682
  </g>
4683
  <g id="series--hf-kernels-flash-attn" class="series">
4684
- <path d="M 83.607806 414.272769 L 226.799032 397.812974 L 369.990258 390.52238 L 513.181484 385.260395 L 656.37271 334.695147 L 799.563935 335.332668 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #d62728; stroke-width: 1.5; stroke-linecap: square" />
4685
  <defs>
4686
  <path id="m61c8040d7e" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #d62728" />
4687
  </defs>
4688
  <g clip-path="url(#p09feef2583)">
4689
- <use ns4:href="#m61c8040d7e" x="83.607806" y="414.272769" style="fill: #d62728; stroke: #d62728" />
4690
- <use ns4:href="#m61c8040d7e" x="226.799032" y="397.812974" style="fill: #d62728; stroke: #d62728" />
4691
- <use ns4:href="#m61c8040d7e" x="369.990258" y="390.52238" style="fill: #d62728; stroke: #d62728" />
4692
- <use ns4:href="#m61c8040d7e" x="513.181484" y="385.260395" style="fill: #d62728; stroke: #d62728" />
4693
- <use ns4:href="#m61c8040d7e" x="656.37271" y="334.695147" style="fill: #d62728; stroke: #d62728" />
4694
- <use ns4:href="#m61c8040d7e" x="799.563935" y="335.332668" style="fill: #d62728; stroke: #d62728" />
4695
  </g>
4696
  </g>
4697
  <g id="series--hf-kernels-flash-attn3" class="series">
4698
- <path d="M 83.607806 428.387702 L 226.799032 416.230806 L 369.990258 396.443441 L 513.181484 397.952349 L 656.37271 345.907426 L 799.563935 348.015763 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #9467bd; stroke-width: 1.5; stroke-linecap: square" />
4699
  <defs>
4700
  <path id="m7cd35be9cc" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #9467bd" />
4701
  </defs>
4702
  <g clip-path="url(#p09feef2583)">
4703
  <use ns4:href="#m7cd35be9cc" x="83.607806" y="428.387702" style="fill: #9467bd; stroke: #9467bd" />
4704
- <use ns4:href="#m7cd35be9cc" x="226.799032" y="416.230806" style="fill: #9467bd; stroke: #9467bd" />
4705
- <use ns4:href="#m7cd35be9cc" x="369.990258" y="396.443441" style="fill: #9467bd; stroke: #9467bd" />
4706
- <use ns4:href="#m7cd35be9cc" x="513.181484" y="397.952349" style="fill: #9467bd; stroke: #9467bd" />
4707
- <use ns4:href="#m7cd35be9cc" x="656.37271" y="345.907426" style="fill: #9467bd; stroke: #9467bd" />
4708
- <use ns4:href="#m7cd35be9cc" x="799.563935" y="348.015763" style="fill: #9467bd; stroke: #9467bd" />
4709
  </g>
4710
  </g>
4711
  <g id="patch_3">
 
3874
  <div class="system-info">
3875
  <div class="system-info-header">Generated on:</div>
3876
  <div class="system-info-content">
3877
+ Linux x86_64 | Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35
3878
  </div>
3879
  </div>
3880
 
 
3889
  <rdf:RDF>
3890
  <ns2:Work>
3891
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
3892
+ <dc:date>2025-12-19T19:09:55.297355</dc:date>
3893
  <dc:format>image/svg+xml</dc:format>
3894
  <dc:creator>
3895
  <ns2:Agent>
3896
+ <dc:title>Matplotlib v3.10.8, https://matplotlib.org/</dc:title>
3897
  </ns2:Agent>
3898
  </dc:creator>
3899
  </ns2:Work>
 
3999
  <g id="matplotlib.axis_2">
4000
  <g id="ytick_1">
4001
  <g id="grid-y--2" class="grid grid-y">
4002
+ <path d="M 47.81 404.469232 L 835.361742 404.469232 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4003
  </g>
4004
  <g id="line2d_7">
4005
  <defs>
4006
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4007
  </defs>
4008
  <g>
4009
+ <use ns4:href="#m0fca2865ba" x="47.81" y="404.469232" style="stroke: #000000; stroke-width: 0.8" />
4010
  </g>
4011
  </g>
4012
  <g id="text_7">
4013
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="408.26845" transform="rotate(-0 40.81 408.26845)">1.0</text>
4014
  </g>
4015
  </g>
4016
  <g id="ytick_2">
4017
  <g id="grid-y--3" class="grid grid-y">
4018
+ <path d="M 47.81 347.147903 L 835.361742 347.147903 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4019
  </g>
4020
  <g id="line2d_8">
4021
  <g>
4022
+ <use ns4:href="#m0fca2865ba" x="47.81" y="347.147903" style="stroke: #000000; stroke-width: 0.8" />
4023
  </g>
4024
  </g>
4025
  <g id="text_8">
4026
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="350.947122" transform="rotate(-0 40.81 350.947122)">1.2</text>
4027
  </g>
4028
  </g>
4029
  <g id="ytick_3">
4030
  <g id="grid-y--4" class="grid grid-y">
4031
+ <path d="M 47.81 289.826575 L 835.361742 289.826575 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4032
  </g>
4033
  <g id="line2d_9">
4034
  <g>
4035
+ <use ns4:href="#m0fca2865ba" x="47.81" y="289.826575" style="stroke: #000000; stroke-width: 0.8" />
4036
  </g>
4037
  </g>
4038
  <g id="text_9">
4039
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="293.625794" transform="rotate(-0 40.81 293.625794)">1.4</text>
4040
  </g>
4041
  </g>
4042
  <g id="ytick_4">
4043
  <g id="grid-y--5" class="grid grid-y">
4044
+ <path d="M 47.81 232.505247 L 835.361742 232.505247 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4045
  </g>
4046
  <g id="line2d_10">
4047
  <g>
4048
+ <use ns4:href="#m0fca2865ba" x="47.81" y="232.505247" style="stroke: #000000; stroke-width: 0.8" />
4049
  </g>
4050
  </g>
4051
  <g id="text_10">
4052
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="236.304466" transform="rotate(-0 40.81 236.304466)">1.6</text>
4053
  </g>
4054
  </g>
4055
  <g id="ytick_5">
4056
  <g id="grid-y--6" class="grid grid-y">
4057
+ <path d="M 47.81 175.183919 L 835.361742 175.183919 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4058
  </g>
4059
  <g id="line2d_11">
4060
  <g>
4061
+ <use ns4:href="#m0fca2865ba" x="47.81" y="175.183919" style="stroke: #000000; stroke-width: 0.8" />
4062
  </g>
4063
  </g>
4064
  <g id="text_11">
4065
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="178.983137" transform="rotate(-0 40.81 178.983137)">1.8</text>
4066
  </g>
4067
  </g>
4068
  <g id="ytick_6">
4069
  <g id="grid-y--7" class="grid grid-y">
4070
+ <path d="M 47.81 117.86259 L 835.361742 117.86259 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4071
  </g>
4072
  <g id="line2d_12">
4073
  <g>
4074
+ <use ns4:href="#m0fca2865ba" x="47.81" y="117.86259" style="stroke: #000000; stroke-width: 0.8" />
4075
  </g>
4076
  </g>
4077
  <g id="text_12">
4078
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="121.661809" transform="rotate(-0 40.81 121.661809)">2.0</text>
4079
  </g>
4080
  </g>
4081
  <g id="ytick_7">
4082
  <g id="grid-y--8" class="grid grid-y">
4083
+ <path d="M 47.81 60.541262 L 835.361742 60.541262 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4084
  </g>
4085
  <g id="line2d_13">
4086
  <g>
4087
+ <use ns4:href="#m0fca2865ba" x="47.81" y="60.541262" style="stroke: #000000; stroke-width: 0.8" />
4088
  </g>
4089
  </g>
4090
  <g id="text_13">
4091
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="64.340481" transform="rotate(-0 40.81 64.340481)">2.2</text>
4092
  </g>
4093
  </g>
4094
  <g id="label--y" class="ylabel">
 
4096
  </g>
4097
  </g>
4098
  <g id="series--torch-flash-ma" class="series">
4099
+ <path d="M 83.607806 346.603064 L 226.799032 331.148661 L 369.990258 322.7508 L 513.181484 313.642154 L 656.37271 270.506995 L 799.563935 259.742049 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4100
  <defs>
4101
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4102
  </defs>
4103
  <g clip-path="url(#p09feef2583)">
4104
+ <use ns4:href="#md7efaf3aec" x="83.607806" y="346.603064" style="fill: #1f77b4; stroke: #1f77b4" />
4105
+ <use ns4:href="#md7efaf3aec" x="226.799032" y="331.148661" style="fill: #1f77b4; stroke: #1f77b4" />
4106
+ <use ns4:href="#md7efaf3aec" x="369.990258" y="322.7508" style="fill: #1f77b4; stroke: #1f77b4" />
4107
+ <use ns4:href="#md7efaf3aec" x="513.181484" y="313.642154" style="fill: #1f77b4; stroke: #1f77b4" />
4108
+ <use ns4:href="#md7efaf3aec" x="656.37271" y="270.506995" style="fill: #1f77b4; stroke: #1f77b4" />
4109
+ <use ns4:href="#md7efaf3aec" x="799.563935" y="259.742049" style="fill: #1f77b4; stroke: #1f77b4" />
4110
  </g>
4111
  </g>
4112
  <g id="series--torch-mem-eff" class="series">
4113
+ <path d="M 83.607806 162.593002 L 226.799032 131.641491 L 369.990258 126.594348 L 513.181484 96.170767 L 656.37271 105.428161 L 799.563935 45.999414 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4114
  <defs>
4115
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4116
  </defs>
4117
  <g clip-path="url(#p09feef2583)">
4118
+ <use ns4:href="#m9b8c54d372" x="83.607806" y="162.593002" style="fill: #ff7f0e; stroke: #ff7f0e" />
4119
+ <use ns4:href="#m9b8c54d372" x="226.799032" y="131.641491" style="fill: #ff7f0e; stroke: #ff7f0e" />
4120
+ <use ns4:href="#m9b8c54d372" x="369.990258" y="126.594348" style="fill: #ff7f0e; stroke: #ff7f0e" />
4121
+ <use ns4:href="#m9b8c54d372" x="513.181484" y="96.170767" style="fill: #ff7f0e; stroke: #ff7f0e" />
4122
+ <use ns4:href="#m9b8c54d372" x="656.37271" y="105.428161" style="fill: #ff7f0e; stroke: #ff7f0e" />
4123
  <use ns4:href="#m9b8c54d372" x="799.563935" y="45.999414" style="fill: #ff7f0e; stroke: #ff7f0e" />
4124
  </g>
4125
  </g>
4126
  <g id="series--xformers-meff" class="series">
4127
+ <path d="M 83.607806 410.706939 L 226.799032 396.737158 L 369.990258 386.568354 L 513.181484 386.536541 L 656.37271 333.774551 L 799.563935 337.388661 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #2ca02c; stroke-width: 1.5; stroke-linecap: square" />
4128
  <defs>
4129
  <path id="mc655281e0b" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #2ca02c" />
4130
  </defs>
4131
  <g clip-path="url(#p09feef2583)">
4132
+ <use ns4:href="#mc655281e0b" x="83.607806" y="410.706939" style="fill: #2ca02c; stroke: #2ca02c" />
4133
+ <use ns4:href="#mc655281e0b" x="226.799032" y="396.737158" style="fill: #2ca02c; stroke: #2ca02c" />
4134
+ <use ns4:href="#mc655281e0b" x="369.990258" y="386.568354" style="fill: #2ca02c; stroke: #2ca02c" />
4135
+ <use ns4:href="#mc655281e0b" x="513.181484" y="386.536541" style="fill: #2ca02c; stroke: #2ca02c" />
4136
+ <use ns4:href="#mc655281e0b" x="656.37271" y="333.774551" style="fill: #2ca02c; stroke: #2ca02c" />
4137
+ <use ns4:href="#mc655281e0b" x="799.563935" y="337.388661" style="fill: #2ca02c; stroke: #2ca02c" />
4138
  </g>
4139
  </g>
4140
  <g id="series--hf-kernels-flash-attn" class="series">
4141
+ <path d="M 83.607806 416.940633 L 226.799032 399.984697 L 369.990258 390.841946 L 513.181484 387.029791 L 656.37271 344.433452 L 799.563935 341.857145 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #d62728; stroke-width: 1.5; stroke-linecap: square" />
4142
  <defs>
4143
  <path id="m61c8040d7e" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #d62728" />
4144
  </defs>
4145
  <g clip-path="url(#p09feef2583)">
4146
+ <use ns4:href="#m61c8040d7e" x="83.607806" y="416.940633" style="fill: #d62728; stroke: #d62728" />
4147
+ <use ns4:href="#m61c8040d7e" x="226.799032" y="399.984697" style="fill: #d62728; stroke: #d62728" />
4148
+ <use ns4:href="#m61c8040d7e" x="369.990258" y="390.841946" style="fill: #d62728; stroke: #d62728" />
4149
+ <use ns4:href="#m61c8040d7e" x="513.181484" y="387.029791" style="fill: #d62728; stroke: #d62728" />
4150
+ <use ns4:href="#m61c8040d7e" x="656.37271" y="344.433452" style="fill: #d62728; stroke: #d62728" />
4151
+ <use ns4:href="#m61c8040d7e" x="799.563935" y="341.857145" style="fill: #d62728; stroke: #d62728" />
4152
  </g>
4153
  </g>
4154
  <g id="series--hf-kernels-flash-attn3" class="series">
4155
+ <path d="M 83.607806 428.387702 L 226.799032 412.171498 L 369.990258 404.997448 L 513.181484 400.314295 L 656.37271 358.798463 L 799.563935 355.895138 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #9467bd; stroke-width: 1.5; stroke-linecap: square" />
4156
  <defs>
4157
  <path id="m7cd35be9cc" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #9467bd" />
4158
  </defs>
4159
  <g clip-path="url(#p09feef2583)">
4160
  <use ns4:href="#m7cd35be9cc" x="83.607806" y="428.387702" style="fill: #9467bd; stroke: #9467bd" />
4161
+ <use ns4:href="#m7cd35be9cc" x="226.799032" y="412.171498" style="fill: #9467bd; stroke: #9467bd" />
4162
+ <use ns4:href="#m7cd35be9cc" x="369.990258" y="404.997448" style="fill: #9467bd; stroke: #9467bd" />
4163
+ <use ns4:href="#m7cd35be9cc" x="513.181484" y="400.314295" style="fill: #9467bd; stroke: #9467bd" />
4164
+ <use ns4:href="#m7cd35be9cc" x="656.37271" y="358.798463" style="fill: #9467bd; stroke: #9467bd" />
4165
+ <use ns4:href="#m7cd35be9cc" x="799.563935" y="355.895138" style="fill: #9467bd; stroke: #9467bd" />
4166
  </g>
4167
  </g>
4168
  <g id="patch_3">
 
4247
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
4248
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
4249
  </span> |
4250
+ Cell: combine | 4.87s
4251
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
4252
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
4253
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
4254
  </div>
4255
+ <div id="code-combine" class="cell-code collapsed" data-lines="31">
4256
  <div class="highlight-with-lines">
4257
  <div class="line-numbers" id="lines-combine">
4258
  <a class="line-number" data-cell="combine" data-line="1" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 1, true);">1</a>
 
4285
  <a class="line-number" data-cell="combine" data-line="28" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 28, true);">28</a>
4286
  <a class="line-number" data-cell="combine" data-line="29" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 29, true);">29</a>
4287
  <a class="line-number" data-cell="combine" data-line="30" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 30, true);">30</a>
4288
+ <a class="line-number" data-cell="combine" data-line="31" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 31, true);">31</a>
4289
  </div>
4290
  <div class="code-wrap">
4291
  <div class="highlight"><pre><span></span><span class="c1"># /// script</span>
 
4310
  <span class="s2">&quot;HF Kernels Flash Attn&quot;</span><span class="p">:</span> <span class="s2">&quot;UVNOTE_FILE_HF_KERNELS_FLASH_ATTN_BENCHMARK&quot;</span><span class="p">,</span>
4311
  <span class="s2">&quot;HF Kernels Flash Attn3&quot;</span><span class="p">:</span> <span class="s2">&quot;UVNOTE_FILE_HF_KERNELS_FLASH_ATTN3_BENCHMARK&quot;</span><span class="p">,</span>
4312
  <span class="s2">&quot;SageAttention&quot;</span><span class="p">:</span> <span class="s2">&quot;UVNOTE_FILE_SAGE_ATTENTION_BENCHMARK&quot;</span><span class="p">,</span>
4313
+ <span class="c1"># &quot;Flash Attn CUTE&quot;: &quot;UVNOTE_FILE_FLASH_ATTN_CUTE_BENCHMARK&quot;,</span>
4314
  <span class="p">}</span>
4315
 
4316
  <span class="c1"># Generate combined results with visualization</span>
 
4356
  COMBINED BENCHMARK SUMMARY
4357
 
4358
  impl wl p50(ms) ok
4359
+ hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.96 True
4360
+ hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.02 True
4361
+ hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.05 True
4362
+ hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.06 True
4363
+ hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.21 True
4364
+ hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.22 True
4365
+ hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.92 True
4366
+ hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.97 True
4367
+ hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.00 True
4368
+ hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.01 True
4369
+ hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.16 True
4370
+ hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.17 True
4371
  sage_int8_fp16 cuda_attn_L128_bfloat16 FAIL False
4372
+ Error: module &#x27;sage_attention_e8dcde4226fe38e6&#x27; has no attribute &#x27;fwd&#x27;
4373
  sage_int8_fp16 cuda_attn_L256_bfloat16 FAIL False
4374
+ Error: module &#x27;sage_attention_e8dcde4226fe38e6&#x27; has no attribute &#x27;fwd&#x27;
4375
  sage_int8_fp16 cuda_attn_L320_bfloat16 FAIL False
4376
+ Error: module &#x27;sage_attention_e8dcde4226fe38e6&#x27; has no attribute &#x27;fwd&#x27;
4377
  sage_int8_fp16 cuda_attn_L384_bfloat16 FAIL False
4378
+ Error: module &#x27;sage_attention_e8dcde4226fe38e6&#x27; has no attribute &#x27;fwd&#x27;
4379
  sage_int8_fp16 cuda_attn_L448_bfloat16 FAIL False
4380
+ Error: module &#x27;sage_attention_e8dcde4226fe38e6&#x27; has no attribute &#x27;fwd&#x27;
4381
  sage_int8_fp16 cuda_attn_L512_bfloat16 FAIL False
4382
+ Error: module &#x27;sage_attention_e8dcde4226fe38e6&#x27; has no attribute &#x27;fwd&#x27;
4383
+ torch_flash_ma cuda_attn_L128_bfloat16 1.20 True
4384
+ torch_flash_ma cuda_attn_L256_bfloat16 1.26 True
4385
+ torch_flash_ma cuda_attn_L320_bfloat16 1.29 True
4386
+ torch_flash_ma cuda_attn_L384_bfloat16 1.32 True
4387
+ torch_flash_ma cuda_attn_L448_bfloat16 1.47 True
4388
+ torch_flash_ma cuda_attn_L512_bfloat16 1.50 True
4389
+ torch_mem_eff cuda_attn_L128_bfloat16 1.84 True
4390
+ torch_mem_eff cuda_attn_L256_bfloat16 1.95 True
4391
+ torch_mem_eff cuda_attn_L320_bfloat16 1.97 True
4392
+ torch_mem_eff cuda_attn_L384_bfloat16 2.08 True
4393
+ torch_mem_eff cuda_attn_L448_bfloat16 2.04 True
4394
+ torch_mem_eff cuda_attn_L512_bfloat16 2.25 True
4395
+ xformers_meff cuda_attn_L128_bfloat16 0.98 True
4396
+ xformers_meff cuda_attn_L256_bfloat16 1.03 True
4397
+ xformers_meff cuda_attn_L320_bfloat16 1.06 True
4398
+ xformers_meff cuda_attn_L384_bfloat16 1.06 True
4399
+ xformers_meff cuda_attn_L448_bfloat16 1.25 True
4400
+ xformers_meff cuda_attn_L512_bfloat16 1.23 True
4401
 
4402
  GENERATING COMBINED VISUALIZATION
4403
 
 
4421
  <div class="uv-install-logs" id="uv-logs-combine">
4422
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4423
  <div class="uv-logs-content" style="display: none;">
4424
+ Installed 37 packages in 315ms
4425
  </div>
4426
  </div>
4427
  <div class="cell-artifacts">
 
4434
  <rdf:RDF>
4435
  <ns2:Work>
4436
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4437
+ <dc:date>2025-12-19T19:09:55.297355</dc:date>
4438
  <dc:format>image/svg+xml</dc:format>
4439
  <dc:creator>
4440
  <ns2:Agent>
4441
+ <dc:title>Matplotlib v3.10.8, https://matplotlib.org/</dc:title>
4442
  </ns2:Agent>
4443
  </dc:creator>
4444
  </ns2:Work>
 
4544
  <g id="matplotlib.axis_2">
4545
  <g id="ytick_1">
4546
  <g id="grid-y--2" class="grid grid-y">
4547
+ <path d="M 47.81 404.469232 L 835.361742 404.469232 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4548
  </g>
4549
  <g id="line2d_7">
4550
  <defs>
4551
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4552
  </defs>
4553
  <g>
4554
+ <use ns4:href="#m0fca2865ba" x="47.81" y="404.469232" style="stroke: #000000; stroke-width: 0.8" />
4555
  </g>
4556
  </g>
4557
  <g id="text_7">
4558
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="408.26845" transform="rotate(-0 40.81 408.26845)">1.0</text>
4559
  </g>
4560
  </g>
4561
  <g id="ytick_2">
4562
  <g id="grid-y--3" class="grid grid-y">
4563
+ <path d="M 47.81 347.147903 L 835.361742 347.147903 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4564
  </g>
4565
  <g id="line2d_8">
4566
  <g>
4567
+ <use ns4:href="#m0fca2865ba" x="47.81" y="347.147903" style="stroke: #000000; stroke-width: 0.8" />
4568
  </g>
4569
  </g>
4570
  <g id="text_8">
4571
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="350.947122" transform="rotate(-0 40.81 350.947122)">1.2</text>
4572
  </g>
4573
  </g>
4574
  <g id="ytick_3">
4575
  <g id="grid-y--4" class="grid grid-y">
4576
+ <path d="M 47.81 289.826575 L 835.361742 289.826575 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4577
  </g>
4578
  <g id="line2d_9">
4579
  <g>
4580
+ <use ns4:href="#m0fca2865ba" x="47.81" y="289.826575" style="stroke: #000000; stroke-width: 0.8" />
4581
  </g>
4582
  </g>
4583
  <g id="text_9">
4584
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="293.625794" transform="rotate(-0 40.81 293.625794)">1.4</text>
4585
  </g>
4586
  </g>
4587
  <g id="ytick_4">
4588
  <g id="grid-y--5" class="grid grid-y">
4589
+ <path d="M 47.81 232.505247 L 835.361742 232.505247 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4590
  </g>
4591
  <g id="line2d_10">
4592
  <g>
4593
+ <use ns4:href="#m0fca2865ba" x="47.81" y="232.505247" style="stroke: #000000; stroke-width: 0.8" />
4594
  </g>
4595
  </g>
4596
  <g id="text_10">
4597
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="236.304466" transform="rotate(-0 40.81 236.304466)">1.6</text>
4598
  </g>
4599
  </g>
4600
  <g id="ytick_5">
4601
  <g id="grid-y--6" class="grid grid-y">
4602
+ <path d="M 47.81 175.183919 L 835.361742 175.183919 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4603
  </g>
4604
  <g id="line2d_11">
4605
  <g>
4606
+ <use ns4:href="#m0fca2865ba" x="47.81" y="175.183919" style="stroke: #000000; stroke-width: 0.8" />
4607
  </g>
4608
  </g>
4609
  <g id="text_11">
4610
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="178.983137" transform="rotate(-0 40.81 178.983137)">1.8</text>
4611
  </g>
4612
  </g>
4613
  <g id="ytick_6">
4614
  <g id="grid-y--7" class="grid grid-y">
4615
+ <path d="M 47.81 117.86259 L 835.361742 117.86259 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4616
  </g>
4617
  <g id="line2d_12">
4618
  <g>
4619
+ <use ns4:href="#m0fca2865ba" x="47.81" y="117.86259" style="stroke: #000000; stroke-width: 0.8" />
4620
  </g>
4621
  </g>
4622
  <g id="text_12">
4623
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="121.661809" transform="rotate(-0 40.81 121.661809)">2.0</text>
4624
  </g>
4625
  </g>
4626
  <g id="ytick_7">
4627
  <g id="grid-y--8" class="grid grid-y">
4628
+ <path d="M 47.81 60.541262 L 835.361742 60.541262 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4629
  </g>
4630
  <g id="line2d_13">
4631
  <g>
4632
+ <use ns4:href="#m0fca2865ba" x="47.81" y="60.541262" style="stroke: #000000; stroke-width: 0.8" />
4633
  </g>
4634
  </g>
4635
  <g id="text_13">
4636
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="64.340481" transform="rotate(-0 40.81 64.340481)">2.2</text>
4637
  </g>
4638
  </g>
4639
  <g id="label--y" class="ylabel">
 
4641
  </g>
4642
  </g>
4643
  <g id="series--torch-flash-ma" class="series">
4644
+ <path d="M 83.607806 346.603064 L 226.799032 331.148661 L 369.990258 322.7508 L 513.181484 313.642154 L 656.37271 270.506995 L 799.563935 259.742049 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4645
  <defs>
4646
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4647
  </defs>
4648
  <g clip-path="url(#p09feef2583)">
4649
+ <use ns4:href="#md7efaf3aec" x="83.607806" y="346.603064" style="fill: #1f77b4; stroke: #1f77b4" />
4650
+ <use ns4:href="#md7efaf3aec" x="226.799032" y="331.148661" style="fill: #1f77b4; stroke: #1f77b4" />
4651
+ <use ns4:href="#md7efaf3aec" x="369.990258" y="322.7508" style="fill: #1f77b4; stroke: #1f77b4" />
4652
+ <use ns4:href="#md7efaf3aec" x="513.181484" y="313.642154" style="fill: #1f77b4; stroke: #1f77b4" />
4653
+ <use ns4:href="#md7efaf3aec" x="656.37271" y="270.506995" style="fill: #1f77b4; stroke: #1f77b4" />
4654
+ <use ns4:href="#md7efaf3aec" x="799.563935" y="259.742049" style="fill: #1f77b4; stroke: #1f77b4" />
4655
  </g>
4656
  </g>
4657
  <g id="series--torch-mem-eff" class="series">
4658
+ <path d="M 83.607806 162.593002 L 226.799032 131.641491 L 369.990258 126.594348 L 513.181484 96.170767 L 656.37271 105.428161 L 799.563935 45.999414 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4659
  <defs>
4660
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4661
  </defs>
4662
  <g clip-path="url(#p09feef2583)">
4663
+ <use ns4:href="#m9b8c54d372" x="83.607806" y="162.593002" style="fill: #ff7f0e; stroke: #ff7f0e" />
4664
+ <use ns4:href="#m9b8c54d372" x="226.799032" y="131.641491" style="fill: #ff7f0e; stroke: #ff7f0e" />
4665
+ <use ns4:href="#m9b8c54d372" x="369.990258" y="126.594348" style="fill: #ff7f0e; stroke: #ff7f0e" />
4666
+ <use ns4:href="#m9b8c54d372" x="513.181484" y="96.170767" style="fill: #ff7f0e; stroke: #ff7f0e" />
4667
+ <use ns4:href="#m9b8c54d372" x="656.37271" y="105.428161" style="fill: #ff7f0e; stroke: #ff7f0e" />
4668
  <use ns4:href="#m9b8c54d372" x="799.563935" y="45.999414" style="fill: #ff7f0e; stroke: #ff7f0e" />
4669
  </g>
4670
  </g>
4671
  <g id="series--xformers-meff" class="series">
4672
+ <path d="M 83.607806 410.706939 L 226.799032 396.737158 L 369.990258 386.568354 L 513.181484 386.536541 L 656.37271 333.774551 L 799.563935 337.388661 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #2ca02c; stroke-width: 1.5; stroke-linecap: square" />
4673
  <defs>
4674
  <path id="mc655281e0b" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #2ca02c" />
4675
  </defs>
4676
  <g clip-path="url(#p09feef2583)">
4677
+ <use ns4:href="#mc655281e0b" x="83.607806" y="410.706939" style="fill: #2ca02c; stroke: #2ca02c" />
4678
+ <use ns4:href="#mc655281e0b" x="226.799032" y="396.737158" style="fill: #2ca02c; stroke: #2ca02c" />
4679
+ <use ns4:href="#mc655281e0b" x="369.990258" y="386.568354" style="fill: #2ca02c; stroke: #2ca02c" />
4680
+ <use ns4:href="#mc655281e0b" x="513.181484" y="386.536541" style="fill: #2ca02c; stroke: #2ca02c" />
4681
+ <use ns4:href="#mc655281e0b" x="656.37271" y="333.774551" style="fill: #2ca02c; stroke: #2ca02c" />
4682
+ <use ns4:href="#mc655281e0b" x="799.563935" y="337.388661" style="fill: #2ca02c; stroke: #2ca02c" />
4683
  </g>
4684
  </g>
4685
  <g id="series--hf-kernels-flash-attn" class="series">
4686
+ <path d="M 83.607806 416.940633 L 226.799032 399.984697 L 369.990258 390.841946 L 513.181484 387.029791 L 656.37271 344.433452 L 799.563935 341.857145 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #d62728; stroke-width: 1.5; stroke-linecap: square" />
4687
  <defs>
4688
  <path id="m61c8040d7e" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #d62728" />
4689
  </defs>
4690
  <g clip-path="url(#p09feef2583)">
4691
+ <use ns4:href="#m61c8040d7e" x="83.607806" y="416.940633" style="fill: #d62728; stroke: #d62728" />
4692
+ <use ns4:href="#m61c8040d7e" x="226.799032" y="399.984697" style="fill: #d62728; stroke: #d62728" />
4693
+ <use ns4:href="#m61c8040d7e" x="369.990258" y="390.841946" style="fill: #d62728; stroke: #d62728" />
4694
+ <use ns4:href="#m61c8040d7e" x="513.181484" y="387.029791" style="fill: #d62728; stroke: #d62728" />
4695
+ <use ns4:href="#m61c8040d7e" x="656.37271" y="344.433452" style="fill: #d62728; stroke: #d62728" />
4696
+ <use ns4:href="#m61c8040d7e" x="799.563935" y="341.857145" style="fill: #d62728; stroke: #d62728" />
4697
  </g>
4698
  </g>
4699
  <g id="series--hf-kernels-flash-attn3" class="series">
4700
+ <path d="M 83.607806 428.387702 L 226.799032 412.171498 L 369.990258 404.997448 L 513.181484 400.314295 L 656.37271 358.798463 L 799.563935 355.895138 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #9467bd; stroke-width: 1.5; stroke-linecap: square" />
4701
  <defs>
4702
  <path id="m7cd35be9cc" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #9467bd" />
4703
  </defs>
4704
  <g clip-path="url(#p09feef2583)">
4705
  <use ns4:href="#m7cd35be9cc" x="83.607806" y="428.387702" style="fill: #9467bd; stroke: #9467bd" />
4706
+ <use ns4:href="#m7cd35be9cc" x="226.799032" y="412.171498" style="fill: #9467bd; stroke: #9467bd" />
4707
+ <use ns4:href="#m7cd35be9cc" x="369.990258" y="404.997448" style="fill: #9467bd; stroke: #9467bd" />
4708
+ <use ns4:href="#m7cd35be9cc" x="513.181484" y="400.314295" style="fill: #9467bd; stroke: #9467bd" />
4709
+ <use ns4:href="#m7cd35be9cc" x="656.37271" y="358.798463" style="fill: #9467bd; stroke: #9467bd" />
4710
+ <use ns4:href="#m7cd35be9cc" x="799.563935" y="355.895138" style="fill: #9467bd; stroke: #9467bd" />
4711
  </g>
4712
  </g>
4713
  <g id="patch_3">
index.html CHANGED
@@ -3874,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
3874
  <div class="system-info">
3875
  <div class="system-info-header">Generated on:</div>
3876
  <div class="system-info-content">
3877
- Linux x86_64 | Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35
3878
  </div>
3879
  </div>
3880
 
 
3874
  <div class="system-info">
3875
  <div class="system-info-header">Generated on:</div>
3876
  <div class="system-info-content">
3877
+ Linux x86_64 | Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35
3878
  </div>
3879
  </div>
3880
 
layer_norm/impls/artifacts/benchmark/layer_norm.jsonl CHANGED
@@ -1,4 +1,4 @@
1
- {"ts": "2025-11-10T22:11:32Z", "run": "08926e8525be4ec6b9adc7957d91ab7e", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D4096", "batch": 16, "seq_len": 2048, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.8238129998971999, "p50": 0.8308330000090791, "p90": 0.8364840000467666, "mean": 0.8310172000165039, "iqr": 0.012130999948567478, "raw_times": [0.8243530000981991, 0.8396030000312749, 0.8364840000467666, 0.8238129998971999, 0.8308330000090791], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.8376129999305704, "peak_bytes": 2415935488, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null}
2
- {"ts": "2025-11-10T22:11:32Z", "run": "08926e8525be4ec6b9adc7957d91ab7e", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D8192", "batch": 16, "seq_len": 2048, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.6412459999628481, "p50": 1.64963599991097, "p90": 1.6500760000326409, "mean": 1.6485119999288145, "iqr": 0.005011000212107319, "raw_times": [1.6500760000326409, 1.6450649998205336, 1.6412459999628481, 1.6565369999170798, 1.64963599991097], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.6503259998899011, "peak_bytes": 4831870976, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1086463928222656e-05, "ref": "layer_norm_ref"}, "err": null}
3
- {"ts": "2025-11-10T22:11:33Z", "run": "08926e8525be4ec6b9adc7957d91ab7e", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D4096", "batch": 16, "seq_len": 4096, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.635675999978048, "p50": 1.649695999958567, "p90": 1.6533860000436107, "mean": 1.6475760000048467, "iqr": 0.008750000006330083, "raw_times": [1.649695999958567, 1.635675999978048, 1.6533860000436107, 1.6544860000067274, 1.6446360000372806], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.6516960001808911, "peak_bytes": 4831854592, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null}
4
- {"ts": "2025-11-10T22:11:33Z", "run": "08926e8525be4ec6b9adc7957d91ab7e", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D8192", "batch": 16, "seq_len": 4096, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 3.2460209999953804, "p50": 3.252669999938007, "p90": 3.25303099998564, "mean": 3.2530550000046787, "iqr": 0.002919999815276242, "raw_times": [3.2634419999340025, 3.25303099998564, 3.252669999938007, 3.2501110001703637, 3.2460209999953804], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.265330999965954, "peak_bytes": 9663709184, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1026859283447266e-05, "ref": "layer_norm_ref"}, "err": null}
 
1
+ {"ts": "2025-12-19T18:57:07Z", "run": "8a911691677c4be4b2377923d73cef2c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D4096", "batch": 16, "seq_len": 2048, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.8283059999598663, "p50": 0.8335360000160108, "p90": 0.8356760000083341, "mean": 0.8340919999909602, "iqr": 0.0024800000346658635, "raw_times": [0.8356760000083341, 0.8397459999969215, 0.8331959999736682, 0.8335360000160108, 0.8283059999598663], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.8370360000071742, "peak_bytes": 2415935488, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null}
2
+ {"ts": "2025-12-19T18:57:07Z", "run": "8a911691677c4be4b2377923d73cef2c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D8192", "batch": 16, "seq_len": 2048, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.6452309999976933, "p50": 1.6598209999756364, "p90": 1.6613920000168036, "mean": 1.6622576000031586, "iqr": 0.0022199999989425123, "raw_times": [1.6591720000178611, 1.6613920000168036, 1.6598209999756364, 1.6856720000077985, 1.6452309999976933], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.654771999994864, "peak_bytes": 4831870976, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1086463928222656e-05, "ref": "layer_norm_ref"}, "err": null}
3
+ {"ts": "2025-12-19T18:57:07Z", "run": "8a911691677c4be4b2377923d73cef2c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D4096", "batch": 16, "seq_len": 4096, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.6495710000299368, "p50": 1.651621000007708, "p90": 1.6563920000294274, "mean": 1.6539776000172424, "iqr": 0.0065000000404324965, "raw_times": [1.649891999988995, 1.6624120000301446, 1.6563920000294274, 1.6495710000299368, 1.651621000007708], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.6589109999927132, "peak_bytes": 4831854592, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null}
4
+ {"ts": "2025-12-19T18:57:07Z", "run": "8a911691677c4be4b2377923d73cef2c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D8192", "batch": 16, "seq_len": 4096, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 3.243421000036051, "p50": 3.2525119999604613, "p90": 3.2605619999799274, "mean": 3.252856000005977, "iqr": 0.017038999942542432, "raw_times": [3.2525119999604613, 3.2642620000160605, 3.2605619999799274, 3.243421000036051, 3.243523000037385], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.250041000001147, "peak_bytes": 9663709184, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1026859283447266e-05, "ref": "layer_norm_ref"}, "err": null}
layer_norm/impls/hf_kernels_layer_norm.html CHANGED
@@ -3874,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
3874
  <div class="system-info">
3875
  <div class="system-info-header">Generated on:</div>
3876
  <div class="system-info-content">
3877
- Linux x86_64 | Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35
3878
  </div>
3879
  </div>
3880
 
@@ -3889,7 +3889,7 @@ body[data-tool="eraser"] .main-content {
3889
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3890
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3891
  </span> |
3892
- Cell: benchmark | 6.65s
3893
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3894
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3895
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3961,19 +3961,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S2048_D4096
3961
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3962
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3963
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3964
- hf_kernels_layer_norm 4.99% 214.535us 50.40% 2.165ms 2.165ms 0.000us 0.00% 3.089ms 3.089ms 1
3965
- _layer_norm_f8ec252::dropout_add_ln_fwd 1.39% 59.840us 44.89% 1.928ms 642.793us 2.355ms 100.00% 3.089ms 1.030ms 3
3966
- hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.357ms 100.06% 2.357ms 2.357ms 1
3967
- void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 2.355ms 100.00% 2.355ms 785.131us 3
3968
- Activity Buffer Request 41.22% 1.771ms 41.22% 1.771ms 1.771ms 733.313us 31.13% 733.313us 733.313us 1
3969
- aten::view 0.51% 21.919us 0.51% 21.919us 3.653us 0.000us 0.00% 0.000us 0.000us 6
3970
- aten::empty 1.06% 45.591us 1.06% 45.591us 5.066us 0.000us 0.00% 0.000us 0.000us 9
3971
- cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.22% 9.340us 0.22% 9.340us 3.113us 0.000us 0.00% 0.000us 0.000us 3
3972
- cudaLaunchKernel 1.00% 42.910us 1.00% 42.910us 14.303us 0.000us 0.00% 0.000us 0.000us 3
3973
- cudaDeviceSynchronize 49.60% 2.131ms 49.60% 2.131ms 2.131ms 0.000us 0.00% 0.000us 0.000us 1
3974
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3975
- Self CPU time total: 4.295ms
3976
- Self CUDA time total: 2.355ms
3977
 
3978
 
3979
 
@@ -3983,19 +3983,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S2048_D8192
3983
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3984
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3985
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3986
- hf_kernels_layer_norm 2.21% 146.665us 30.12% 2.003ms 2.003ms 0.000us 0.00% 6.394ms 6.394ms 1
3987
- _layer_norm_f8ec252::dropout_add_ln_fwd 0.64% 42.811us 27.74% 1.845ms 614.956us 4.819ms 100.00% 6.394ms 2.131ms 3
3988
- hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.820ms 100.03% 4.820ms 4.820ms 1
3989
- void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.819ms 100.00% 4.819ms 1.606ms 3
3990
- Activity Buffer Request 26.14% 1.739ms 26.14% 1.739ms 1.739ms 1.575ms 32.69% 1.575ms 1.575ms 1
3991
- aten::view 0.18% 11.889us 0.18% 11.889us 1.981us 0.000us 0.00% 0.000us 0.000us 6
3992
- aten::empty 0.44% 29.319us 0.44% 29.319us 3.258us 0.000us 0.00% 0.000us 0.000us 9
3993
- cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.07% 4.690us 0.07% 4.690us 1.563us 0.000us 0.00% 0.000us 0.000us 3
3994
- cudaLaunchKernel 0.44% 29.150us 0.44% 29.150us 9.717us 0.000us 0.00% 0.000us 0.000us 3
3995
- cudaDeviceSynchronize 69.88% 4.648ms 69.88% 4.648ms 4.648ms 0.000us 0.00% 0.000us 0.000us 1
3996
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3997
- Self CPU time total: 6.651ms
3998
- Self CUDA time total: 4.819ms
3999
 
4000
 
4001
 
@@ -4005,19 +4005,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S4096_D4096
4005
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4006
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4007
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4008
- hf_kernels_layer_norm 2.00% 133.492us 30.10% 2.007ms 2.007ms 0.000us 0.00% 6.406ms 6.406ms 1
4009
- _layer_norm_f8ec252::dropout_add_ln_fwd 0.67% 44.942us 27.93% 1.863ms 620.970us 4.818ms 100.00% 6.406ms 2.135ms 3
4010
- hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.819ms 100.03% 4.819ms 4.819ms 1
4011
- void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.818ms 100.00% 4.818ms 1.606ms 3
4012
- Activity Buffer Request 26.34% 1.756ms 26.34% 1.756ms 1.756ms 1.588ms 32.97% 1.588ms 1.588ms 1
4013
- aten::view 0.16% 10.780us 0.16% 10.780us 1.797us 0.000us 0.00% 0.000us 0.000us 6
4014
- aten::empty 0.44% 29.582us 0.44% 29.582us 3.287us 0.000us 0.00% 0.000us 0.000us 9
4015
- cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.07% 4.759us 0.07% 4.759us 1.586us 0.000us 0.00% 0.000us 0.000us 3
4016
- cudaLaunchKernel 0.41% 27.190us 0.41% 27.190us 9.063us 0.000us 0.00% 0.000us 0.000us 3
4017
- cudaDeviceSynchronize 69.90% 4.662ms 69.90% 4.662ms 4.662ms 0.000us 0.00% 0.000us 0.000us 1
4018
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4019
- Self CPU time total: 6.669ms
4020
- Self CUDA time total: 4.818ms
4021
 
4022
 
4023
 
@@ -4027,36 +4027,36 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S4096_D8192
4027
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4028
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4029
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4030
- hf_kernels_layer_norm 1.10% 128.730us 19.25% 2.252ms 2.252ms 0.000us 0.00% 12.776ms 12.776ms 1
4031
- _layer_norm_f8ec252::dropout_add_ln_fwd 0.38% 44.142us 18.05% 2.112ms 704.015us 9.608ms 100.00% 12.776ms 4.259ms 3
4032
- hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 9.609ms 100.01% 9.609ms 9.609ms 1
4033
- void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 9.608ms 100.00% 9.608ms 3.203ms 3
4034
- Activity Buffer Request 15.07% 1.763ms 15.07% 1.763ms 1.763ms 3.168ms 32.98% 3.168ms 3.168ms 1
4035
- aten::view 0.10% 11.611us 0.10% 11.611us 1.935us 0.000us 0.00% 0.000us 0.000us 6
4036
- aten::empty 0.25% 29.429us 0.25% 29.429us 3.270us 0.000us 0.00% 0.000us 0.000us 9
4037
- cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.04% 4.891us 0.04% 4.891us 1.630us 0.000us 0.00% 0.000us 0.000us 3
4038
- cudaLaunchKernel 2.31% 270.775us 2.31% 270.775us 90.258us 0.000us 0.00% 0.000us 0.000us 3
4039
- cudaDeviceSynchronize 80.75% 9.448ms 80.75% 9.448ms 9.448ms 0.000us 0.00% 0.000us 0.000us 1
4040
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4041
- Self CPU time total: 11.700ms
4042
- Self CUDA time total: 9.608ms
4043
 
4044
 
4045
  impl wl p50(ms) ok
4046
  hf_kernels_layer_norm LN_B16_S2048_D4096 0.83 True
4047
- hf_kernels_layer_norm LN_B16_S2048_D8192 1.65 True
4048
  hf_kernels_layer_norm LN_B16_S4096_D4096 1.65 True
4049
  hf_kernels_layer_norm LN_B16_S4096_D8192 3.25 True
4050
  </pre></div>
4051
  <div class="uv-install-logs" id="uv-logs-benchmark">
4052
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4053
  <div class="uv-logs-content" style="display: none;">
4054
- Installed 15 packages in 12ms
4055
  </div>
4056
  </div>
4057
  <div class="cell-stderr">Fetching 4 files: 0%| | 0/4 [00:00&lt;?, ?it/s]
4058
- Fetching 4 files: 50%|█████ | 2/4 [00:01&lt;00:01, 1.12it/s]
4059
- Fetching 4 files: 100%|██████████| 4/4 [00:01&lt;00:00, 2.24it/s]</div>
4060
  <div class="cell-artifacts">
4061
  <h4>Artifacts:</h4>
4062
  <a href="artifacts/benchmark/layer_norm.jsonl" class="artifact" target="_blank">layer_norm.jsonl</a>
 
3874
  <div class="system-info">
3875
  <div class="system-info-header">Generated on:</div>
3876
  <div class="system-info-content">
3877
+ Linux x86_64 | Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35
3878
  </div>
3879
  </div>
3880
 
 
3889
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3890
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3891
  </span> |
3892
+ Cell: benchmark | 6.26s
3893
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3894
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3895
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3961
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3962
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3963
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3964
+ hf_kernels_layer_norm 4.17% 177.304us 48.13% 2.048ms 2.048ms 0.000us 0.00% 3.167ms 3.167ms 1
3965
+ _layer_norm_f8ec252::dropout_add_ln_fwd 1.47% 62.693us 43.45% 1.849ms 616.229us 2.429ms 100.00% 3.167ms 1.056ms 3
3966
+ hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.430ms 100.06% 2.430ms 2.430ms 1
3967
+ void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 2.429ms 100.00% 2.429ms 809.553us 3
3968
+ Activity Buffer Request 39.70% 1.689ms 39.70% 1.689ms 1.689ms 738.629us 30.41% 738.629us 738.629us 1
3969
+ aten::view 0.51% 21.739us 0.51% 21.739us 3.623us 0.000us 0.00% 0.000us 0.000us 6
3970
+ aten::empty 1.04% 44.400us 1.04% 44.400us 4.933us 0.000us 0.00% 0.000us 0.000us 9
3971
+ cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.22% 9.310us 0.22% 9.310us 3.103us 0.000us 0.00% 0.000us 0.000us 3
3972
+ cudaLaunchKernel 1.01% 43.131us 1.01% 43.131us 14.377us 0.000us 0.00% 0.000us 0.000us 3
3973
+ cudaDeviceSynchronize 51.87% 2.207ms 51.87% 2.207ms 2.207ms 0.000us 0.00% 0.000us 0.000us 1
3974
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3975
+ Self CPU time total: 4.255ms
3976
+ Self CUDA time total: 2.429ms
3977
 
3978
 
3979
 
 
3983
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3984
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3985
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3986
+ hf_kernels_layer_norm 2.14% 140.133us 29.32% 1.923ms 1.923ms 0.000us 0.00% 6.388ms 6.388ms 1
3987
+ _layer_norm_f8ec252::dropout_add_ln_fwd 0.69% 45.053us 27.01% 1.772ms 590.648us 4.807ms 100.00% 6.388ms 2.129ms 3
3988
+ hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.808ms 100.03% 4.808ms 4.808ms 1
3989
+ void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.807ms 100.00% 4.807ms 1.602ms 3
3990
+ Activity Buffer Request 25.34% 1.663ms 25.34% 1.663ms 1.663ms 1.581ms 32.89% 1.581ms 1.581ms 1
3991
+ aten::view 0.17% 11.390us 0.17% 11.390us 1.898us 0.000us 0.00% 0.000us 0.000us 6
3992
+ aten::empty 0.45% 29.620us 0.45% 29.620us 3.291us 0.000us 0.00% 0.000us 0.000us 9
3993
+ cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.07% 4.820us 0.07% 4.820us 1.607us 0.000us 0.00% 0.000us 0.000us 3
3994
+ cudaLaunchKernel 0.46% 29.860us 0.46% 29.860us 9.953us 0.000us 0.00% 0.000us 0.000us 3
3995
+ cudaDeviceSynchronize 70.68% 4.637ms 70.68% 4.637ms 4.637ms 0.000us 0.00% 0.000us 0.000us 1
3996
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3997
+ Self CPU time total: 6.560ms
3998
+ Self CUDA time total: 4.807ms
3999
 
4000
 
4001
 
 
4005
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4006
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4007
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4008
+ hf_kernels_layer_norm 1.98% 129.253us 29.33% 1.919ms 1.919ms 0.000us 0.00% 6.330ms 6.330ms 1
4009
+ _layer_norm_f8ec252::dropout_add_ln_fwd 0.71% 46.780us 27.18% 1.779ms 592.854us 4.774ms 100.00% 6.330ms 2.110ms 3
4010
+ hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.775ms 100.03% 4.775ms 4.775ms 1
4011
+ void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.774ms 100.00% 4.774ms 1.591ms 3
4012
+ Activity Buffer Request 25.49% 1.668ms 25.49% 1.668ms 1.668ms 1.556ms 32.59% 1.556ms 1.556ms 1
4013
+ aten::view 0.17% 11.271us 0.17% 11.271us 1.879us 0.000us 0.00% 0.000us 0.000us 6
4014
+ aten::empty 0.45% 29.221us 0.45% 29.221us 3.247us 0.000us 0.00% 0.000us 0.000us 9
4015
+ cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.08% 4.980us 0.08% 4.980us 1.660us 0.000us 0.00% 0.000us 0.000us 3
4016
+ cudaLaunchKernel 0.45% 29.470us 0.45% 29.470us 9.823us 0.000us 0.00% 0.000us 0.000us 3
4017
+ cudaDeviceSynchronize 70.67% 4.624ms 70.67% 4.624ms 4.624ms 0.000us 0.00% 0.000us 0.000us 1
4018
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4019
+ Self CPU time total: 6.543ms
4020
+ Self CUDA time total: 4.774ms
4021
 
4022
 
4023
 
 
4027
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4028
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4029
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4030
+ hf_kernels_layer_norm 1.22% 142.314us 18.53% 2.155ms 2.155ms 0.000us 0.00% 12.836ms 12.836ms 1
4031
+ _layer_norm_f8ec252::dropout_add_ln_fwd 0.38% 44.492us 17.20% 2.000ms 666.802us 9.636ms 100.00% 12.836ms 4.279ms 3
4032
+ hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 9.637ms 100.02% 9.637ms 9.637ms 1
4033
+ void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 9.636ms 100.00% 9.636ms 3.212ms 3
4034
+ Activity Buffer Request 14.57% 1.694ms 14.57% 1.694ms 1.694ms 3.200ms 33.21% 3.200ms 3.200ms 1
4035
+ aten::view 0.10% 12.130us 0.10% 12.130us 2.022us 0.000us 0.00% 0.000us 0.000us 6
4036
+ aten::empty 0.25% 29.499us 0.25% 29.499us 3.278us 0.000us 0.00% 0.000us 0.000us 9
4037
+ cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.04% 4.820us 0.04% 4.820us 1.607us 0.000us 0.00% 0.000us 0.000us 3
4038
+ cudaLaunchKernel 1.96% 227.814us 1.96% 227.814us 75.938us 0.000us 0.00% 0.000us 0.000us 3
4039
+ cudaDeviceSynchronize 81.47% 9.472ms 81.47% 9.472ms 9.472ms 0.000us 0.00% 0.000us 0.000us 1
4040
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4041
+ Self CPU time total: 11.627ms
4042
+ Self CUDA time total: 9.636ms
4043
 
4044
 
4045
  impl wl p50(ms) ok
4046
  hf_kernels_layer_norm LN_B16_S2048_D4096 0.83 True
4047
+ hf_kernels_layer_norm LN_B16_S2048_D8192 1.66 True
4048
  hf_kernels_layer_norm LN_B16_S4096_D4096 1.65 True
4049
  hf_kernels_layer_norm LN_B16_S4096_D8192 3.25 True
4050
  </pre></div>
4051
  <div class="uv-install-logs" id="uv-logs-benchmark">
4052
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4053
  <div class="uv-logs-content" style="display: none;">
4054
+ Installed 14 packages in 12ms
4055
  </div>
4056
  </div>
4057
  <div class="cell-stderr">Fetching 4 files: 0%| | 0/4 [00:00&lt;?, ?it/s]
4058
+ Fetching 4 files: 50%|█████ | 2/4 [00:01&lt;00:01, 1.35it/s]
4059
+ Fetching 4 files: 100%|██████████| 4/4 [00:01&lt;00:00, 2.71it/s]</div>
4060
  <div class="cell-artifacts">
4061
  <h4>Artifacts:</h4>
4062
  <a href="artifacts/benchmark/layer_norm.jsonl" class="artifact" target="_blank">layer_norm.jsonl</a>
layer_norm/impls/torch_layer_norm.html CHANGED
@@ -3874,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
3874
  <div class="system-info">
3875
  <div class="system-info-header">Generated on:</div>
3876
  <div class="system-info-content">
3877
- Linux x86_64 | Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35
3878
  </div>
3879
  </div>
3880
 
@@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content {
3888
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3890
  </span> |
3891
- Cell: nv | 0.22s
3892
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3894
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3904,16 +3904,16 @@ Cell: nv | 0.22s
3904
  </div>
3905
  </div>
3906
  <div id="output-nv" class="cell-output">
3907
- <div class="cell-stdout"><pre class="stdout-text">Mon Nov 10 22:11:21 2025
3908
  +-----------------------------------------------------------------------------------------+
3909
- | NVIDIA-SMI 580.95.05 Driver Version: 580.95.05 CUDA Version: 13.0 |
3910
  +-----------------------------------------+------------------------+----------------------+
3911
  | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
3912
  | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
3913
  | | | MIG M. |
3914
  |=========================================+========================+======================|
3915
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3916
- | N/A 36C P0 121W / 350W | 0MiB / 46068MiB | 27% Default |
3917
  | | | N/A |
3918
  +-----------------------------------------+------------------------+----------------------+
3919
 
@@ -3937,7 +3937,7 @@ Cell: nv | 0.22s
3937
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3938
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3939
  </span> |
3940
- Cell: benchmark | 7.73s
3941
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3942
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3943
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3985,19 +3985,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S2048_D4096
3985
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3986
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3987
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3988
- torch_layer_norm 3.69% 156.741us 50.70% 2.155ms 2.155ms 0.000us 0.00% 3.028ms 3.028ms 1
3989
- aten::layer_norm 0.35% 14.940us 47.01% 1.998ms 666.050us 0.000us 0.00% 3.028ms 1.009ms 3
3990
- aten::native_layer_norm 1.75% 74.522us 46.66% 1.983ms 661.070us 2.321ms 100.00% 3.028ms 1.009ms 3
3991
- torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.322ms 100.06% 2.322ms 2.322ms 1
3992
- void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 2.321ms 100.00% 2.321ms 773.663us 3
3993
- Activity Buffer Request 42.51% 1.807ms 42.51% 1.807ms 1.807ms 707.360us 30.48% 707.360us 707.360us 1
3994
- aten::empty 1.11% 47.041us 1.11% 47.041us 5.227us 0.000us 0.00% 0.000us 0.000us 9
3995
- cudaLaunchKernel 1.12% 47.761us 1.12% 47.761us 15.920us 0.000us 0.00% 0.000us 0.000us 3
3996
- aten::view 0.17% 7.200us 0.17% 7.200us 1.200us 0.000us 0.00% 0.000us 0.000us 6
3997
- cudaDeviceSynchronize 49.30% 2.095ms 49.30% 2.095ms 2.095ms 0.000us 0.00% 0.000us 0.000us 1
3998
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3999
- Self CPU time total: 4.250ms
4000
- Self CUDA time total: 2.321ms
4001
 
4002
 
4003
 
@@ -4007,19 +4007,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S2048_D8192
4007
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4008
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4009
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4010
- torch_layer_norm 1.08% 72.370us 29.56% 1.986ms 1.986ms 0.000us 0.00% 6.439ms 6.439ms 1
4011
- aten::layer_norm 0.14% 9.121us 28.49% 1.914ms 637.916us 0.000us 0.00% 6.439ms 2.146ms 3
4012
- aten::native_layer_norm 0.74% 49.777us 28.35% 1.905ms 634.876us 4.867ms 100.00% 6.439ms 2.146ms 3
4013
- torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.868ms 100.03% 4.868ms 4.868ms 1
4014
- void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 4.867ms 100.00% 4.867ms 1.622ms 3
4015
- Activity Buffer Request 26.73% 1.796ms 26.73% 1.796ms 1.796ms 1.572ms 32.30% 1.572ms 1.572ms 1
4016
- aten::empty 0.42% 28.501us 0.42% 28.501us 3.167us 0.000us 0.00% 0.000us 0.000us 9
4017
- cudaLaunchKernel 0.40% 26.970us 0.40% 26.970us 8.990us 0.000us 0.00% 0.000us 0.000us 3
4018
- aten::view 0.06% 3.863us 0.06% 3.863us 0.644us 0.000us 0.00% 0.000us 0.000us 6
4019
- cudaDeviceSynchronize 70.44% 4.732ms 70.44% 4.732ms 4.732ms 0.000us 0.00% 0.000us 0.000us 1
4020
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4021
- Self CPU time total: 6.718ms
4022
- Self CUDA time total: 4.867ms
4023
 
4024
 
4025
 
@@ -4029,19 +4029,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S4096_D4096
4029
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4030
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4031
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4032
- torch_layer_norm 1.07% 70.921us 30.56% 2.021ms 2.021ms 0.000us 0.00% 6.238ms 6.238ms 1
4033
- aten::layer_norm 0.13% 8.430us 29.49% 1.951ms 650.186us 0.000us 0.00% 6.238ms 2.079ms 3
4034
- aten::native_layer_norm 0.76% 50.331us 29.36% 1.942ms 647.376us 4.725ms 100.00% 6.238ms 2.079ms 3
4035
- torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.726ms 100.03% 4.726ms 4.726ms 1
4036
- void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 4.725ms 100.00% 4.725ms 1.575ms 3
4037
- Activity Buffer Request 27.69% 1.832ms 27.69% 1.832ms 1.832ms 1.513ms 32.02% 1.513ms 1.513ms 1
4038
- aten::empty 0.42% 27.940us 0.42% 27.940us 3.104us 0.000us 0.00% 0.000us 0.000us 9
4039
- cudaLaunchKernel 0.42% 27.891us 0.42% 27.891us 9.297us 0.000us 0.00% 0.000us 0.000us 3
4040
- aten::view 0.06% 4.260us 0.06% 4.260us 0.710us 0.000us 0.00% 0.000us 0.000us 6
4041
- cudaDeviceSynchronize 69.44% 4.592ms 69.44% 4.592ms 4.592ms 0.000us 0.00% 0.000us 0.000us 1
4042
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4043
- Self CPU time total: 6.614ms
4044
- Self CUDA time total: 4.725ms
4045
 
4046
 
4047
 
@@ -4051,23 +4051,23 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S4096_D8192
4051
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4052
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4053
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4054
- torch_layer_norm 0.62% 70.560us 14.96% 1.705ms 1.705ms 0.000us 0.00% 13.056ms 13.056ms 1
4055
- aten::layer_norm 0.08% 8.830us 14.34% 1.634ms 544.695us 0.000us 0.00% 13.056ms 4.352ms 3
4056
- aten::native_layer_norm 0.44% 49.828us 14.26% 1.625ms 541.752us 9.820ms 100.00% 13.056ms 4.352ms 3
4057
- torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 9.821ms 100.01% 9.821ms 9.821ms 1
4058
- void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 9.820ms 100.00% 9.820ms 3.273ms 3
4059
- Activity Buffer Request 11.47% 1.307ms 11.47% 1.307ms 1.307ms 3.236ms 32.96% 3.236ms 3.236ms 1
4060
- aten::empty 0.24% 27.683us 0.24% 27.683us 3.076us 0.000us 0.00% 0.000us 0.000us 9
4061
- cudaLaunchKernel 2.07% 236.314us 2.07% 236.314us 78.771us 0.000us 0.00% 0.000us 0.000us 3
4062
- aten::view 0.03% 3.970us 0.03% 3.970us 0.662us 0.000us 0.00% 0.000us 0.000us 6
4063
- cudaDeviceSynchronize 85.04% 9.690ms 85.04% 9.690ms 9.690ms 0.000us 0.00% 0.000us 0.000us 1
4064
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4065
- Self CPU time total: 11.395ms
4066
- Self CUDA time total: 9.820ms
4067
 
4068
 
4069
  impl wl p50(ms) ok
4070
- torch_layer_norm LN_B16_S2048_D4096 0.83 True
4071
  torch_layer_norm LN_B16_S2048_D8192 1.68 True
4072
  torch_layer_norm LN_B16_S4096_D4096 1.61 True
4073
  torch_layer_norm LN_B16_S4096_D8192 3.33 True
@@ -4075,7 +4075,7 @@ torch_layer_norm LN_B16_S4096_D8192 3.33 True
4075
  <div class="uv-install-logs" id="uv-logs-benchmark">
4076
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4077
  <div class="uv-logs-content" style="display: none;">
4078
- Installed 37 packages in 306ms
4079
  </div>
4080
  </div>
4081
  <div class="cell-artifacts">
 
3874
  <div class="system-info">
3875
  <div class="system-info-header">Generated on:</div>
3876
  <div class="system-info-content">
3877
+ Linux x86_64 | Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35
3878
  </div>
3879
  </div>
3880
 
 
3888
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3890
  </span> |
3891
+ Cell: nv | 0.25s
3892
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3894
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
 
3904
  </div>
3905
  </div>
3906
  <div id="output-nv" class="cell-output">
3907
+ <div class="cell-stdout"><pre class="stdout-text">Fri Dec 19 18:56:51 2025
3908
  +-----------------------------------------------------------------------------------------+
3909
+ | NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 |
3910
  +-----------------------------------------+------------------------+----------------------+
3911
  | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
3912
  | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
3913
  | | | MIG M. |
3914
  |=========================================+========================+======================|
3915
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3916
+ | N/A 34C P0 107W / 350W | 0MiB / 46068MiB | 53% Default |
3917
  | | | N/A |
3918
  +-----------------------------------------+------------------------+----------------------+
3919
 
 
3937
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3938
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3939
  </span> |
3940
+ Cell: benchmark | 7.79s
3941
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3942
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3943
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3985
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3986
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3987
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3988
+ torch_layer_norm 3.56% 149.102us 49.42% 2.068ms 2.068ms 0.000us 0.00% 3.039ms 3.039ms 1
3989
+ aten::layer_norm 0.35% 14.790us 45.86% 1.919ms 639.751us 0.000us 0.00% 3.039ms 1.013ms 3
3990
+ aten::native_layer_norm 1.65% 69.001us 45.51% 1.904ms 634.821us 2.327ms 100.00% 3.039ms 1.013ms 3
3991
+ torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.329ms 100.06% 2.329ms 2.329ms 1
3992
+ void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 2.327ms 100.00% 2.327ms 775.759us 3
3993
+ Activity Buffer Request 41.45% 1.735ms 41.45% 1.735ms 1.735ms 711.588us 30.58% 711.588us 711.588us 1
3994
+ aten::empty 1.11% 46.511us 1.11% 46.511us 5.168us 0.000us 0.00% 0.000us 0.000us 9
3995
+ cudaLaunchKernel 1.13% 47.301us 1.13% 47.301us 15.767us 0.000us 0.00% 0.000us 0.000us 3
3996
+ aten::view 0.16% 6.890us 0.16% 6.890us 1.148us 0.000us 0.00% 0.000us 0.000us 6
3997
+ cudaDeviceSynchronize 50.58% 2.117ms 50.58% 2.117ms 2.117ms 0.000us 0.00% 0.000us 0.000us 1
3998
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3999
+ Self CPU time total: 4.185ms
4000
+ Self CUDA time total: 2.327ms
4001
 
4002
 
4003
 
 
4007
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4008
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4009
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4010
+ torch_layer_norm 1.05% 70.042us 28.68% 1.911ms 1.911ms 0.000us 0.00% 6.475ms 6.475ms 1
4011
+ aten::layer_norm 0.13% 8.728us 27.63% 1.841ms 613.810us 0.000us 0.00% 6.475ms 2.158ms 3
4012
+ aten::native_layer_norm 0.73% 48.442us 27.50% 1.833ms 610.901us 4.886ms 100.00% 6.475ms 2.158ms 3
4013
+ torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.888ms 100.03% 4.888ms 4.888ms 1
4014
+ void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 4.886ms 100.00% 4.886ms 1.629ms 3
4015
+ Activity Buffer Request 25.85% 1.723ms 25.85% 1.723ms 1.723ms 1.589ms 32.51% 1.589ms 1.589ms 1
4016
+ aten::empty 0.43% 28.711us 0.43% 28.711us 3.190us 0.000us 0.00% 0.000us 0.000us 9
4017
+ cudaLaunchKernel 0.44% 29.201us 0.44% 29.201us 9.734us 0.000us 0.00% 0.000us 0.000us 3
4018
+ aten::view 0.06% 3.829us 0.06% 3.829us 0.638us 0.000us 0.00% 0.000us 0.000us 6
4019
+ cudaDeviceSynchronize 71.32% 4.753ms 71.32% 4.753ms 4.753ms 0.000us 0.00% 0.000us 0.000us 1
4020
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4021
+ Self CPU time total: 6.665ms
4022
+ Self CUDA time total: 4.886ms
4023
 
4024
 
4025
 
 
4029
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4030
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4031
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4032
+ torch_layer_norm 1.06% 69.120us 29.93% 1.960ms 1.960ms 0.000us 0.00% 6.232ms 6.232ms 1
4033
+ aten::layer_norm 0.13% 8.631us 28.88% 1.891ms 630.434us 0.000us 0.00% 6.232ms 2.077ms 3
4034
+ aten::native_layer_norm 0.71% 46.790us 28.75% 1.883ms 627.557us 4.719ms 100.00% 6.232ms 2.077ms 3
4035
+ torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.721ms 100.03% 4.721ms 4.721ms 1
4036
+ void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 4.719ms 100.00% 4.719ms 1.573ms 3
4037
+ Activity Buffer Request 27.06% 1.772ms 27.06% 1.772ms 1.772ms 1.513ms 32.05% 1.513ms 1.513ms 1
4038
+ aten::empty 0.45% 29.333us 0.45% 29.333us 3.259us 0.000us 0.00% 0.000us 0.000us 9
4039
+ cudaLaunchKernel 0.46% 30.200us 0.46% 30.200us 10.067us 0.000us 0.00% 0.000us 0.000us 3
4040
+ aten::view 0.06% 3.850us 0.06% 3.850us 0.642us 0.000us 0.00% 0.000us 0.000us 6
4041
+ cudaDeviceSynchronize 70.07% 4.589ms 70.07% 4.589ms 4.589ms 0.000us 0.00% 0.000us 0.000us 1
4042
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4043
+ Self CPU time total: 6.549ms
4044
+ Self CUDA time total: 4.719ms
4045
 
4046
 
4047
 
 
4051
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4052
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4053
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4054
+ torch_layer_norm 0.60% 67.701us 14.52% 1.650ms 1.650ms 0.000us 0.00% 13.091ms 13.091ms 1
4055
+ aten::layer_norm 0.08% 8.549us 13.92% 1.582ms 527.445us 0.000us 0.00% 13.091ms 4.364ms 3
4056
+ aten::native_layer_norm 0.41% 47.051us 13.85% 1.574ms 524.596us 9.846ms 100.00% 13.091ms 4.364ms 3
4057
+ torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 9.848ms 100.02% 9.848ms 9.848ms 1
4058
+ void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 9.846ms 100.00% 9.846ms 3.282ms 3
4059
+ Activity Buffer Request 11.12% 1.264ms 11.12% 1.264ms 1.264ms 3.245ms 32.96% 3.245ms 3.245ms 1
4060
+ aten::empty 0.26% 29.420us 0.26% 29.420us 3.269us 0.000us 0.00% 0.000us 0.000us 9
4061
+ cudaLaunchKernel 2.02% 229.604us 2.02% 229.604us 76.535us 0.000us 0.00% 0.000us 0.000us 3
4062
+ aten::view 0.04% 3.990us 0.04% 3.990us 0.665us 0.000us 0.00% 0.000us 0.000us 6
4063
+ cudaDeviceSynchronize 85.48% 9.715ms 85.48% 9.715ms 9.715ms 0.000us 0.00% 0.000us 0.000us 1
4064
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4065
+ Self CPU time total: 11.365ms
4066
+ Self CUDA time total: 9.846ms
4067
 
4068
 
4069
  impl wl p50(ms) ok
4070
+ torch_layer_norm LN_B16_S2048_D4096 0.81 True
4071
  torch_layer_norm LN_B16_S2048_D8192 1.68 True
4072
  torch_layer_norm LN_B16_S4096_D4096 1.61 True
4073
  torch_layer_norm LN_B16_S4096_D8192 3.33 True
 
4075
  <div class="uv-install-logs" id="uv-logs-benchmark">
4076
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4077
  <div class="uv-logs-content" style="display: none;">
4078
+ Installed 37 packages in 298ms
4079
  </div>
4080
  </div>
4081
  <div class="cell-artifacts">
layer_norm/results/artifacts/combine/latency.svg CHANGED

Git LFS Details

  • SHA256: 89b9bc3a43e718cff3bcbca28b57be1049bf5906666d437e6fb05e36ce003086
  • Pointer size: 130 Bytes
  • Size of remote file: 14.6 kB

Git LFS Details

  • SHA256: 7e491e55a24ade71662af81d8d2a6705d52134907b596d3edfe9685af71c4890
  • Pointer size: 130 Bytes
  • Size of remote file: 14.6 kB
layer_norm/results/combined_results.html CHANGED
@@ -3874,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
3874
  <div class="system-info">
3875
  <div class="system-info-header">Generated on:</div>
3876
  <div class="system-info-content">
3877
- Linux x86_64 | Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35
3878
  </div>
3879
  </div>
3880
 
@@ -3889,11 +3889,11 @@ body[data-tool="eraser"] .main-content {
3889
  <rdf:RDF>
3890
  <ns2:Work>
3891
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
3892
- <dc:date>2025-11-10T22:12:10.245468</dc:date>
3893
  <dc:format>image/svg+xml</dc:format>
3894
  <dc:creator>
3895
  <ns2:Agent>
3896
- <dc:title>Matplotlib v3.10.7, https://matplotlib.org/</dc:title>
3897
  </ns2:Agent>
3898
  </dc:creator>
3899
  </ns2:Work>
@@ -3973,70 +3973,70 @@ body[data-tool="eraser"] .main-content {
3973
  <g id="matplotlib.axis_2">
3974
  <g id="ytick_1">
3975
  <g id="grid-y--2" class="grid grid-y">
3976
- <path d="M 47.72 410.499175 L 840.20233 410.499175 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3977
  </g>
3978
  <g id="line2d_5">
3979
  <defs>
3980
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
3981
  </defs>
3982
  <g>
3983
- <use ns4:href="#m0fca2865ba" x="47.72" y="410.499175" style="stroke: #000000; stroke-width: 0.8" />
3984
  </g>
3985
  </g>
3986
  <g id="text_5">
3987
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="414.298394" transform="rotate(-0 40.72 414.298394)">1.0</text>
3988
  </g>
3989
  </g>
3990
  <g id="ytick_2">
3991
  <g id="grid-y--3" class="grid grid-y">
3992
- <path d="M 47.72 332.509284 L 840.20233 332.509284 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3993
  </g>
3994
  <g id="line2d_6">
3995
  <g>
3996
- <use ns4:href="#m0fca2865ba" x="47.72" y="332.509284" style="stroke: #000000; stroke-width: 0.8" />
3997
  </g>
3998
  </g>
3999
  <g id="text_6">
4000
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="336.308502" transform="rotate(-0 40.72 336.308502)">1.5</text>
4001
  </g>
4002
  </g>
4003
  <g id="ytick_3">
4004
  <g id="grid-y--4" class="grid grid-y">
4005
- <path d="M 47.72 254.519392 L 840.20233 254.519392 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4006
  </g>
4007
  <g id="line2d_7">
4008
  <g>
4009
- <use ns4:href="#m0fca2865ba" x="47.72" y="254.519392" style="stroke: #000000; stroke-width: 0.8" />
4010
  </g>
4011
  </g>
4012
  <g id="text_7">
4013
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="258.318611" transform="rotate(-0 40.72 258.318611)">2.0</text>
4014
  </g>
4015
  </g>
4016
  <g id="ytick_4">
4017
  <g id="grid-y--5" class="grid grid-y">
4018
- <path d="M 47.72 176.5295 L 840.20233 176.5295 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4019
  </g>
4020
  <g id="line2d_8">
4021
  <g>
4022
- <use ns4:href="#m0fca2865ba" x="47.72" y="176.5295" style="stroke: #000000; stroke-width: 0.8" />
4023
  </g>
4024
  </g>
4025
  <g id="text_8">
4026
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="180.328719" transform="rotate(-0 40.72 180.328719)">2.5</text>
4027
  </g>
4028
  </g>
4029
  <g id="ytick_5">
4030
  <g id="grid-y--6" class="grid grid-y">
4031
- <path d="M 47.72 98.539608 L 840.20233 98.539608 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4032
  </g>
4033
  <g id="line2d_9">
4034
  <g>
4035
- <use ns4:href="#m0fca2865ba" x="47.72" y="98.539608" style="stroke: #000000; stroke-width: 0.8" />
4036
  </g>
4037
  </g>
4038
  <g id="text_9">
4039
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="102.338827" transform="rotate(-0 40.72 102.338827)">3.0</text>
4040
  </g>
4041
  </g>
4042
  <g id="label--y" class="ylabel">
@@ -4044,27 +4044,27 @@ body[data-tool="eraser"] .main-content {
4044
  </g>
4045
  </g>
4046
  <g id="series--torch-layer-norm" class="series">
4047
- <path d="M 83.741924 437.689571 L 323.888085 305.054658 L 564.034245 315.620729 L 804.180406 46.442361 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4048
  <defs>
4049
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4050
  </defs>
4051
  <g clip-path="url(#p2214f54723)">
4052
  <use ns4:href="#md7efaf3aec" x="83.741924" y="437.689571" style="fill: #1f77b4; stroke: #1f77b4" />
4053
- <use ns4:href="#md7efaf3aec" x="323.888085" y="305.054658" style="fill: #1f77b4; stroke: #1f77b4" />
4054
- <use ns4:href="#md7efaf3aec" x="564.034245" y="315.620729" style="fill: #1f77b4; stroke: #1f77b4" />
4055
  <use ns4:href="#md7efaf3aec" x="804.180406" y="46.442361" style="fill: #1f77b4; stroke: #1f77b4" />
4056
  </g>
4057
  </g>
4058
  <g id="series--hf-kernels-layer-norm" class="series">
4059
- <path d="M 83.741924 436.885808 L 323.888085 309.169093 L 564.034245 309.159734 L 804.180406 59.128196 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4060
  <defs>
4061
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4062
  </defs>
4063
  <g clip-path="url(#p2214f54723)">
4064
- <use ns4:href="#m9b8c54d372" x="83.741924" y="436.885808" style="fill: #ff7f0e; stroke: #ff7f0e" />
4065
- <use ns4:href="#m9b8c54d372" x="323.888085" y="309.169093" style="fill: #ff7f0e; stroke: #ff7f0e" />
4066
- <use ns4:href="#m9b8c54d372" x="564.034245" y="309.159734" style="fill: #ff7f0e; stroke: #ff7f0e" />
4067
- <use ns4:href="#m9b8c54d372" x="804.180406" y="59.128196" style="fill: #ff7f0e; stroke: #ff7f0e" />
4068
  </g>
4069
  </g>
4070
  <g id="patch_3">
@@ -4122,7 +4122,7 @@ body[data-tool="eraser"] .main-content {
4122
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
4123
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
4124
  </span> |
4125
- Cell: combine | 4.44s
4126
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
4127
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
4128
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
@@ -4210,10 +4210,10 @@ COMBINED BENCHMARK SUMMARY
4210
 
4211
  impl wl p50(ms) ok
4212
  hf_kernels_layer_norm LN_B16_S2048_D4096 0.83 True
4213
- hf_kernels_layer_norm LN_B16_S2048_D8192 1.65 True
4214
  hf_kernels_layer_norm LN_B16_S4096_D4096 1.65 True
4215
  hf_kernels_layer_norm LN_B16_S4096_D8192 3.25 True
4216
- torch_layer_norm LN_B16_S2048_D4096 0.83 True
4217
  torch_layer_norm LN_B16_S2048_D8192 1.68 True
4218
  torch_layer_norm LN_B16_S4096_D4096 1.61 True
4219
  torch_layer_norm LN_B16_S4096_D8192 3.33 True
@@ -4236,7 +4236,7 @@ Implementations included:
4236
  <div class="uv-install-logs" id="uv-logs-combine">
4237
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4238
  <div class="uv-logs-content" style="display: none;">
4239
- Installed 37 packages in 284ms
4240
  </div>
4241
  </div>
4242
  <div class="cell-artifacts">
@@ -4249,11 +4249,11 @@ Installed 37 packages in 284ms
4249
  <rdf:RDF>
4250
  <ns2:Work>
4251
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4252
- <dc:date>2025-11-10T22:12:10.245468</dc:date>
4253
  <dc:format>image/svg+xml</dc:format>
4254
  <dc:creator>
4255
  <ns2:Agent>
4256
- <dc:title>Matplotlib v3.10.7, https://matplotlib.org/</dc:title>
4257
  </ns2:Agent>
4258
  </dc:creator>
4259
  </ns2:Work>
@@ -4333,70 +4333,70 @@ Installed 37 packages in 284ms
4333
  <g id="matplotlib.axis_2">
4334
  <g id="ytick_1">
4335
  <g id="grid-y--2" class="grid grid-y">
4336
- <path d="M 47.72 410.499175 L 840.20233 410.499175 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4337
  </g>
4338
  <g id="line2d_5">
4339
  <defs>
4340
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4341
  </defs>
4342
  <g>
4343
- <use ns4:href="#m0fca2865ba" x="47.72" y="410.499175" style="stroke: #000000; stroke-width: 0.8" />
4344
  </g>
4345
  </g>
4346
  <g id="text_5">
4347
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="414.298394" transform="rotate(-0 40.72 414.298394)">1.0</text>
4348
  </g>
4349
  </g>
4350
  <g id="ytick_2">
4351
  <g id="grid-y--3" class="grid grid-y">
4352
- <path d="M 47.72 332.509284 L 840.20233 332.509284 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4353
  </g>
4354
  <g id="line2d_6">
4355
  <g>
4356
- <use ns4:href="#m0fca2865ba" x="47.72" y="332.509284" style="stroke: #000000; stroke-width: 0.8" />
4357
  </g>
4358
  </g>
4359
  <g id="text_6">
4360
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="336.308502" transform="rotate(-0 40.72 336.308502)">1.5</text>
4361
  </g>
4362
  </g>
4363
  <g id="ytick_3">
4364
  <g id="grid-y--4" class="grid grid-y">
4365
- <path d="M 47.72 254.519392 L 840.20233 254.519392 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4366
  </g>
4367
  <g id="line2d_7">
4368
  <g>
4369
- <use ns4:href="#m0fca2865ba" x="47.72" y="254.519392" style="stroke: #000000; stroke-width: 0.8" />
4370
  </g>
4371
  </g>
4372
  <g id="text_7">
4373
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="258.318611" transform="rotate(-0 40.72 258.318611)">2.0</text>
4374
  </g>
4375
  </g>
4376
  <g id="ytick_4">
4377
  <g id="grid-y--5" class="grid grid-y">
4378
- <path d="M 47.72 176.5295 L 840.20233 176.5295 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4379
  </g>
4380
  <g id="line2d_8">
4381
  <g>
4382
- <use ns4:href="#m0fca2865ba" x="47.72" y="176.5295" style="stroke: #000000; stroke-width: 0.8" />
4383
  </g>
4384
  </g>
4385
  <g id="text_8">
4386
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="180.328719" transform="rotate(-0 40.72 180.328719)">2.5</text>
4387
  </g>
4388
  </g>
4389
  <g id="ytick_5">
4390
  <g id="grid-y--6" class="grid grid-y">
4391
- <path d="M 47.72 98.539608 L 840.20233 98.539608 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4392
  </g>
4393
  <g id="line2d_9">
4394
  <g>
4395
- <use ns4:href="#m0fca2865ba" x="47.72" y="98.539608" style="stroke: #000000; stroke-width: 0.8" />
4396
  </g>
4397
  </g>
4398
  <g id="text_9">
4399
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="102.338827" transform="rotate(-0 40.72 102.338827)">3.0</text>
4400
  </g>
4401
  </g>
4402
  <g id="label--y" class="ylabel">
@@ -4404,27 +4404,27 @@ Installed 37 packages in 284ms
4404
  </g>
4405
  </g>
4406
  <g id="series--torch-layer-norm" class="series">
4407
- <path d="M 83.741924 437.689571 L 323.888085 305.054658 L 564.034245 315.620729 L 804.180406 46.442361 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4408
  <defs>
4409
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4410
  </defs>
4411
  <g clip-path="url(#p2214f54723)">
4412
  <use ns4:href="#md7efaf3aec" x="83.741924" y="437.689571" style="fill: #1f77b4; stroke: #1f77b4" />
4413
- <use ns4:href="#md7efaf3aec" x="323.888085" y="305.054658" style="fill: #1f77b4; stroke: #1f77b4" />
4414
- <use ns4:href="#md7efaf3aec" x="564.034245" y="315.620729" style="fill: #1f77b4; stroke: #1f77b4" />
4415
  <use ns4:href="#md7efaf3aec" x="804.180406" y="46.442361" style="fill: #1f77b4; stroke: #1f77b4" />
4416
  </g>
4417
  </g>
4418
  <g id="series--hf-kernels-layer-norm" class="series">
4419
- <path d="M 83.741924 436.885808 L 323.888085 309.169093 L 564.034245 309.159734 L 804.180406 59.128196 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4420
  <defs>
4421
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4422
  </defs>
4423
  <g clip-path="url(#p2214f54723)">
4424
- <use ns4:href="#m9b8c54d372" x="83.741924" y="436.885808" style="fill: #ff7f0e; stroke: #ff7f0e" />
4425
- <use ns4:href="#m9b8c54d372" x="323.888085" y="309.169093" style="fill: #ff7f0e; stroke: #ff7f0e" />
4426
- <use ns4:href="#m9b8c54d372" x="564.034245" y="309.159734" style="fill: #ff7f0e; stroke: #ff7f0e" />
4427
- <use ns4:href="#m9b8c54d372" x="804.180406" y="59.128196" style="fill: #ff7f0e; stroke: #ff7f0e" />
4428
  </g>
4429
  </g>
4430
  <g id="patch_3">
 
3874
  <div class="system-info">
3875
  <div class="system-info-header">Generated on:</div>
3876
  <div class="system-info-content">
3877
+ Linux x86_64 | Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35
3878
  </div>
3879
  </div>
3880
 
 
3889
  <rdf:RDF>
3890
  <ns2:Work>
3891
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
3892
+ <dc:date>2025-12-19T19:09:50.663153</dc:date>
3893
  <dc:format>image/svg+xml</dc:format>
3894
  <dc:creator>
3895
  <ns2:Agent>
3896
+ <dc:title>Matplotlib v3.10.8, https://matplotlib.org/</dc:title>
3897
  </ns2:Agent>
3898
  </dc:creator>
3899
  </ns2:Work>
 
3973
  <g id="matplotlib.axis_2">
3974
  <g id="ytick_1">
3975
  <g id="grid-y--2" class="grid grid-y">
3976
+ <path d="M 47.72 408.774166 L 840.20233 408.774166 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3977
  </g>
3978
  <g id="line2d_5">
3979
  <defs>
3980
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
3981
  </defs>
3982
  <g>
3983
+ <use ns4:href="#m0fca2865ba" x="47.72" y="408.774166" style="stroke: #000000; stroke-width: 0.8" />
3984
  </g>
3985
  </g>
3986
  <g id="text_5">
3987
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="412.573385" transform="rotate(-0 40.72 412.573385)">1.0</text>
3988
  </g>
3989
  </g>
3990
  <g id="ytick_2">
3991
  <g id="grid-y--3" class="grid grid-y">
3992
+ <path d="M 47.72 330.886714 L 840.20233 330.886714 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3993
  </g>
3994
  <g id="line2d_6">
3995
  <g>
3996
+ <use ns4:href="#m0fca2865ba" x="47.72" y="330.886714" style="stroke: #000000; stroke-width: 0.8" />
3997
  </g>
3998
  </g>
3999
  <g id="text_6">
4000
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="334.685933" transform="rotate(-0 40.72 334.685933)">1.5</text>
4001
  </g>
4002
  </g>
4003
  <g id="ytick_3">
4004
  <g id="grid-y--4" class="grid grid-y">
4005
+ <path d="M 47.72 252.999261 L 840.20233 252.999261 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4006
  </g>
4007
  <g id="line2d_7">
4008
  <g>
4009
+ <use ns4:href="#m0fca2865ba" x="47.72" y="252.999261" style="stroke: #000000; stroke-width: 0.8" />
4010
  </g>
4011
  </g>
4012
  <g id="text_7">
4013
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="256.79848" transform="rotate(-0 40.72 256.79848)">2.0</text>
4014
  </g>
4015
  </g>
4016
  <g id="ytick_4">
4017
  <g id="grid-y--5" class="grid grid-y">
4018
+ <path d="M 47.72 175.111809 L 840.20233 175.111809 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4019
  </g>
4020
  <g id="line2d_8">
4021
  <g>
4022
+ <use ns4:href="#m0fca2865ba" x="47.72" y="175.111809" style="stroke: #000000; stroke-width: 0.8" />
4023
  </g>
4024
  </g>
4025
  <g id="text_8">
4026
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="178.911028" transform="rotate(-0 40.72 178.911028)">2.5</text>
4027
  </g>
4028
  </g>
4029
  <g id="ytick_5">
4030
  <g id="grid-y--6" class="grid grid-y">
4031
+ <path d="M 47.72 97.224356 L 840.20233 97.224356 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4032
  </g>
4033
  <g id="line2d_9">
4034
  <g>
4035
+ <use ns4:href="#m0fca2865ba" x="47.72" y="97.224356" style="stroke: #000000; stroke-width: 0.8" />
4036
  </g>
4037
  </g>
4038
  <g id="text_9">
4039
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="101.023575" transform="rotate(-0 40.72 101.023575)">3.0</text>
4040
  </g>
4041
  </g>
4042
  <g id="label--y" class="ylabel">
 
4044
  </g>
4045
  </g>
4046
  <g id="series--torch-layer-norm" class="series">
4047
+ <path d="M 83.741924 437.689571 L 323.888085 302.950354 L 564.034245 314.128917 L 804.180406 46.442361 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4048
  <defs>
4049
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4050
  </defs>
4051
  <g clip-path="url(#p2214f54723)">
4052
  <use ns4:href="#md7efaf3aec" x="83.741924" y="437.689571" style="fill: #1f77b4; stroke: #1f77b4" />
4053
+ <use ns4:href="#md7efaf3aec" x="323.888085" y="302.950354" style="fill: #1f77b4; stroke: #1f77b4" />
4054
+ <use ns4:href="#md7efaf3aec" x="564.034245" y="314.128917" style="fill: #1f77b4; stroke: #1f77b4" />
4055
  <use ns4:href="#md7efaf3aec" x="804.180406" y="46.442361" style="fill: #1f77b4; stroke: #1f77b4" />
4056
  </g>
4057
  </g>
4058
  <g id="series--hf-kernels-layer-norm" class="series">
4059
+ <path d="M 83.741924 434.70508 L 323.888085 305.990613 L 564.034245 307.267967 L 804.180406 57.889324 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4060
  <defs>
4061
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4062
  </defs>
4063
  <g clip-path="url(#p2214f54723)">
4064
+ <use ns4:href="#m9b8c54d372" x="83.741924" y="434.70508" style="fill: #ff7f0e; stroke: #ff7f0e" />
4065
+ <use ns4:href="#m9b8c54d372" x="323.888085" y="305.990613" style="fill: #ff7f0e; stroke: #ff7f0e" />
4066
+ <use ns4:href="#m9b8c54d372" x="564.034245" y="307.267967" style="fill: #ff7f0e; stroke: #ff7f0e" />
4067
+ <use ns4:href="#m9b8c54d372" x="804.180406" y="57.889324" style="fill: #ff7f0e; stroke: #ff7f0e" />
4068
  </g>
4069
  </g>
4070
  <g id="patch_3">
 
4122
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
4123
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
4124
  </span> |
4125
+ Cell: combine | 4.51s
4126
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
4127
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
4128
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
 
4210
 
4211
  impl wl p50(ms) ok
4212
  hf_kernels_layer_norm LN_B16_S2048_D4096 0.83 True
4213
+ hf_kernels_layer_norm LN_B16_S2048_D8192 1.66 True
4214
  hf_kernels_layer_norm LN_B16_S4096_D4096 1.65 True
4215
  hf_kernels_layer_norm LN_B16_S4096_D8192 3.25 True
4216
+ torch_layer_norm LN_B16_S2048_D4096 0.81 True
4217
  torch_layer_norm LN_B16_S2048_D8192 1.68 True
4218
  torch_layer_norm LN_B16_S4096_D4096 1.61 True
4219
  torch_layer_norm LN_B16_S4096_D8192 3.33 True
 
4236
  <div class="uv-install-logs" id="uv-logs-combine">
4237
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4238
  <div class="uv-logs-content" style="display: none;">
4239
+ Installed 37 packages in 297ms
4240
  </div>
4241
  </div>
4242
  <div class="cell-artifacts">
 
4249
  <rdf:RDF>
4250
  <ns2:Work>
4251
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4252
+ <dc:date>2025-12-19T19:09:50.663153</dc:date>
4253
  <dc:format>image/svg+xml</dc:format>
4254
  <dc:creator>
4255
  <ns2:Agent>
4256
+ <dc:title>Matplotlib v3.10.8, https://matplotlib.org/</dc:title>
4257
  </ns2:Agent>
4258
  </dc:creator>
4259
  </ns2:Work>
 
4333
  <g id="matplotlib.axis_2">
4334
  <g id="ytick_1">
4335
  <g id="grid-y--2" class="grid grid-y">
4336
+ <path d="M 47.72 408.774166 L 840.20233 408.774166 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4337
  </g>
4338
  <g id="line2d_5">
4339
  <defs>
4340
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4341
  </defs>
4342
  <g>
4343
+ <use ns4:href="#m0fca2865ba" x="47.72" y="408.774166" style="stroke: #000000; stroke-width: 0.8" />
4344
  </g>
4345
  </g>
4346
  <g id="text_5">
4347
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="412.573385" transform="rotate(-0 40.72 412.573385)">1.0</text>
4348
  </g>
4349
  </g>
4350
  <g id="ytick_2">
4351
  <g id="grid-y--3" class="grid grid-y">
4352
+ <path d="M 47.72 330.886714 L 840.20233 330.886714 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4353
  </g>
4354
  <g id="line2d_6">
4355
  <g>
4356
+ <use ns4:href="#m0fca2865ba" x="47.72" y="330.886714" style="stroke: #000000; stroke-width: 0.8" />
4357
  </g>
4358
  </g>
4359
  <g id="text_6">
4360
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="334.685933" transform="rotate(-0 40.72 334.685933)">1.5</text>
4361
  </g>
4362
  </g>
4363
  <g id="ytick_3">
4364
  <g id="grid-y--4" class="grid grid-y">
4365
+ <path d="M 47.72 252.999261 L 840.20233 252.999261 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4366
  </g>
4367
  <g id="line2d_7">
4368
  <g>
4369
+ <use ns4:href="#m0fca2865ba" x="47.72" y="252.999261" style="stroke: #000000; stroke-width: 0.8" />
4370
  </g>
4371
  </g>
4372
  <g id="text_7">
4373
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="256.79848" transform="rotate(-0 40.72 256.79848)">2.0</text>
4374
  </g>
4375
  </g>
4376
  <g id="ytick_4">
4377
  <g id="grid-y--5" class="grid grid-y">
4378
+ <path d="M 47.72 175.111809 L 840.20233 175.111809 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4379
  </g>
4380
  <g id="line2d_8">
4381
  <g>
4382
+ <use ns4:href="#m0fca2865ba" x="47.72" y="175.111809" style="stroke: #000000; stroke-width: 0.8" />
4383
  </g>
4384
  </g>
4385
  <g id="text_8">
4386
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="178.911028" transform="rotate(-0 40.72 178.911028)">2.5</text>
4387
  </g>
4388
  </g>
4389
  <g id="ytick_5">
4390
  <g id="grid-y--6" class="grid grid-y">
4391
+ <path d="M 47.72 97.224356 L 840.20233 97.224356 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4392
  </g>
4393
  <g id="line2d_9">
4394
  <g>
4395
+ <use ns4:href="#m0fca2865ba" x="47.72" y="97.224356" style="stroke: #000000; stroke-width: 0.8" />
4396
  </g>
4397
  </g>
4398
  <g id="text_9">
4399
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="101.023575" transform="rotate(-0 40.72 101.023575)">3.0</text>
4400
  </g>
4401
  </g>
4402
  <g id="label--y" class="ylabel">
 
4404
  </g>
4405
  </g>
4406
  <g id="series--torch-layer-norm" class="series">
4407
+ <path d="M 83.741924 437.689571 L 323.888085 302.950354 L 564.034245 314.128917 L 804.180406 46.442361 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4408
  <defs>
4409
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4410
  </defs>
4411
  <g clip-path="url(#p2214f54723)">
4412
  <use ns4:href="#md7efaf3aec" x="83.741924" y="437.689571" style="fill: #1f77b4; stroke: #1f77b4" />
4413
+ <use ns4:href="#md7efaf3aec" x="323.888085" y="302.950354" style="fill: #1f77b4; stroke: #1f77b4" />
4414
+ <use ns4:href="#md7efaf3aec" x="564.034245" y="314.128917" style="fill: #1f77b4; stroke: #1f77b4" />
4415
  <use ns4:href="#md7efaf3aec" x="804.180406" y="46.442361" style="fill: #1f77b4; stroke: #1f77b4" />
4416
  </g>
4417
  </g>
4418
  <g id="series--hf-kernels-layer-norm" class="series">
4419
+ <path d="M 83.741924 434.70508 L 323.888085 305.990613 L 564.034245 307.267967 L 804.180406 57.889324 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4420
  <defs>
4421
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4422
  </defs>
4423
  <g clip-path="url(#p2214f54723)">
4424
+ <use ns4:href="#m9b8c54d372" x="83.741924" y="434.70508" style="fill: #ff7f0e; stroke: #ff7f0e" />
4425
+ <use ns4:href="#m9b8c54d372" x="323.888085" y="305.990613" style="fill: #ff7f0e; stroke: #ff7f0e" />
4426
+ <use ns4:href="#m9b8c54d372" x="564.034245" y="307.267967" style="fill: #ff7f0e; stroke: #ff7f0e" />
4427
+ <use ns4:href="#m9b8c54d372" x="804.180406" y="57.889324" style="fill: #ff7f0e; stroke: #ff7f0e" />
4428
  </g>
4429
  </g>
4430
  <g id="patch_3">
openai_moe/impls/artifacts/benchmark/openai_moe.jsonl CHANGED
@@ -1,8 +1,8 @@
1
- {"ts": "2025-11-10T21:59:28Z", "run": "1939dc0ee47a4164bf38304335c67bc8", "impl": "binned_torch", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_E2", "batch": 1, "seq_len": 512, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 2, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 153.23935800000754, "p50": 154.66906200003905, "p90": 155.3045599999905, "mean": 154.4065966000062, "iqr": 1.9825210000021798, "raw_times": [154.66906200003905, 153.23935800000754, 153.3220389999883, 155.3045599999905, 155.4979640000056], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 150.5313740000247, "peak_bytes": 416866816, "ok": true, "absmax": 2.765655517578125e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 2.765655517578125e-05, "mae": 2.0696452338597737e-06, "mse": 7.332408985538663e-12, "ref": "naive_moe"}, "err": null}
2
- {"ts": "2025-11-10T21:59:51Z", "run": "1939dc0ee47a4164bf38304335c67bc8", "impl": "binned_torch", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_E4", "batch": 1, "seq_len": 512, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 4, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 200.93769199996814, "p50": 201.49722299998984, "p90": 202.5282779999884, "mean": 202.0041708000008, "iqr": 1.4469799999687893, "raw_times": [201.08129800001961, 203.97636300003796, 200.93769199996814, 201.49722299998984, 202.5282779999884], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 201.3829520000172, "peak_bytes": 632035840, "ok": true, "absmax": 1.621246337890625e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 1.621246337890625e-05, "mae": 9.61917862696282e-07, "mse": 1.59423277530657e-12, "ref": "naive_moe"}, "err": null}
3
- {"ts": "2025-11-10T22:00:35Z", "run": "1939dc0ee47a4164bf38304335c67bc8", "impl": "binned_torch", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S1024_E2", "batch": 1, "seq_len": 1024, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 2, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 367.22704099997827, "p50": 367.62146799998163, "p90": 367.7445199999738, "mean": 367.9209119999882, "iqr": 0.4843269999810218, "raw_times": [369.7513380000146, 367.2601929999928, 367.62146799998163, 367.7445199999738, 367.22704099997827], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 368.07921899998064, "peak_bytes": 643844608, "ok": true, "absmax": 2.6226043701171875e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 2.6226043701171875e-05, "mae": 2.0501920516835526e-06, "mse": 7.1848811622476916e-12, "ref": "naive_moe"}, "err": null}
4
- {"ts": "2025-11-10T22:01:22Z", "run": "1939dc0ee47a4164bf38304335c67bc8", "impl": "binned_torch", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S1024_E4", "batch": 1, "seq_len": 1024, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 4, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 392.9537719999985, "p50": 394.19261099999403, "p90": 394.552635000025, "mean": 394.1458786000112, "iqr": 1.3762300000053074, "raw_times": [393.1764050000197, 394.19261099999403, 392.9537719999985, 394.552635000025, 395.85397000001876], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 399.8835020000229, "peak_bytes": 823386112, "ok": true, "absmax": 1.3589859008789062e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 1.3589859008789062e-05, "mae": 9.400179123986163e-07, "mse": 1.5130355735665235e-12, "ref": "naive_moe"}, "err": null}
5
- {"ts": "2025-11-10T22:02:51Z", "run": "1939dc0ee47a4164bf38304335c67bc8", "impl": "binned_torch", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_S512_E2", "batch": 4, "seq_len": 512, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 2, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 734.9415649999855, "p50": 736.2597970000024, "p90": 736.4179590000504, "mean": 736.8552042000147, "iqr": 0.5320090000395794, "raw_times": [735.8859500000108, 736.4179590000504, 734.9415649999855, 736.2597970000024, 740.7707500000242], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 738.5199589999729, "peak_bytes": 1036112384, "ok": true, "absmax": 3.2901763916015625e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 3.2901763916015625e-05, "mae": 2.0572656467265915e-06, "mse": 7.247809123700488e-12, "ref": "naive_moe"}, "err": null}
6
- {"ts": "2025-11-10T22:04:32Z", "run": "1939dc0ee47a4164bf38304335c67bc8", "impl": "binned_torch", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_S512_E4", "batch": 4, "seq_len": 512, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 4, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 795.7670519999738, "p50": 798.8816239999323, "p90": 799.2389810000304, "mean": 798.3748011999751, "iqr": 0.5543240000633887, "raw_times": [798.684656999967, 798.8816239999323, 799.3016919999718, 799.2389810000304, 795.7670519999738], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 796.8497300000763, "peak_bytes": 1235263488, "ok": true, "absmax": 1.430511474609375e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 1.430511474609375e-05, "mae": 9.400343401466671e-07, "mse": 1.5107844445957919e-12, "ref": "naive_moe"}, "err": null}
7
- {"ts": "2025-11-10T22:07:29Z", "run": "1939dc0ee47a4164bf38304335c67bc8", "impl": "binned_torch", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_S1024_E2", "batch": 4, "seq_len": 1024, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 2, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1474.9918590000561, "p50": 1483.5365430000138, "p90": 1484.378332999995, "mean": 1483.3181600000216, "iqr": 3.7910559999545512, "raw_times": [1480.5872770000406, 1474.9918590000561, 1484.378332999995, 1483.5365430000138, 1493.0967880000026], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1492.4540110000635, "peak_bytes": 1861947904, "ok": true, "absmax": 2.6226043701171875e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 2.6226043701171875e-05, "mae": 2.060702854578267e-06, "mse": 7.262949790198814e-12, "ref": "naive_moe"}, "err": null}
8
- {"ts": "2025-11-10T22:10:52Z", "run": "1939dc0ee47a4164bf38304335c67bc8", "impl": "binned_torch", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_S1024_E4", "batch": 4, "seq_len": 1024, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 4, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1594.949616000008, "p50": 1601.9022579999955, "p90": 1602.6959760000636, "mean": 1600.7068320000144, "iqr": 2.6664300000902585, "raw_times": [1601.9022579999955, 1600.0295459999734, 1594.949616000008, 1602.6959760000636, 1603.9567640000314], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1575.0532499999963, "peak_bytes": 2062163968, "ok": true, "absmax": 1.5974044799804688e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 1.5974044799804688e-05, "mae": 9.529014732834185e-07, "mse": 1.5621694476192216e-12, "ref": "naive_moe"}, "err": null}
 
1
+ {"ts": "2025-12-19T18:57:39Z", "run": "aec86efe9ed1483a979a8427ac940bd6", "impl": "binned_torch", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_E2", "batch": 1, "seq_len": 512, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 2, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 155.7981640000321, "p50": 157.7297640000097, "p90": 159.48504900001126, "mean": 158.39911260001145, "iqr": 2.223896000032255, "raw_times": [161.72143300002517, 157.261152999979, 155.7981640000321, 159.48504900001126, 157.7297640000097], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 159.10347999999885, "peak_bytes": 416866816, "ok": true, "absmax": 2.765655517578125e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 2.765655517578125e-05, "mae": 2.0696452338597737e-06, "mse": 7.332408985538663e-12, "ref": "naive_moe"}, "err": null}
2
+ {"ts": "2025-12-19T18:58:03Z", "run": "aec86efe9ed1483a979a8427ac940bd6", "impl": "binned_torch", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_E4", "batch": 1, "seq_len": 512, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 4, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 199.79041199997027, "p50": 204.82147100000248, "p90": 205.0451750000093, "mean": 203.32668460000605, "iqr": 3.4747309999829668, "raw_times": [205.40592100002186, 199.79041199997027, 201.57044400002633, 205.0451750000093, 204.82147100000248], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 200.72428899999295, "peak_bytes": 632035840, "ok": true, "absmax": 1.621246337890625e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 1.621246337890625e-05, "mae": 9.61917862696282e-07, "mse": 1.59423277530657e-12, "ref": "naive_moe"}, "err": null}
3
+ {"ts": "2025-12-19T18:58:47Z", "run": "aec86efe9ed1483a979a8427ac940bd6", "impl": "binned_torch", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S1024_E2", "batch": 1, "seq_len": 1024, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 2, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 372.8170420000083, "p50": 383.31174900002907, "p90": 392.9121939999618, "mean": 385.07766660000016, "iqr": 10.251173999961338, "raw_times": [393.68632800000114, 392.9121939999618, 382.66102000000046, 383.31174900002907, 372.8170420000083], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 393.062126000018, "peak_bytes": 643844608, "ok": true, "absmax": 2.6226043701171875e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 2.6226043701171875e-05, "mae": 2.0501920516835526e-06, "mse": 7.1848811622476916e-12, "ref": "naive_moe"}, "err": null}
4
+ {"ts": "2025-12-19T18:59:36Z", "run": "aec86efe9ed1483a979a8427ac940bd6", "impl": "binned_torch", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S1024_E4", "batch": 1, "seq_len": 1024, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 4, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 418.8624209999716, "p50": 421.41534400002456, "p90": 422.4395519999007, "mean": 421.30189059998884, "iqr": 1.8283119999296105, "raw_times": [423.18089600007625, 421.41534400002456, 418.8624209999716, 420.6112399999711, 422.4395519999007], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 421.8970150000132, "peak_bytes": 823386112, "ok": true, "absmax": 1.3589859008789062e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 1.3589859008789062e-05, "mae": 9.400179123986163e-07, "mse": 1.5130355735665235e-12, "ref": "naive_moe"}, "err": null}
5
+ {"ts": "2025-12-19T19:01:05Z", "run": "aec86efe9ed1483a979a8427ac940bd6", "impl": "binned_torch", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_S512_E2", "batch": 4, "seq_len": 512, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 2, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 766.098573000022, "p50": 773.6994500000947, "p90": 774.865274999911, "mean": 772.9573942000115, "iqr": 8.746404999897095, "raw_times": [766.1188700000139, 773.6994500000947, 766.098573000022, 774.865274999911, 784.0048030000162], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 782.9079639999463, "peak_bytes": 1036112384, "ok": true, "absmax": 3.2901763916015625e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 3.2901763916015625e-05, "mae": 2.0572656467265915e-06, "mse": 7.247809123700488e-12, "ref": "naive_moe"}, "err": null}
6
+ {"ts": "2025-12-19T19:02:49Z", "run": "aec86efe9ed1483a979a8427ac940bd6", "impl": "binned_torch", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_S512_E4", "batch": 4, "seq_len": 512, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 4, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 828.9166780000414, "p50": 840.0145479999992, "p90": 848.174653000001, "mean": 841.7884347999916, "iqr": 11.353517000088686, "raw_times": [855.0151590000041, 828.9166780000414, 848.174653000001, 836.8211359999123, 840.0145479999992], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 857.4785790000305, "peak_bytes": 1235263488, "ok": true, "absmax": 1.430511474609375e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 1.430511474609375e-05, "mae": 9.400343401466671e-07, "mse": 1.5107844445957919e-12, "ref": "naive_moe"}, "err": null}
7
+ {"ts": "2025-12-19T19:05:50Z", "run": "aec86efe9ed1483a979a8427ac940bd6", "impl": "binned_torch", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_S1024_E2", "batch": 4, "seq_len": 1024, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 2, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1492.7651169999763, "p50": 1513.7102520000099, "p90": 1522.1755649999977, "mean": 1513.4781133999923, "iqr": 10.99431700004061, "raw_times": [1492.7651169999763, 1511.1812479999571, 1522.1755649999977, 1527.5583850000203, 1513.7102520000099], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1532.0516410000664, "peak_bytes": 1861947904, "ok": true, "absmax": 2.6226043701171875e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 2.6226043701171875e-05, "mae": 2.060702854578267e-06, "mse": 7.262949790198814e-12, "ref": "naive_moe"}, "err": null}
8
+ {"ts": "2025-12-19T19:09:07Z", "run": "aec86efe9ed1483a979a8427ac940bd6", "impl": "binned_torch", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_S1024_E4", "batch": 4, "seq_len": 1024, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 4, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1654.5569229999728, "p50": 1658.7427389999903, "p90": 1665.0588319999997, "mean": 1660.4780848000016, "iqr": 7.11779099992782, "raw_times": [1658.7427389999903, 1665.0588319999997, 1666.0908889999746, 1657.941041000072, 1654.5569229999728], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1670.381679000002, "peak_bytes": 2062163968, "ok": true, "absmax": 1.5974044799804688e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 1.5974044799804688e-05, "mae": 9.529014732834185e-07, "mse": 1.5621694476192216e-12, "ref": "naive_moe"}, "err": null}
openai_moe/impls/binned_torch.html CHANGED
@@ -3874,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
3874
  <div class="system-info">
3875
  <div class="system-info-header">Generated on:</div>
3876
  <div class="system-info-content">
3877
- Linux x86_64 | Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35
3878
  </div>
3879
  </div>
3880
 
@@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content {
3888
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3890
  </span> |
3891
- Cell: nv | 0.22s
3892
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3894
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3904,16 +3904,16 @@ Cell: nv | 0.22s
3904
  </div>
3905
  </div>
3906
  <div id="output-nv" class="cell-output">
3907
- <div class="cell-stdout"><pre class="stdout-text">Mon Nov 10 21:58:43 2025
3908
  +-----------------------------------------------------------------------------------------+
3909
- | NVIDIA-SMI 580.95.05 Driver Version: 580.95.05 CUDA Version: 13.0 |
3910
  +-----------------------------------------+------------------------+----------------------+
3911
  | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
3912
  | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
3913
  | | | MIG M. |
3914
  |=========================================+========================+======================|
3915
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3916
- | N/A 31C P0 78W / 350W | 0MiB / 46068MiB | 17% Default |
3917
  | | | N/A |
3918
  +-----------------------------------------+------------------------+----------------------+
3919
 
@@ -3937,7 +3937,7 @@ Cell: nv | 0.22s
3937
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3938
  <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3939
  </span> |
3940
- Cell: benchmark | 727.18s
3941
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3942
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3943
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -4095,29 +4095,29 @@ PROFILE TRACE: binned_torch | cuda_B1_S512_E2
4095
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4096
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4097
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4098
- binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 931.122ms 1835.78% 931.122ms 931.122ms 1
4099
- binned_torch 25.32% 236.300ms 100.00% 933.185ms 933.185ms 0.000us 0.00% 50.723ms 50.723ms 1
4100
- aten::item 1.92% 17.916ms 25.08% 234.061ms 15.253us 0.000us 0.00% 15.750ms 1.026us 15345
4101
- aten::_local_scalar_dense 5.72% 53.357ms 23.16% 216.145ms 14.086us 15.749ms 31.05% 15.750ms 1.026us 15345
4102
- Memcpy DtoH (Device -&gt; Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 15.749ms 31.05% 15.749ms 1.026us 15345
4103
- aten::floor_divide 5.56% 51.923ms 13.14% 122.652ms 19.963us 7.815ms 15.41% 7.815ms 1.272us 6144
4104
- aten::bmm 0.02% 190.442us 0.02% 231.383us 38.564us 7.780ms 15.34% 7.780ms 1.297ms 6
4105
- ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 7.780ms 15.34% 7.780ms 1.297ms 6
4106
- aten::copy_ 3.79% 35.401ms 9.18% 85.713ms 13.923us 6.584ms 12.98% 6.585ms 1.070us 6156
4107
- Memcpy DtoD (Device -&gt; Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.579ms 12.97% 6.579ms 1.069us 6153
4108
- aten::mul 3.06% 28.578ms 5.54% 51.726ms 16.789us 4.711ms 9.29% 4.711ms 1.529us 3081
4109
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 4.480ms 8.83% 4.480ms 1.458us 3072
4110
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 4.161ms 8.20% 4.161ms 1.354us 3072
4111
- aten::remainder 3.12% 29.137ms 4.83% 45.065ms 14.669us 3.840ms 7.57% 3.840ms 1.250us 3072
4112
- aten::add 2.80% 26.083ms 4.76% 44.381ms 14.633us 3.757ms 7.41% 3.757ms 1.239us 3033
4113
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 3.656ms 7.21% 3.656ms 1.190us 3072
4114
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 3.366ms 6.64% 3.366ms 1.111us 3030
4115
  void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.023ms 3.99% 2.023ms 1.317us 1536
4116
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 1.817ms 3.58% 1.817ms 1.183us 1536
4117
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 283.649us 0.56% 283.649us 47.275us 6
4118
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4119
- Self CPU time total: 933.193ms
4120
- Self CUDA time total: 50.721ms
4121
 
4122
 
4123
 
@@ -4127,29 +4127,29 @@ PROFILE TRACE: binned_torch | cuda_B1_S512_E4
4127
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4128
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4129
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4130
- binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 938.961ms 1720.32% 938.961ms 938.961ms 1
4131
- binned_torch 25.07% 235.565ms 100.00% 939.473ms 939.473ms 0.000us 0.00% 54.589ms 54.589ms 1
4132
- aten::item 1.76% 16.540ms 26.46% 248.589ms 14.679us 0.000us 0.00% 17.855ms 1.054us 16935
4133
- aten::_local_scalar_dense 5.69% 53.475ms 24.70% 232.048ms 13.702us 17.853ms 32.71% 17.855ms 1.054us 16935
4134
- Memcpy DtoH (Device -&gt; Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 17.853ms 32.71% 17.853ms 1.054us 16935
4135
- aten::bmm 0.02% 182.580us 0.02% 223.522us 37.254us 7.981ms 14.62% 7.981ms 1.330ms 6
4136
- ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 7.981ms 14.62% 7.981ms 1.330ms 6
4137
- aten::floor_divide 5.18% 48.644ms 12.51% 117.515ms 19.127us 7.813ms 14.31% 7.816ms 1.272us 6144
4138
- aten::copy_ 3.69% 34.686ms 8.73% 82.032ms 13.325us 6.629ms 12.15% 6.630ms 1.077us 6156
4139
- Memcpy DtoD (Device -&gt; Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.626ms 12.14% 6.626ms 1.077us 6153
4140
- aten::add 3.97% 37.266ms 6.91% 64.908ms 14.132us 5.261ms 9.64% 5.261ms 1.145us 4593
4141
- aten::mul 2.87% 26.992ms 5.23% 49.129ms 15.946us 4.699ms 8.61% 4.699ms 1.525us 3081
4142
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 4.475ms 8.20% 4.475ms 1.457us 3072
4143
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 4.158ms 7.62% 4.158ms 1.353us 3072
4144
- aten::remainder 2.85% 26.773ms 4.50% 42.318ms 13.775us 3.852ms 7.06% 3.852ms 1.254us 3072
4145
  void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 3.655ms 6.70% 3.655ms 1.190us 3072
4146
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 3.271ms 5.99% 3.271ms 1.080us 3030
4147
  void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.030ms 3.72% 2.030ms 1.322us 1536
4148
  void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 1.822ms 3.34% 1.822ms 1.186us 1536
4149
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 1.585ms 2.90% 1.585ms 1.016us 1560
4150
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4151
- Self CPU time total: 939.480ms
4152
- Self CUDA time total: 54.581ms
4153
 
4154
 
4155
 
@@ -4159,29 +4159,29 @@ PROFILE TRACE: binned_torch | cuda_B1_S1024_E2
4159
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4160
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4161
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4162
- binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 1.710s 1645.94% 1.710s 1.710s 1
4163
- binned_torch 23.47% 401.594ms 100.00% 1.711s 1.711s 0.000us 0.00% 103.932ms 103.932ms 1
4164
- aten::item 1.77% 30.361ms 27.00% 461.971ms 15.140us 0.000us 0.00% 31.541ms 1.034us 30513
4165
- aten::_local_scalar_dense 5.97% 102.153ms 25.22% 431.610ms 14.145us 31.538ms 30.35% 31.541ms 1.034us 30513
4166
- Memcpy DtoH (Device -&gt; Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 31.538ms 30.35% 31.538ms 1.034us 30513
4167
- aten::floor_divide 5.77% 98.697ms 13.68% 234.018ms 19.044us 15.598ms 15.01% 15.600ms 1.270us 12288
4168
- aten::bmm 0.01% 219.084us 0.02% 260.723us 43.454us 15.235ms 14.66% 15.235ms 2.539ms 6
4169
- ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 15.235ms 14.66% 15.235ms 2.539ms 6
4170
- aten::copy_ 3.97% 67.926ms 9.38% 160.451ms 13.045us 13.315ms 12.81% 13.316ms 1.083us 12300
4171
- Memcpy DtoD (Device -&gt; Device) 0.00% 0.000us 0.00% 0.000us 0.000us 13.311ms 12.81% 13.311ms 1.083us 12294
4172
- aten::mul 3.19% 54.637ms 5.82% 99.678ms 16.200us 11.250ms 10.83% 11.252ms 1.829us 6153
4173
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 9.903ms 9.53% 9.903ms 1.612us 6144
4174
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 8.304ms 7.99% 8.304ms 1.352us 6144
4175
- aten::remainder 3.07% 52.461ms 4.79% 82.008ms 13.348us 7.670ms 7.38% 7.671ms 1.249us 6144
4176
- aten::add 2.76% 47.163ms 4.86% 83.106ms 14.055us 7.632ms 7.34% 7.633ms 1.291us 5913
4177
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 7.294ms 7.02% 7.294ms 1.187us 6144
4178
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 6.354ms 6.11% 6.354ms 1.075us 5910
4179
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 4.041ms 3.89% 4.041ms 1.316us 3072
4180
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 3.629ms 3.49% 3.629ms 1.181us 3072
4181
- aten::clamp 0.00% 71.350us 0.01% 113.931us 18.988us 1.190ms 1.15% 1.190ms 198.366us 6
4182
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4183
- Self CPU time total: 1.711s
4184
- Self CUDA time total: 103.922ms
4185
 
4186
 
4187
 
@@ -4191,29 +4191,29 @@ PROFILE TRACE: binned_torch | cuda_B1_S1024_E4
4191
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4192
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4193
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4194
- binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 1.831s 1659.19% 1.831s 1.831s 1
4195
- binned_torch 23.77% 435.469ms 100.00% 1.832s 1.832s 0.000us 0.00% 110.361ms 110.361ms 1
4196
- aten::item 1.74% 31.875ms 27.52% 504.183ms 14.948us 0.000us 0.00% 34.964ms 1.037us 33729
4197
- aten::_local_scalar_dense 6.20% 113.521ms 25.78% 472.309ms 14.003us 34.961ms 31.68% 34.964ms 1.037us 33729
4198
- Memcpy DtoH (Device -&gt; Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 34.961ms 31.68% 34.961ms 1.037us 33729
4199
- aten::floor_divide 5.21% 95.369ms 12.55% 229.877ms 18.707us 15.595ms 14.13% 15.597ms 1.269us 12288
4200
- aten::bmm 0.01% 225.035us 0.01% 267.825us 44.638us 15.231ms 13.80% 15.231ms 2.539ms 6
4201
- ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 15.231ms 13.80% 15.231ms 2.539ms 6
4202
- aten::copy_ 3.69% 67.648ms 8.80% 161.241ms 13.109us 13.343ms 12.09% 13.347ms 1.085us 12300
4203
- Memcpy DtoD (Device -&gt; Device) 0.00% 0.000us 0.00% 0.000us 0.000us 13.340ms 12.09% 13.340ms 1.085us 12297
4204
- aten::mul 2.99% 54.761ms 5.39% 98.799ms 16.057us 10.934ms 9.91% 10.936ms 1.777us 6153
4205
- aten::add 3.91% 71.612ms 6.90% 126.397ms 13.891us 10.863ms 9.84% 10.863ms 1.194us 9099
4206
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 9.586ms 8.69% 9.586ms 1.560us 6144
4207
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 8.308ms 7.53% 8.308ms 1.352us 6144
4208
- aten::remainder 2.81% 51.395ms 4.41% 80.796ms 13.150us 7.688ms 6.97% 7.688ms 1.251us 6144
4209
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 7.287ms 6.60% 7.287ms 1.186us 6144
4210
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 6.364ms 5.77% 6.364ms 1.077us 5910
4211
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 4.054ms 3.67% 4.054ms 1.320us 3072
4212
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 3.634ms 3.29% 3.634ms 1.183us 3072
4213
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 3.232ms 2.93% 3.232ms 1.014us 3186
4214
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4215
- Self CPU time total: 1.832s
4216
- Self CUDA time total: 110.351ms
4217
 
4218
 
4219
 
@@ -4223,29 +4223,29 @@ PROFILE TRACE: binned_torch | cuda_B4_S512_E2
4223
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4224
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4225
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4226
- binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 3.493s 1641.52% 3.493s 3.493s 1
4227
- binned_torch 23.72% 828.141ms 100.00% 3.492s 3.492s 0.000us 0.00% 212.777ms 212.777ms 1
4228
- Memcpy DtoH (Device -&gt; Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 63.619ms 29.90% 63.619ms 1.033us 61586
4229
- aten::item 1.76% 61.470ms 26.76% 934.319ms 15.171us 0.000us 0.00% 63.619ms 1.033us 61587
4230
- aten::_local_scalar_dense 5.95% 207.894ms 25.00% 872.849ms 14.173us 63.616ms 29.90% 63.619ms 1.033us 61587
4231
- aten::floor_divide 5.53% 193.077ms 13.34% 465.879ms 18.957us 31.606ms 14.86% 31.612ms 1.286us 24576
4232
- aten::bmm 0.01% 236.694us 0.01% 284.594us 47.432us 29.067ms 13.66% 29.067ms 4.844ms 6
4233
- ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 29.067ms 13.66% 29.067ms 4.844ms 6
4234
- aten::copy_ 3.89% 135.756ms 9.33% 325.881ms 13.254us 26.713ms 12.56% 26.714ms 1.086us 24588
4235
- Memcpy DtoD (Device -&gt; Device) 0.00% 0.000us 0.00% 0.000us 0.000us 26.711ms 12.55% 26.711ms 1.087us 24582
4236
- aten::mul 3.15% 110.066ms 5.73% 199.944ms 16.260us 25.593ms 12.03% 25.595ms 2.081us 12297
4237
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.131ms 10.40% 22.131ms 1.801us 12288
4238
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 17.009ms 7.99% 17.009ms 1.384us 12288
4239
- aten::add 2.82% 98.495ms 4.98% 173.932ms 14.014us 16.658ms 7.83% 16.659ms 1.342us 12411
4240
- aten::remainder 3.04% 106.037ms 4.77% 166.563ms 13.555us 15.433ms 7.25% 15.435ms 1.256us 12288
4241
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 14.597ms 6.86% 14.597ms 1.188us 12288
4242
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.527ms 6.36% 13.527ms 1.090us 12408
4243
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 8.132ms 3.82% 8.132ms 1.324us 6144
4244
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 7.300ms 3.43% 7.300ms 1.188us 6144
4245
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.623ms 1.23% 2.623ms 437.201us 6
4246
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4247
- Self CPU time total: 3.492s
4248
- Self CUDA time total: 212.763ms
4249
 
4250
 
4251
 
@@ -4255,29 +4255,29 @@ PROFILE TRACE: binned_torch | cuda_B4_S512_E4
4255
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4256
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4257
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4258
- binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 3.669s 1629.04% 3.669s 3.669s 1
4259
- binned_torch 23.71% 870.025ms 100.00% 3.670s 3.670s 0.000us 0.00% 225.217ms 225.217ms 1
4260
- aten::item 1.74% 63.801ms 26.98% 990.130ms 14.594us 0.000us 0.00% 69.736ms 1.028us 67845
4261
- aten::_local_scalar_dense 5.93% 217.737ms 25.24% 926.329ms 13.654us 69.731ms 30.96% 69.736ms 1.028us 67845
4262
- Memcpy DtoH (Device -&gt; Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 69.731ms 30.96% 69.731ms 1.028us 67841
4263
- aten::floor_divide 5.15% 189.112ms 12.36% 453.770ms 18.464us 31.523ms 14.00% 31.529ms 1.283us 24576
4264
- aten::bmm 0.01% 229.594us 0.01% 272.075us 45.346us 28.926ms 12.84% 28.926ms 4.821ms 6
4265
- ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 28.926ms 12.84% 28.926ms 4.821ms 6
4266
- aten::copy_ 3.90% 143.149ms 8.93% 327.628ms 13.325us 26.721ms 11.87% 26.722ms 1.087us 24588
4267
- Memcpy DtoD (Device -&gt; Device) 0.00% 0.000us 0.00% 0.000us 0.000us 26.719ms 11.86% 26.719ms 1.087us 24581
4268
- aten::mul 3.13% 114.822ms 5.47% 200.852ms 16.333us 25.594ms 11.37% 25.596ms 2.081us 12297
4269
- aten::add 3.87% 141.881ms 6.78% 248.742ms 13.345us 23.243ms 10.32% 23.243ms 1.247us 18639
4270
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.132ms 9.83% 22.132ms 1.801us 12288
4271
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.988ms 7.54% 16.988ms 1.383us 12287
4272
- aten::remainder 2.85% 104.729ms 4.42% 162.304ms 13.208us 15.354ms 6.82% 15.355ms 1.250us 12288
4273
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 14.535ms 6.45% 14.535ms 1.183us 12287
4274
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.676ms 6.07% 13.676ms 1.102us 12407
4275
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 8.096ms 3.60% 8.096ms 1.318us 6144
4276
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 7.258ms 3.22% 7.258ms 1.181us 6144
4277
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.475ms 2.88% 6.475ms 1.040us 6228
4278
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4279
- Self CPU time total: 3.670s
4280
- Self CUDA time total: 225.199ms
4281
 
4282
 
4283
 
@@ -4287,29 +4287,29 @@ PROFILE TRACE: binned_torch | cuda_B4_S1024_E2
4287
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4288
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4289
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4290
- binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 6.859s 1611.59% 6.859s 6.859s 1
4291
- binned_torch 24.10% 1.655s 100.00% 6.866s 6.866s 0.000us 0.00% 425.661ms 425.661ms 1
4292
- aten::item 1.68% 115.068ms 26.29% 1.805s 14.704us 0.000us 0.00% 127.116ms 1.035us 122763
4293
- aten::_local_scalar_dense 5.74% 393.879ms 24.61% 1.690s 13.764us 127.109ms 29.86% 127.116ms 1.035us 122763
4294
- Memcpy DtoH (Device -&gt; Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 127.110ms 29.86% 127.110ms 1.035us 122762
4295
- aten::floor_divide 5.46% 374.656ms 13.09% 898.826ms 18.287us 63.404ms 14.90% 63.408ms 1.290us 49152
4296
- aten::bmm 0.00% 234.973us 0.00% 276.793us 46.132us 56.971ms 13.38% 56.971ms 9.495ms 6
4297
- ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 56.971ms 13.38% 56.971ms 9.495ms 6
4298
- aten::copy_ 4.17% 286.167ms 9.49% 651.750ms 13.258us 53.615ms 12.60% 53.616ms 1.091us 49158
4299
- Memcpy DtoD (Device -&gt; Device) 0.00% 0.000us 0.00% 0.000us 0.000us 53.612ms 12.60% 53.612ms 1.091us 49154
4300
- aten::mul 3.34% 229.543ms 5.86% 402.465ms 16.370us 51.556ms 12.11% 51.561ms 2.097us 24585
4301
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 44.609ms 10.48% 44.609ms 1.815us 24576
4302
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 34.184ms 8.03% 34.184ms 1.391us 24576
4303
- aten::add 2.69% 184.813ms 4.71% 323.308ms 13.231us 33.584ms 7.89% 33.588ms 1.375us 24435
4304
- aten::remainder 3.06% 210.055ms 4.75% 326.044ms 13.267us 30.927ms 7.27% 30.931ms 1.259us 24576
4305
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 29.221ms 6.87% 29.221ms 1.189us 24576
4306
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 26.946ms 6.33% 26.946ms 1.103us 24431
4307
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.291ms 3.83% 16.291ms 1.326us 12288
4308
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 14.637ms 3.44% 14.637ms 1.191us 12288
4309
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 5.222ms 1.23% 5.222ms 870.407us 6
4310
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4311
- Self CPU time total: 6.866s
4312
- Self CUDA time total: 425.634ms
4313
 
4314
 
4315
 
@@ -4319,40 +4319,40 @@ PROFILE TRACE: binned_torch | cuda_B4_S1024_E4
4319
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4320
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4321
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4322
- binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 7.331s 1630.84% 7.331s 7.331s 1
4323
- binned_torch 23.92% 1.754s 100.00% 7.333s 7.333s 0.000us 0.00% 449.578ms 449.578ms 1
4324
- aten::item 1.73% 127.153ms 27.44% 2.013s 14.940us 0.000us 0.00% 139.264ms 1.034us 134715
4325
- aten::_local_scalar_dense 6.23% 456.926ms 25.71% 1.885s 13.996us 139.253ms 30.98% 139.264ms 1.034us 134715
4326
- Memcpy DtoH (Device -&gt; Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 139.255ms 30.98% 139.255ms 1.034us 134707
4327
- aten::floor_divide 5.02% 368.091ms 12.28% 900.843ms 18.328us 63.383ms 14.10% 63.388ms 1.290us 49152
4328
- ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 56.831ms 12.64% 56.831ms 9.472ms 6
4329
- aten::bmm 0.00% 231.002us 0.00% 273.424us 45.571us 56.831ms 12.64% 56.831ms 9.472ms 6
4330
- aten::copy_ 3.67% 268.957ms 8.71% 638.523ms 12.989us 53.771ms 11.96% 53.773ms 1.094us 49158
4331
- Memcpy DtoD (Device -&gt; Device) 0.00% 0.000us 0.00% 0.000us 0.000us 53.768ms 11.96% 53.768ms 1.094us 49149
4332
- aten::mul 2.96% 217.228ms 5.34% 391.576ms 15.927us 51.518ms 11.46% 51.524ms 2.096us 24585
4333
- aten::add 3.83% 280.607ms 6.79% 497.692ms 13.689us 45.514ms 10.12% 45.518ms 1.252us 36357
4334
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 44.542ms 9.91% 44.542ms 1.812us 24576
4335
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 34.127ms 7.59% 34.127ms 1.389us 24573
4336
- aten::remainder 2.85% 209.203ms 4.50% 330.314ms 13.441us 30.793ms 6.85% 30.795ms 1.253us 24576
4337
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 29.257ms 6.51% 29.257ms 1.191us 24573
4338
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 26.610ms 5.92% 26.610ms 1.089us 24431
4339
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.233ms 3.61% 16.233ms 1.321us 12288
4340
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 14.559ms 3.24% 14.559ms 1.185us 12288
4341
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 12.261ms 2.73% 12.261ms 1.028us 11922
4342
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4343
- Self CPU time total: 7.333s
4344
- Self CUDA time total: 449.542ms
4345
 
4346
 
4347
  impl wl p50(ms) ok
4348
- binned_torch cuda_B1_S1024_E2 367.62 True
4349
- binned_torch cuda_B1_S1024_E4 394.19 True
4350
- binned_torch cuda_B1_S512_E2 154.67 True
4351
- binned_torch cuda_B1_S512_E4 201.50 True
4352
- binned_torch cuda_B4_S1024_E2 1483.54 True
4353
- binned_torch cuda_B4_S1024_E4 1601.90 True
4354
- binned_torch cuda_B4_S512_E2 736.26 True
4355
- binned_torch cuda_B4_S512_E4 798.88 True
4356
  </pre></div>
4357
  <div class="cell-artifacts">
4358
  <h4>Artifacts:</h4>
 
3874
  <div class="system-info">
3875
  <div class="system-info-header">Generated on:</div>
3876
  <div class="system-info-content">
3877
+ Linux x86_64 | Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35
3878
  </div>
3879
  </div>
3880
 
 
3888
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3890
  </span> |
3891
+ Cell: nv | 0.25s
3892
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3894
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
 
3904
  </div>
3905
  </div>
3906
  <div id="output-nv" class="cell-output">
3907
+ <div class="cell-stdout"><pre class="stdout-text">Fri Dec 19 18:56:28 2025
3908
  +-----------------------------------------------------------------------------------------+
3909
+ | NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 |
3910
  +-----------------------------------------+------------------------+----------------------+
3911
  | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
3912
  | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
3913
  | | | MIG M. |
3914
  |=========================================+========================+======================|
3915
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3916
+ | N/A 34C P0 80W / 350W | 0MiB / 46068MiB | 41% Default |
3917
  | | | N/A |
3918
  +-----------------------------------------+------------------------+----------------------+
3919
 
 
3937
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3938
  <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3939
  </span> |
3940
+ Cell: benchmark | 730.34s
3941
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3942
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3943
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
4095
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4096
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4097
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4098
+ binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 919.007ms 1814.55% 919.007ms 919.007ms 1
4099
+ binned_torch 24.74% 227.809ms 100.00% 920.989ms 920.989ms 0.000us 0.00% 50.650ms 50.650ms 1
4100
+ aten::item 1.86% 17.169ms 26.20% 241.261ms 15.722us 0.000us 0.00% 15.873ms 1.034us 15345
4101
+ aten::_local_scalar_dense 5.94% 54.669ms 24.33% 224.092ms 14.604us 15.872ms 31.34% 15.873ms 1.034us 15345
4102
+ Memcpy DtoH (Device -&gt; Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 15.872ms 31.34% 15.872ms 1.034us 15345
4103
+ aten::floor_divide 5.47% 50.387ms 13.12% 120.822ms 19.665us 7.812ms 15.43% 7.812ms 1.272us 6144
4104
+ aten::bmm 0.02% 191.383us 0.03% 231.124us 38.521us 7.592ms 14.99% 7.592ms 1.265ms 6
4105
+ ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 7.592ms 14.99% 7.592ms 1.265ms 6
4106
+ aten::copy_ 3.61% 33.260ms 9.01% 82.984ms 13.480us 6.583ms 13.00% 6.585ms 1.070us 6156
4107
+ Memcpy DtoD (Device -&gt; Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.579ms 12.99% 6.579ms 1.069us 6153
4108
+ aten::mul 3.25% 29.933ms 5.69% 52.377ms 17.000us 4.706ms 9.29% 4.706ms 1.527us 3081
4109
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 4.478ms 8.84% 4.478ms 1.458us 3072
4110
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 4.159ms 8.21% 4.159ms 1.354us 3072
4111
+ aten::remainder 3.14% 28.956ms 4.78% 44.045ms 14.337us 3.839ms 7.58% 3.839ms 1.250us 3072
4112
+ aten::add 2.87% 26.444ms 4.82% 44.437ms 14.651us 3.761ms 7.43% 3.761ms 1.240us 3033
4113
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 3.655ms 7.22% 3.655ms 1.190us 3072
4114
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 3.365ms 6.64% 3.365ms 1.110us 3030
4115
  void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.023ms 3.99% 2.023ms 1.317us 1536
4116
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 1.816ms 3.58% 1.816ms 1.182us 1536
4117
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 287.650us 0.57% 287.650us 47.942us 6
4118
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4119
+ Self CPU time total: 920.998ms
4120
+ Self CUDA time total: 50.647ms
4121
 
4122
 
4123
 
 
4127
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4128
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4129
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4130
+ binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 934.694ms 1714.22% 934.694ms 934.694ms 1
4131
+ binned_torch 24.25% 226.767ms 100.00% 935.247ms 935.247ms 0.000us 0.00% 54.534ms 54.534ms 1
4132
+ aten::item 1.76% 16.424ms 27.79% 259.914ms 15.348us 0.000us 0.00% 17.987ms 1.062us 16935
4133
+ aten::_local_scalar_dense 6.05% 56.595ms 26.03% 243.490ms 14.378us 17.985ms 32.98% 17.987ms 1.062us 16935
4134
+ Memcpy DtoH (Device -&gt; Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 17.985ms 32.98% 17.985ms 1.062us 16935
4135
+ aten::floor_divide 5.13% 47.972ms 12.39% 115.852ms 18.856us 7.812ms 14.33% 7.813ms 1.272us 6144
4136
+ aten::bmm 0.02% 166.771us 0.02% 207.402us 34.567us 7.794ms 14.29% 7.794ms 1.299ms 6
4137
+ ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 7.794ms 14.29% 7.794ms 1.299ms 6
4138
+ aten::copy_ 3.47% 32.488ms 8.51% 79.554ms 12.923us 6.633ms 12.17% 6.635ms 1.078us 6156
4139
+ Memcpy DtoD (Device -&gt; Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.630ms 12.16% 6.630ms 1.078us 6153
4140
+ aten::add 4.14% 38.686ms 7.06% 65.992ms 14.368us 5.259ms 9.64% 5.259ms 1.145us 4593
4141
+ aten::mul 3.02% 28.215ms 5.35% 50.047ms 16.244us 4.701ms 8.62% 4.701ms 1.526us 3081
4142
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 4.474ms 8.21% 4.474ms 1.457us 3072
4143
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 4.157ms 7.62% 4.157ms 1.353us 3072
4144
+ aten::remainder 2.81% 26.265ms 4.43% 41.468ms 13.499us 3.852ms 7.06% 3.852ms 1.254us 3072
4145
  void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 3.655ms 6.70% 3.655ms 1.190us 3072
4146
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 3.270ms 6.00% 3.270ms 1.079us 3030
4147
  void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.030ms 3.72% 2.030ms 1.322us 1536
4148
  void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 1.822ms 3.34% 1.822ms 1.186us 1536
4149
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 1.584ms 2.91% 1.584ms 1.015us 1560
4150
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4151
+ Self CPU time total: 935.255ms
4152
+ Self CUDA time total: 54.526ms
4153
 
4154
 
4155
 
 
4159
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4160
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4161
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4162
+ binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 1.775s 1705.66% 1.775s 1.775s 1
4163
+ binned_torch 24.39% 432.670ms 100.00% 1.774s 1.774s 0.000us 0.00% 104.087ms 104.087ms 1
4164
+ aten::item 1.67% 29.627ms 26.26% 465.825ms 15.266us 0.000us 0.00% 31.856ms 1.044us 30513
4165
+ aten::_local_scalar_dense 5.88% 104.231ms 24.59% 436.198ms 14.295us 31.854ms 30.61% 31.856ms 1.044us 30513
4166
+ Memcpy DtoH (Device -&gt; Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 31.854ms 30.61% 31.854ms 1.044us 30513
4167
+ aten::floor_divide 5.49% 97.404ms 13.46% 238.769ms 19.431us 15.611ms 15.00% 15.612ms 1.270us 12288
4168
+ aten::bmm 0.01% 215.332us 0.01% 258.864us 43.144us 15.009ms 14.42% 15.009ms 2.502ms 6
4169
+ ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 15.009ms 14.42% 15.009ms 2.502ms 6
4170
+ aten::copy_ 3.73% 66.187ms 9.04% 160.371ms 13.038us 13.330ms 12.81% 13.331ms 1.084us 12300
4171
+ Memcpy DtoD (Device -&gt; Device) 0.00% 0.000us 0.00% 0.000us 0.000us 13.326ms 12.80% 13.326ms 1.084us 12294
4172
+ aten::mul 3.16% 56.128ms 5.72% 101.496ms 16.495us 11.275ms 10.83% 11.277ms 1.833us 6153
4173
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 9.921ms 9.53% 9.921ms 1.615us 6144
4174
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 8.311ms 7.99% 8.311ms 1.353us 6144
4175
+ aten::remainder 3.23% 57.334ms 5.09% 90.371ms 14.709us 7.676ms 7.38% 7.678ms 1.250us 6144
4176
+ aten::add 2.88% 51.067ms 5.02% 88.987ms 15.049us 7.641ms 7.34% 7.642ms 1.292us 5913
4177
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 7.300ms 7.01% 7.300ms 1.188us 6144
4178
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 6.359ms 6.11% 6.359ms 1.076us 5910
4179
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 4.045ms 3.89% 4.045ms 1.317us 3072
4180
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 3.632ms 3.49% 3.632ms 1.182us 3072
4181
+ aten::clamp 0.00% 74.963us 0.01% 122.824us 20.471us 1.191ms 1.14% 1.191ms 198.444us 6
4182
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4183
+ Self CPU time total: 1.774s
4184
+ Self CUDA time total: 104.078ms
4185
 
4186
 
4187
 
 
4191
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4192
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4193
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4194
+ binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 1.943s 1756.79% 1.943s 1.943s 1
4195
+ binned_torch 24.29% 471.728ms 100.00% 1.942s 1.942s 0.000us 0.00% 110.592ms 110.592ms 1
4196
+ aten::item 1.62% 31.476ms 26.94% 523.166ms 15.511us 0.000us 0.00% 35.330ms 1.047us 33729
4197
+ aten::_local_scalar_dense 6.11% 118.659ms 25.32% 491.691ms 14.578us 35.327ms 31.95% 35.330ms 1.047us 33729
4198
+ Memcpy DtoH (Device -&gt; Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 35.327ms 31.95% 35.327ms 1.047us 33728
4199
+ aten::floor_divide 5.19% 100.816ms 12.43% 241.273ms 19.635us 15.609ms 14.12% 15.611ms 1.270us 12288
4200
+ aten::bmm 0.01% 222.165us 0.01% 267.105us 44.517us 15.085ms 13.64% 15.085ms 2.514ms 6
4201
+ ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 15.085ms 13.64% 15.085ms 2.514ms 6
4202
+ aten::copy_ 3.60% 69.833ms 8.76% 170.090ms 13.828us 13.355ms 12.08% 13.357ms 1.086us 12300
4203
+ Memcpy DtoD (Device -&gt; Device) 0.00% 0.000us 0.00% 0.000us 0.000us 13.353ms 12.07% 13.353ms 1.086us 12294
4204
+ aten::mul 2.94% 57.042ms 5.32% 103.331ms 16.794us 10.942ms 9.89% 10.942ms 1.778us 6153
4205
+ aten::add 3.88% 75.326ms 6.94% 134.721ms 14.806us 10.866ms 9.83% 10.866ms 1.194us 9099
4206
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 9.591ms 8.67% 9.591ms 1.561us 6144
4207
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 8.314ms 7.52% 8.314ms 1.353us 6144
4208
+ aten::remainder 2.77% 53.827ms 4.45% 86.321ms 14.050us 7.697ms 6.96% 7.697ms 1.253us 6144
4209
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 7.295ms 6.60% 7.295ms 1.187us 6144
4210
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 6.370ms 5.76% 6.370ms 1.078us 5910
4211
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 4.058ms 3.67% 4.058ms 1.321us 3072
4212
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 3.639ms 3.29% 3.639ms 1.185us 3072
4213
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 3.234ms 2.92% 3.234ms 1.015us 3186
4214
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4215
+ Self CPU time total: 1.942s
4216
+ Self CUDA time total: 110.585ms
4217
 
4218
 
4219
 
 
4223
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4224
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4225
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4226
+ binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 3.554s 1668.92% 3.554s 3.554s 1
4227
+ binned_torch 24.03% 852.954ms 100.00% 3.549s 3.549s 0.000us 0.00% 212.979ms 212.979ms 1
4228
+ Memcpy DtoH (Device -&gt; Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 63.933ms 30.02% 63.933ms 1.038us 61586
4229
+ aten::item 1.68% 59.518ms 26.66% 946.248ms 15.364us 0.000us 0.00% 63.933ms 1.038us 61587
4230
+ aten::_local_scalar_dense 6.15% 218.157ms 24.98% 886.634ms 14.396us 63.932ms 30.02% 63.933ms 1.038us 61587
4231
+ aten::floor_divide 5.36% 190.145ms 13.28% 471.339ms 19.179us 31.621ms 14.85% 31.623ms 1.287us 24576
4232
+ aten::bmm 0.01% 230.233us 0.01% 275.904us 45.984us 28.855ms 13.55% 28.855ms 4.809ms 6
4233
+ ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 28.855ms 13.55% 28.855ms 4.809ms 6
4234
+ aten::copy_ 3.84% 136.428ms 9.38% 333.073ms 13.546us 26.747ms 12.56% 26.749ms 1.088us 24588
4235
+ Memcpy DtoD (Device -&gt; Device) 0.00% 0.000us 0.00% 0.000us 0.000us 26.744ms 12.56% 26.744ms 1.088us 24582
4236
+ aten::mul 3.20% 113.415ms 5.79% 205.629ms 16.722us 25.614ms 12.03% 25.614ms 2.083us 12297
4237
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.161ms 10.41% 22.161ms 1.803us 12288
4238
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 17.018ms 7.99% 17.018ms 1.385us 12288
4239
+ aten::add 2.93% 103.833ms 5.19% 184.217ms 14.843us 16.665ms 7.83% 16.666ms 1.343us 12411
4240
+ aten::remainder 3.13% 110.979ms 5.01% 177.878ms 14.476us 15.442ms 7.25% 15.444ms 1.257us 12288
4241
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 14.607ms 6.86% 14.607ms 1.189us 12288
4242
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.543ms 6.36% 13.543ms 1.091us 12408
4243
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 8.136ms 3.82% 8.136ms 1.324us 6144
4244
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 7.305ms 3.43% 7.305ms 1.189us 6144
4245
+ aten::clamp 0.00% 80.604us 0.00% 131.123us 21.854us 2.608ms 1.22% 2.608ms 434.678us 6
4246
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4247
+ Self CPU time total: 3.549s
4248
+ Self CUDA time total: 212.971ms
4249
 
4250
 
4251
 
 
4255
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4256
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4257
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4258
+ binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 3.834s 1701.16% 3.834s 3.834s 1
4259
+ binned_torch 23.91% 917.039ms 100.00% 3.836s 3.836s 0.000us 0.00% 225.394ms 225.394ms 1
4260
+ aten::item 1.70% 65.086ms 27.21% 1.044s 15.386us 0.000us 0.00% 70.210ms 1.035us 67845
4261
+ aten::_local_scalar_dense 6.32% 242.356ms 25.52% 978.758ms 14.426us 70.207ms 31.15% 70.210ms 1.035us 67845
4262
+ Memcpy DtoH (Device -&gt; Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 70.207ms 31.15% 70.207ms 1.035us 67840
4263
+ aten::floor_divide 5.09% 195.347ms 12.48% 478.676ms 19.477us 31.474ms 13.97% 31.481ms 1.281us 24576
4264
+ ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 28.832ms 12.79% 28.832ms 4.805ms 6
4265
+ aten::bmm 0.01% 227.473us 0.01% 274.364us 45.727us 28.832ms 12.79% 28.832ms 4.805ms 6
4266
+ aten::copy_ 3.61% 138.479ms 8.82% 338.314ms 13.759us 26.687ms 11.84% 26.689ms 1.085us 24588
4267
+ Memcpy DtoD (Device -&gt; Device) 0.00% 0.000us 0.00% 0.000us 0.000us 26.685ms 11.84% 26.685ms 1.086us 24581
4268
+ aten::mul 2.97% 113.735ms 5.38% 206.436ms 16.787us 25.537ms 11.33% 25.539ms 2.077us 12297
4269
+ aten::add 4.18% 160.247ms 7.41% 284.235ms 15.249us 23.217ms 10.30% 23.217ms 1.246us 18639
4270
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.084ms 9.80% 22.084ms 1.797us 12288
4271
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.963ms 7.53% 16.963ms 1.381us 12287
4272
+ aten::remainder 2.89% 110.779ms 4.66% 178.579ms 14.533us 15.327ms 6.80% 15.329ms 1.247us 12288
4273
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 14.512ms 6.44% 14.512ms 1.181us 12287
4274
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.655ms 6.06% 13.655ms 1.101us 12407
4275
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 8.083ms 3.59% 8.083ms 1.316us 6144
4276
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 7.244ms 3.21% 7.244ms 1.179us 6144
4277
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.461ms 2.87% 6.461ms 1.037us 6228
4278
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4279
+ Self CPU time total: 3.836s
4280
+ Self CUDA time total: 225.376ms
4281
 
4282
 
4283
 
 
4287
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4288
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4289
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4290
+ binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 7.307s 1714.16% 7.307s 7.307s 1
4291
+ binned_torch 24.10% 1.762s 100.00% 7.313s 7.313s 0.000us 0.00% 426.284ms 426.284ms 1
4292
+ aten::item 1.74% 126.959ms 26.39% 1.930s 15.721us 0.000us 0.00% 128.245ms 1.045us 122763
4293
+ aten::_local_scalar_dense 6.22% 454.984ms 24.65% 1.803s 14.685us 128.239ms 30.08% 128.245ms 1.045us 122763
4294
+ Memcpy DtoH (Device -&gt; Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 128.241ms 30.08% 128.241ms 1.045us 122762
4295
+ aten::floor_divide 5.53% 404.463ms 13.23% 967.808ms 19.690us 63.393ms 14.87% 63.393ms 1.290us 49152
4296
+ aten::bmm 0.00% 234.623us 0.00% 278.223us 46.371us 56.525ms 13.26% 56.525ms 9.421ms 6
4297
+ ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 56.525ms 13.26% 56.525ms 9.421ms 6
4298
+ aten::copy_ 4.05% 295.852ms 9.44% 690.402ms 14.045us 53.639ms 12.58% 53.640ms 1.091us 49158
4299
+ Memcpy DtoD (Device -&gt; Device) 0.00% 0.000us 0.00% 0.000us 0.000us 53.636ms 12.58% 53.636ms 1.091us 49154
4300
+ aten::mul 3.24% 237.068ms 5.73% 419.319ms 17.056us 51.499ms 12.08% 51.504ms 2.095us 24585
4301
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 44.577ms 10.46% 44.577ms 1.814us 24576
4302
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 34.181ms 8.02% 34.181ms 1.391us 24576
4303
+ aten::add 2.92% 213.232ms 5.07% 370.760ms 15.173us 33.603ms 7.88% 33.606ms 1.375us 24435
4304
+ aten::remainder 3.14% 229.281ms 5.03% 367.714ms 14.962us 30.916ms 7.25% 30.921ms 1.258us 24576
4305
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 29.214ms 6.85% 29.214ms 1.189us 24576
4306
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 26.954ms 6.32% 26.954ms 1.103us 24431
4307
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.285ms 3.82% 16.285ms 1.325us 12288
4308
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 14.630ms 3.43% 14.630ms 1.191us 12288
4309
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 5.208ms 1.22% 5.208ms 868.029us 6
4310
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4311
+ Self CPU time total: 7.313s
4312
+ Self CUDA time total: 426.263ms
4313
 
4314
 
4315
 
 
4319
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4320
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4321
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4322
+ binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 7.520s 1665.26% 7.520s 7.520s 1
4323
+ binned_torch 23.83% 1.792s 100.00% 7.522s 7.522s 0.000us 0.00% 451.603ms 451.603ms 1
4324
+ aten::item 1.82% 136.877ms 27.31% 2.054s 15.246us 0.000us 0.00% 140.837ms 1.045us 134715
4325
+ aten::_local_scalar_dense 6.26% 471.062ms 25.49% 1.917s 14.230us 140.825ms 31.19% 140.837ms 1.045us 134715
4326
+ Memcpy DtoH (Device -&gt; Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 140.826ms 31.19% 140.826ms 1.045us 134706
4327
+ aten::floor_divide 5.15% 387.087ms 12.45% 936.766ms 19.059us 63.494ms 14.06% 63.499ms 1.292us 49152
4328
+ aten::bmm 0.00% 222.563us 0.00% 265.513us 44.252us 56.696ms 12.56% 56.696ms 9.449ms 6
4329
+ ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 56.696ms 12.56% 56.696ms 9.449ms 6
4330
+ aten::copy_ 3.71% 279.306ms 8.85% 665.315ms 13.534us 53.897ms 11.94% 53.900ms 1.096us 49158
4331
+ Memcpy DtoD (Device -&gt; Device) 0.00% 0.000us 0.00% 0.000us 0.000us 53.894ms 11.94% 53.894ms 1.097us 49149
4332
+ aten::mul 3.04% 228.311ms 5.39% 405.691ms 16.502us 51.688ms 11.45% 51.695ms 2.103us 24585
4333
+ aten::add 4.00% 300.523ms 6.98% 525.049ms 14.441us 45.565ms 10.09% 45.568ms 1.253us 36357
4334
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 44.621ms 9.88% 44.621ms 1.816us 24576
4335
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 34.193ms 7.57% 34.193ms 1.391us 24573
4336
+ aten::remainder 2.86% 215.282ms 4.58% 344.226ms 14.007us 30.855ms 6.83% 30.857ms 1.256us 24576
4337
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 29.302ms 6.49% 29.302ms 1.192us 24573
4338
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 26.656ms 5.90% 26.656ms 1.091us 24431
4339
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.266ms 3.60% 16.266ms 1.324us 12288
4340
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 14.588ms 3.23% 14.588ms 1.187us 12288
4341
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 12.278ms 2.72% 12.278ms 1.030us 11922
4342
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4343
+ Self CPU time total: 7.522s
4344
+ Self CUDA time total: 451.562ms
4345
 
4346
 
4347
  impl wl p50(ms) ok
4348
+ binned_torch cuda_B1_S1024_E2 383.31 True
4349
+ binned_torch cuda_B1_S1024_E4 421.42 True
4350
+ binned_torch cuda_B1_S512_E2 157.73 True
4351
+ binned_torch cuda_B1_S512_E4 204.82 True
4352
+ binned_torch cuda_B4_S1024_E2 1513.71 True
4353
+ binned_torch cuda_B4_S1024_E4 1658.74 True
4354
+ binned_torch cuda_B4_S512_E2 773.70 True
4355
+ binned_torch cuda_B4_S512_E4 840.01 True
4356
  </pre></div>
4357
  <div class="cell-artifacts">
4358
  <h4>Artifacts:</h4>
openai_moe/impls/gpt_oss_moe.html CHANGED
@@ -3874,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
3874
  <div class="system-info">
3875
  <div class="system-info-header">Generated on:</div>
3876
  <div class="system-info-content">
3877
- Linux x86_64 | Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35
3878
  </div>
3879
  </div>
3880
 
@@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content {
3888
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3890
  </span> |
3891
- Cell: nv | 0.22s
3892
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3894
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3905,16 +3905,16 @@ Cell: nv | 0.22s
3905
  </div>
3906
  </div>
3907
  <div id="output-nv" class="cell-output">
3908
- <div class="cell-stdout"><pre class="stdout-text">Mon Nov 10 21:58:43 2025
3909
  +-----------------------------------------------------------------------------------------+
3910
- | NVIDIA-SMI 580.95.05 Driver Version: 580.95.05 CUDA Version: 13.0 |
3911
  +-----------------------------------------+------------------------+----------------------+
3912
  | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
3913
  | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
3914
  | | | MIG M. |
3915
  |=========================================+========================+======================|
3916
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3917
- | N/A 31C P0 78W / 350W | 0MiB / 46068MiB | 17% Default |
3918
  | | | N/A |
3919
  +-----------------------------------------+------------------------+----------------------+
3920
 
@@ -3938,7 +3938,7 @@ Cell: nv | 0.22s
3938
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3939
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3940
  </span> |
3941
- Cell: benchmark | 25.04s
3942
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3943
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3944
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -4042,29 +4042,29 @@ PROFILE TRACE: gpt_oss_experts | cuda_B1_S512_E2
4042
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4043
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4044
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4045
- gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 10.360ms 190.98% 10.360ms 10.360ms 1
4046
- gpt_oss_experts 15.12% 1.924ms 99.94% 12.713ms 12.713ms 0.000us 0.00% 5.428ms 5.428ms 1
4047
- aten::matmul 0.18% 22.311us 3.73% 473.846us 39.487us 0.000us 0.00% 4.800ms 400.041us 12
4048
- aten::mm 2.34% 297.100us 3.55% 451.535us 37.628us 4.800ms 88.50% 4.800ms 400.041us 12
4049
- ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 3.258ms 60.07% 3.258ms 362.028us 9
4050
- void cutlass::Kernel2&lt;cutlass_80_simt_sgemm_128x64_8... 0.00% 0.000us 0.00% 0.000us 0.000us 1.536ms 28.31% 1.536ms 511.862us 3
4051
- aten::mul 1.29% 163.978us 2.14% 271.630us 11.318us 109.411us 2.02% 109.411us 4.559us 24
4052
- aten::add 1.51% 192.130us 3.80% 483.423us 26.857us 103.358us 1.91% 103.358us 5.742us 18
4053
- aten::index 1.52% 193.374us 2.62% 333.164us 27.764us 88.224us 1.63% 88.224us 7.352us 12
4054
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 80.864us 1.49% 80.864us 6.739us 12
4055
- aten::index_add_ 0.46% 58.130us 0.76% 97.241us 16.207us 80.064us 1.48% 80.064us 13.344us 6
4056
- void at::native::indexFuncLargeIndex&lt;float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 80.064us 1.48% 80.064us 13.344us 6
4057
- aten::nonzero 2.05% 260.439us 6.29% 799.492us 88.832us 65.278us 1.20% 76.093us 8.455us 9
4058
- aten::clamp 0.99% 126.442us 1.60% 203.852us 16.988us 63.456us 1.17% 63.456us 5.288us 12
4059
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 63.456us 1.17% 63.456us 5.288us 12
4060
- aten::where 0.06% 7.391us 5.01% 637.190us 106.198us 0.000us 0.00% 61.533us 10.256us 6
4061
- aten::nonzero_numpy 0.09% 11.880us 4.95% 629.799us 104.967us 0.000us 0.00% 61.533us 10.256us 6
4062
- void at::native::vectorized_gather_kernel&lt;16, long&gt;(... 0.00% 0.000us 0.00% 0.000us 0.000us 60.544us 1.12% 60.544us 10.091us 6
4063
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 56.929us 1.05% 56.929us 4.744us 12
4064
- Memcpy DtoH (Device -&gt; Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 51.073us 0.94% 51.073us 1.135us 45
4065
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4066
- Self CPU time total: 12.720ms
4067
- Self CUDA time total: 5.425ms
4068
 
4069
 
4070
 
@@ -4074,29 +4074,29 @@ PROFILE TRACE: gpt_oss_experts | cuda_B1_S512_E4
4074
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4075
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4076
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4077
- gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 13.942ms 218.38% 13.942ms 13.942ms 1
4078
- gpt_oss_experts 15.57% 2.499ms 99.97% 16.048ms 16.048ms 0.000us 0.00% 6.387ms 6.387ms 1
4079
- aten::matmul 0.25% 39.461us 4.79% 769.170us 32.049us 0.000us 0.00% 5.570ms 232.102us 24
4080
- aten::mm 2.77% 444.894us 4.55% 729.709us 30.405us 5.570ms 87.25% 5.570ms 232.102us 24
4081
- ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 5.515ms 86.38% 5.515ms 229.794us 24
4082
- aten::nonzero 2.34% 374.919us 7.60% 1.220ms 81.308us 114.786us 1.80% 137.349us 9.157us 15
4083
- aten::mul 1.86% 298.668us 3.09% 496.508us 10.344us 131.614us 2.06% 131.614us 2.742us 48
4084
- aten::add 2.06% 330.439us 3.47% 556.980us 15.472us 127.904us 2.00% 127.904us 3.553us 36
4085
- aten::where 0.07% 11.120us 7.17% 1.151ms 95.939us 0.000us 0.00% 123.109us 10.259us 12
4086
- aten::nonzero_numpy 0.13% 20.771us 7.10% 1.140ms 95.012us 0.000us 0.00% 123.109us 10.259us 12
4087
- aten::index 2.15% 344.365us 3.72% 597.667us 24.903us 111.391us 1.74% 111.391us 4.641us 24
4088
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 101.985us 1.60% 101.985us 4.249us 24
4089
- Memcpy DtoH (Device -&gt; Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 91.395us 1.43% 91.395us 1.051us 87
4090
- aten::clamp 1.30% 208.833us 2.21% 355.215us 14.801us 88.257us 1.38% 88.257us 3.677us 24
4091
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 88.257us 1.38% 88.257us 3.677us 24
4092
- aten::item 0.49% 78.042us 39.66% 6.367ms 88.433us 0.000us 0.00% 75.297us 1.046us 72
4093
- aten::_local_scalar_dense 1.92% 308.797us 39.18% 6.289ms 87.349us 75.297us 1.18% 75.297us 1.046us 72
4094
- aten::index_add_ 0.59% 94.029us 0.99% 158.640us 13.220us 71.454us 1.12% 71.454us 5.954us 12
4095
- void at::native::indexFuncLargeIndex&lt;float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 71.454us 1.12% 71.454us 5.954us 12
4096
- void at::native::vectorized_gather_kernel&lt;16, long&gt;(... 0.00% 0.000us 0.00% 0.000us 0.000us 66.271us 1.04% 66.271us 5.523us 12
4097
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4098
- Self CPU time total: 16.053ms
4099
- Self CUDA time total: 6.384ms
4100
 
4101
 
4102
 
@@ -4106,29 +4106,29 @@ PROFILE TRACE: gpt_oss_experts | cuda_B1_S1024_E2
4106
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4107
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4108
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4109
- gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 12.597ms 146.28% 12.597ms 12.597ms 1
4110
- gpt_oss_experts 11.26% 1.671ms 99.96% 14.835ms 14.835ms 0.000us 0.00% 8.616ms 8.616ms 1
4111
- aten::matmul 0.13% 19.980us 2.85% 423.596us 35.300us 0.000us 0.00% 7.614ms 634.486us 12
4112
- aten::mm 1.70% 251.563us 2.72% 403.616us 33.635us 7.614ms 88.42% 7.614ms 634.486us 12
4113
- void cutlass::Kernel2&lt;cutlass_80_simt_sgemm_256x128_... 0.00% 0.000us 0.00% 0.000us 0.000us 4.628ms 53.74% 4.628ms 771.312us 6
4114
- ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 1.524ms 17.70% 1.524ms 508.107us 3
4115
- void cutlass::Kernel2&lt;cutlass_80_simt_sgemm_128x64_8... 0.00% 0.000us 0.00% 0.000us 0.000us 1.455ms 16.90% 1.455ms 485.046us 3
4116
- aten::mul 1.00% 148.488us 1.71% 253.960us 10.582us 188.737us 2.19% 188.737us 7.864us 24
4117
- aten::add 1.14% 169.821us 1.97% 292.395us 16.244us 180.606us 2.10% 180.606us 10.034us 18
4118
- aten::index_add_ 0.32% 47.691us 0.57% 84.001us 14.000us 164.000us 1.90% 164.000us 27.333us 6
4119
- void at::native::indexFuncLargeIndex&lt;float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 164.000us 1.90% 164.000us 27.333us 6
4120
- aten::index 1.23% 181.951us 2.12% 314.145us 26.179us 144.608us 1.68% 144.608us 12.051us 12
4121
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 142.815us 1.66% 142.815us 11.901us 12
4122
- void at::native::vectorized_gather_kernel&lt;16, long&gt;(... 0.00% 0.000us 0.00% 0.000us 0.000us 114.816us 1.33% 114.816us 19.136us 6
4123
- aten::clamp 0.72% 107.083us 1.24% 184.134us 15.345us 106.818us 1.24% 106.818us 8.902us 12
4124
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 106.818us 1.24% 106.818us 8.902us 12
4125
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 100.513us 1.17% 100.513us 8.376us 12
4126
- aten::nonzero 1.51% 224.830us 4.84% 718.263us 79.807us 68.894us 0.80% 80.029us 8.892us 9
4127
- aten::where 0.04% 5.681us 3.95% 586.411us 97.735us 0.000us 0.00% 65.405us 10.901us 6
4128
- aten::nonzero_numpy 0.07% 10.160us 3.91% 580.730us 96.788us 0.000us 0.00% 65.405us 10.901us 6
4129
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4130
- Self CPU time total: 14.841ms
4131
- Self CUDA time total: 8.611ms
4132
 
4133
 
4134
 
@@ -4138,29 +4138,29 @@ PROFILE TRACE: gpt_oss_experts | cuda_B1_S1024_E4
4138
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4139
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4140
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4141
- gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 18.460ms 171.74% 18.460ms 18.460ms 1
4142
- gpt_oss_experts 12.58% 2.618ms 99.97% 20.806ms 20.806ms 0.000us 0.00% 10.754ms 10.754ms 1
4143
- aten::matmul 0.19% 39.724us 3.85% 801.313us 33.388us 0.000us 0.00% 9.496ms 395.681us 24
4144
- aten::mm 2.21% 460.813us 3.66% 761.589us 31.733us 9.496ms 88.35% 9.496ms 395.681us 24
4145
- ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 6.491ms 60.39% 6.491ms 360.603us 18
4146
- void cutlass::Kernel2&lt;cutlass_80_simt_sgemm_128x64_8... 0.00% 0.000us 0.00% 0.000us 0.000us 2.993ms 27.84% 2.993ms 498.774us 6
4147
- aten::mul 2.25% 467.369us 3.28% 683.452us 14.239us 226.014us 2.10% 226.014us 4.709us 48
4148
- aten::add 1.60% 332.210us 2.74% 569.351us 15.815us 207.013us 1.93% 207.013us 5.750us 36
4149
- aten::index 1.72% 357.427us 2.99% 622.664us 25.944us 203.329us 1.89% 203.329us 8.472us 24
4150
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 162.243us 1.51% 162.243us 6.760us 24
4151
- aten::index_add_ 0.45% 94.395us 0.78% 161.485us 13.457us 155.167us 1.44% 155.167us 12.931us 12
4152
- void at::native::indexFuncLargeIndex&lt;float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 155.167us 1.44% 155.167us 12.931us 12
4153
- aten::nonzero 1.86% 386.184us 6.07% 1.263ms 84.202us 120.989us 1.13% 144.894us 9.660us 15
4154
- void at::native::vectorized_gather_kernel&lt;16, long&gt;(... 0.00% 0.000us 0.00% 0.000us 0.000us 144.769us 1.35% 144.769us 12.064us 12
4155
- aten::where 0.05% 10.779us 5.71% 1.188ms 99.031us 0.000us 0.00% 130.270us 10.856us 12
4156
- aten::nonzero_numpy 0.10% 20.452us 5.66% 1.178ms 98.133us 0.000us 0.00% 130.270us 10.856us 12
4157
- aten::clamp 1.04% 217.185us 1.79% 373.407us 15.559us 129.252us 1.20% 129.252us 5.386us 24
4158
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 129.252us 1.20% 129.252us 5.386us 24
4159
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 115.584us 1.08% 115.584us 4.816us 24
4160
- Memcpy DtoH (Device -&gt; Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 107.234us 1.00% 107.234us 1.233us 87
4161
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4162
- Self CPU time total: 20.812ms
4163
- Self CUDA time total: 10.749ms
4164
 
4165
 
4166
 
@@ -4170,29 +4170,29 @@ PROFILE TRACE: gpt_oss_experts | cuda_B4_S512_E2
4170
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4171
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4172
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4173
- gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 21.083ms 119.21% 21.083ms 21.083ms 1
4174
- gpt_oss_experts 7.12% 1.665ms 99.98% 23.365ms 23.365ms 0.000us 0.00% 17.695ms 17.695ms 1
4175
- aten::matmul 0.09% 20.129us 1.89% 441.429us 36.786us 0.000us 0.00% 14.828ms 1.236ms 12
4176
- aten::mm 1.11% 260.517us 1.80% 421.300us 35.108us 14.828ms 83.84% 14.828ms 1.236ms 12
4177
- void cutlass::Kernel2&lt;cutlass_80_simt_sgemm_256x128_... 0.00% 0.000us 0.00% 0.000us 0.000us 9.047ms 51.15% 9.047ms 1.508ms 6
4178
- ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 5.773ms 32.64% 5.773ms 962.167us 6
4179
- aten::add 0.74% 174.025us 1.27% 296.156us 16.453us 776.579us 4.39% 776.579us 43.143us 18
4180
- aten::mul 0.64% 149.555us 1.10% 257.226us 10.718us 654.338us 3.70% 654.338us 27.264us 24
4181
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 499.874us 2.83% 499.874us 41.656us 12
4182
- aten::index_add_ 0.21% 48.400us 0.36% 84.241us 14.040us 449.985us 2.54% 449.985us 74.998us 6
4183
- void at::native::indexFuncLargeIndex&lt;float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 449.985us 2.54% 449.985us 74.998us 6
4184
- aten::clamp 0.46% 107.321us 0.79% 185.253us 15.438us 329.054us 1.86% 329.054us 27.421us 12
4185
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 329.054us 1.86% 329.054us 27.421us 12
4186
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 300.737us 1.70% 300.737us 50.123us 6
4187
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 276.705us 1.56% 276.705us 46.117us 6
4188
- aten::index 0.76% 178.051us 1.32% 309.462us 25.788us 268.800us 1.52% 268.800us 22.400us 12
4189
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 253.889us 1.44% 253.889us 21.157us 12
4190
- void at::native::vectorized_gather_kernel&lt;16, long&gt;(... 0.00% 0.000us 0.00% 0.000us 0.000us 236.095us 1.33% 236.095us 39.349us 6
4191
- aten::sigmoid 0.16% 36.571us 0.27% 63.572us 10.595us 176.833us 1.00% 176.833us 29.472us 6
4192
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 176.833us 1.00% 176.833us 29.472us 6
4193
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4194
- Self CPU time total: 23.371ms
4195
- Self CUDA time total: 17.686ms
4196
 
4197
 
4198
 
@@ -4202,29 +4202,29 @@ PROFILE TRACE: gpt_oss_experts | cuda_B4_S512_E4
4202
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4203
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4204
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4205
- gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 24.709ms 139.35% 24.709ms 24.709ms 1
4206
- gpt_oss_experts 9.76% 2.650ms 99.98% 27.156ms 27.156ms 0.000us 0.00% 17.741ms 17.741ms 1
4207
- aten::matmul 0.15% 40.162us 3.17% 860.144us 35.839us 0.000us 0.00% 15.537ms 647.383us 24
4208
- aten::mm 1.90% 517.331us 3.02% 819.982us 34.166us 15.537ms 87.63% 15.537ms 647.383us 24
4209
- void cutlass::Kernel2&lt;cutlass_80_simt_sgemm_256x128_... 0.00% 0.000us 0.00% 0.000us 0.000us 9.352ms 52.74% 9.352ms 779.317us 12
4210
- ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 3.225ms 18.19% 3.225ms 537.452us 6
4211
- void cutlass::Kernel2&lt;cutlass_80_simt_sgemm_128x64_8... 0.00% 0.000us 0.00% 0.000us 0.000us 2.947ms 16.62% 2.947ms 491.169us 6
4212
- aten::add 1.29% 349.077us 2.22% 601.999us 16.722us 419.552us 2.37% 419.552us 11.654us 36
4213
- aten::mul 1.15% 311.953us 1.98% 539.014us 11.229us 410.371us 2.31% 410.371us 8.549us 48
4214
- aten::index_add_ 0.36% 97.270us 0.61% 164.412us 13.701us 379.682us 2.14% 379.682us 31.640us 12
4215
- void at::native::indexFuncLargeIndex&lt;float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 379.682us 2.14% 379.682us 31.640us 12
4216
- aten::index 1.31% 354.897us 2.36% 641.129us 26.714us 344.639us 1.94% 344.639us 14.360us 24
4217
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 337.056us 1.90% 337.056us 14.044us 24
4218
- void at::native::vectorized_gather_kernel&lt;16, long&gt;(... 0.00% 0.000us 0.00% 0.000us 0.000us 280.607us 1.58% 280.607us 23.384us 12
4219
- aten::clamp 0.78% 212.661us 1.36% 368.626us 15.359us 225.662us 1.27% 225.662us 9.403us 24
4220
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 225.662us 1.27% 225.662us 9.403us 24
4221
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 218.112us 1.23% 218.112us 9.088us 24
4222
- aten::nonzero 1.41% 383.824us 4.68% 1.271ms 84.702us 127.715us 0.72% 153.604us 10.240us 15
4223
- aten::where 0.04% 11.073us 4.43% 1.203ms 100.252us 0.000us 0.00% 138.052us 11.504us 12
4224
- aten::nonzero_numpy 0.07% 20.230us 4.39% 1.192ms 99.329us 0.000us 0.00% 138.052us 11.504us 12
4225
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4226
- Self CPU time total: 27.162ms
4227
- Self CUDA time total: 17.731ms
4228
 
4229
 
4230
 
@@ -4234,29 +4234,29 @@ PROFILE TRACE: gpt_oss_experts | cuda_B4_S1024_E2
4234
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4235
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4236
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4237
- gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 40.750ms 109.03% 40.750ms 40.750ms 1
4238
- gpt_oss_experts 4.08% 1.695ms 99.82% 41.512ms 41.512ms 0.000us 0.00% 37.407ms 37.407ms 1
4239
- aten::matmul 0.05% 20.951us 1.02% 424.118us 35.343us 0.000us 0.00% 27.409ms 2.284ms 12
4240
- aten::mm 0.67% 277.566us 0.97% 403.167us 33.597us 27.409ms 73.34% 27.409ms 2.284ms 12
4241
- void cutlass::Kernel2&lt;cutlass_80_simt_sgemm_256x128_... 0.00% 0.000us 0.00% 0.000us 0.000us 27.406ms 73.33% 27.406ms 2.284ms 12
4242
- aten::mul 0.37% 154.550us 0.63% 261.852us 10.911us 2.976ms 7.96% 2.976ms 124.014us 24
4243
- aten::add 0.45% 185.160us 1.07% 445.895us 24.772us 2.401ms 6.42% 2.401ms 133.369us 18
4244
- aten::clamp 0.28% 116.599us 0.48% 198.482us 16.540us 2.391ms 6.40% 2.391ms 199.291us 12
4245
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 2.391ms 6.40% 2.391ms 199.291us 12
4246
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 1.983ms 5.30% 1.983ms 165.222us 12
4247
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 1.625ms 4.35% 1.625ms 135.419us 12
4248
- aten::index_add_ 0.12% 48.080us 0.21% 86.751us 14.459us 910.402us 2.44% 910.402us 151.734us 6
4249
- void at::native::indexFuncLargeIndex&lt;float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 910.402us 2.44% 910.402us 151.734us 6
4250
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 775.618us 2.08% 775.618us 129.270us 6
4251
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 740.611us 1.98% 740.611us 123.435us 6
4252
- aten::index 0.44% 181.234us 0.76% 317.848us 26.487us 714.884us 1.91% 714.884us 59.574us 12
4253
- void at::native::vectorized_gather_kernel&lt;16, long&gt;(... 0.00% 0.000us 0.00% 0.000us 0.000us 681.379us 1.82% 681.379us 113.563us 6
4254
- aten::sigmoid 0.09% 38.611us 0.16% 65.922us 10.987us 320.927us 0.86% 320.927us 53.488us 6
4255
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 320.927us 0.86% 320.927us 53.488us 6
4256
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 253.057us 0.68% 253.057us 42.176us 6
4257
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4258
- Self CPU time total: 41.585ms
4259
- Self CUDA time total: 37.374ms
4260
 
4261
 
4262
 
@@ -4266,54 +4266,56 @@ PROFILE TRACE: gpt_oss_experts | cuda_B4_S1024_E4
4266
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4267
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4268
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4269
- gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 41.218ms 116.52% 41.218ms 41.218ms 1
4270
- gpt_oss_experts 6.00% 2.524ms 99.99% 42.088ms 42.088ms 0.000us 0.00% 35.395ms 35.395ms 1
4271
- aten::matmul 0.10% 40.160us 2.08% 875.043us 36.460us 0.000us 0.00% 29.436ms 1.226ms 24
4272
- aten::mm 1.24% 520.099us 1.98% 834.883us 34.787us 29.436ms 83.21% 29.436ms 1.226ms 24
4273
- void cutlass::Kernel2&lt;cutlass_80_simt_sgemm_256x128_... 0.00% 0.000us 0.00% 0.000us 0.000us 20.785ms 58.75% 20.785ms 1.386ms 15
4274
- ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 8.635ms 24.41% 8.635ms 959.410us 9
4275
- aten::add 0.83% 349.812us 1.43% 602.505us 16.736us 1.482ms 4.19% 1.482ms 41.161us 36
4276
- aten::mul 0.72% 302.661us 1.25% 525.878us 10.956us 1.369ms 3.87% 1.369ms 28.527us 48
4277
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 928.163us 2.62% 928.163us 38.673us 24
4278
- aten::index_add_ 0.23% 95.791us 0.40% 170.382us 14.198us 908.198us 2.57% 908.198us 75.683us 12
4279
- void at::native::indexFuncLargeIndex&lt;float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 908.198us 2.57% 908.198us 75.683us 12
4280
- aten::clamp 0.52% 220.263us 0.90% 378.355us 15.765us 771.551us 2.18% 771.551us 32.148us 24
4281
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 771.551us 2.18% 771.551us 32.148us 24
4282
- aten::index 0.83% 351.191us 1.46% 613.487us 25.562us 665.121us 1.88% 665.121us 27.713us 24
4283
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 648.065us 1.83% 648.065us 54.005us 12
4284
- void at::native::vectorized_gather_kernel&lt;16, long&gt;(... 0.00% 0.000us 0.00% 0.000us 0.000us 594.560us 1.68% 594.560us 49.547us 12
4285
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 553.635us 1.57% 553.635us 46.136us 12
4286
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 519.010us 1.47% 519.010us 21.625us 24
4287
- aten::sigmoid 0.17% 72.451us 0.30% 125.701us 10.475us 356.257us 1.01% 356.257us 29.688us 12
4288
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 356.257us 1.01% 356.257us 29.688us 12
4289
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4290
- Self CPU time total: 42.094ms
4291
- Self CUDA time total: 35.375ms
4292
 
4293
 
4294
  impl wl p50(ms) ok
4295
- gpt_oss_experts cuda_B1_S1024_E2 3.84 True
4296
- gpt_oss_experts cuda_B1_S1024_E4 5.30 True
4297
- gpt_oss_experts cuda_B1_S512_E2 2.68 True
4298
- gpt_oss_experts cuda_B1_S512_E4 3.91 True
4299
- gpt_oss_experts cuda_B4_S1024_E2 13.35 True
4300
- gpt_oss_experts cuda_B4_S1024_E4 13.35 True
4301
- gpt_oss_experts cuda_B4_S512_E2 6.80 True
4302
- gpt_oss_experts cuda_B4_S512_E4 7.46 True
4303
  </pre></div>
4304
  <div class="uv-install-logs" id="uv-logs-benchmark">
4305
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4306
  <div class="uv-logs-content" style="display: none;">
4307
  Updating https://github.com/huggingface/kernels.git (HEAD)
4308
- Updated https://github.com/huggingface/kernels.git (39d2ade2d3d5e05476d42bcdd62ecdaa78f2db69)
4309
- Building kernels @ git+https://github.com/huggingface/kernels.git@39d2ade2d3d5e05476d42bcdd62ecdaa78f2db69
4310
- Built kernels @ git+https://github.com/huggingface/kernels.git@39d2ade2d3d5e05476d42bcdd62ecdaa78f2db69
4311
- Installed 52 packages in 267ms
4312
  </div>
4313
  </div>
4314
- <div class="cell-stderr">Fetching 6 files: 0%| | 0/6 [00:00&lt;?, ?it/s]
4315
- Fetching 6 files: 50%|█████ | 3/6 [00:00&lt;00:00, 3.54it/s]
4316
- Fetching 6 files: 100%|██████████| 6/6 [00:00&lt;00:00, 7.08it/s]</div>
 
 
4317
  <div class="cell-artifacts">
4318
  <h4>Artifacts:</h4>
4319
  <a href="artifacts/benchmark/openai_moe.jsonl" class="artifact" target="_blank">openai_moe.jsonl</a>
 
3874
  <div class="system-info">
3875
  <div class="system-info-header">Generated on:</div>
3876
  <div class="system-info-content">
3877
+ Linux x86_64 | Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35
3878
  </div>
3879
  </div>
3880
 
 
3888
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3890
  </span> |
3891
+ Cell: nv | 0.25s
3892
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3894
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
 
3905
  </div>
3906
  </div>
3907
  <div id="output-nv" class="cell-output">
3908
+ <div class="cell-stdout"><pre class="stdout-text">Fri Dec 19 18:56:28 2025
3909
  +-----------------------------------------------------------------------------------------+
3910
+ | NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 |
3911
  +-----------------------------------------+------------------------+----------------------+
3912
  | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
3913
  | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
3914
  | | | MIG M. |
3915
  |=========================================+========================+======================|
3916
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3917
+ | N/A 34C P0 80W / 350W | 0MiB / 46068MiB | 41% Default |
3918
  | | | N/A |
3919
  +-----------------------------------------+------------------------+----------------------+
3920
 
 
3938
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3939
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3940
  </span> |
3941
+ Cell: benchmark | 24.78s
3942
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3943
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3944
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
4042
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4043
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4044
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4045
+ gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 10.216ms 197.55% 10.216ms 10.216ms 1
4046
+ gpt_oss_experts 15.91% 1.991ms 99.94% 12.506ms 12.506ms 0.000us 0.00% 5.174ms 5.174ms 1
4047
+ aten::matmul 0.20% 25.600us 3.83% 479.475us 39.956us 0.000us 0.00% 4.551ms 379.252us 12
4048
+ aten::mm 2.39% 299.076us 3.63% 453.875us 37.823us 4.551ms 88.01% 4.551ms 379.252us 12
4049
+ ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 3.080ms 59.56% 3.080ms 342.220us 9
4050
+ void cutlass::Kernel2&lt;cutlass_80_simt_sgemm_128x64_8... 0.00% 0.000us 0.00% 0.000us 0.000us 1.465ms 28.33% 1.465ms 488.237us 3
4051
+ aten::mul 1.26% 158.145us 2.18% 272.217us 11.342us 108.227us 2.09% 108.227us 4.509us 24
4052
+ aten::add 1.55% 194.211us 3.70% 462.764us 25.709us 102.178us 1.98% 102.178us 5.677us 18
4053
+ aten::index 1.59% 198.973us 2.67% 334.663us 27.889us 88.354us 1.71% 88.354us 7.363us 12
4054
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 79.810us 1.54% 79.810us 6.651us 12
4055
+ aten::index_add_ 0.45% 56.680us 0.73% 90.740us 15.123us 79.552us 1.54% 79.552us 13.259us 6
4056
+ void at::native::indexFuncLargeIndex&lt;float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 79.552us 1.54% 79.552us 13.259us 6
4057
+ aten::nonzero 2.18% 273.387us 6.63% 829.392us 92.155us 65.344us 1.26% 76.032us 8.448us 9
4058
+ aten::clamp 1.03% 129.422us 1.66% 207.823us 17.319us 62.817us 1.21% 62.817us 5.235us 12
4059
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 62.817us 1.21% 62.817us 5.235us 12
4060
+ aten::where 0.06% 7.719us 5.20% 651.098us 108.516us 0.000us 0.00% 61.377us 10.230us 6
4061
+ aten::nonzero_numpy 0.10% 11.990us 5.14% 643.379us 107.230us 0.000us 0.00% 61.377us 10.230us 6
4062
+ void at::native::vectorized_gather_kernel&lt;16, long&gt;(... 0.00% 0.000us 0.00% 0.000us 0.000us 60.705us 1.17% 60.705us 10.117us 6
4063
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 56.224us 1.09% 56.224us 4.685us 12
4064
+ Memcpy DtoH (Device -&gt; Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 52.097us 1.01% 52.097us 1.158us 45
4065
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4066
+ Self CPU time total: 12.513ms
4067
+ Self CUDA time total: 5.171ms
4068
 
4069
 
4070
 
 
4074
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4075
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4076
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4077
+ gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 13.651ms 223.52% 13.651ms 13.651ms 1
4078
+ gpt_oss_experts 16.12% 2.545ms 99.96% 15.780ms 15.780ms 0.000us 0.00% 6.110ms 6.110ms 1
4079
+ aten::matmul 0.27% 42.481us 4.88% 770.802us 32.117us 0.000us 0.00% 5.294ms 220.572us 24
4080
+ aten::mm 2.84% 449.097us 4.61% 728.321us 30.347us 5.294ms 86.68% 5.294ms 220.572us 24
4081
+ ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 5.237ms 85.76% 5.237ms 218.225us 24
4082
+ aten::nonzero 2.39% 377.468us 7.75% 1.223ms 81.521us 114.980us 1.88% 137.541us 9.169us 15
4083
+ aten::mul 1.82% 287.750us 3.13% 494.205us 10.296us 131.291us 2.15% 131.291us 2.735us 48
4084
+ aten::add 2.12% 335.279us 3.54% 558.312us 15.509us 126.947us 2.08% 126.947us 3.526us 36
4085
+ aten::where 0.06% 10.192us 7.29% 1.151ms 95.886us 0.000us 0.00% 123.269us 10.272us 12
4086
+ aten::nonzero_numpy 0.13% 20.434us 7.22% 1.140ms 95.037us 0.000us 0.00% 123.269us 10.272us 12
4087
+ aten::index 2.26% 356.611us 3.79% 598.637us 24.943us 111.201us 1.82% 111.201us 4.633us 24
4088
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 100.995us 1.65% 100.995us 4.208us 24
4089
+ Memcpy DtoH (Device -&gt; Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 92.827us 1.52% 92.827us 1.067us 87
4090
+ aten::clamp 1.32% 208.364us 2.23% 352.254us 14.677us 87.969us 1.44% 87.969us 3.665us 24
4091
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 87.969us 1.44% 87.969us 3.665us 24
4092
+ aten::item 0.49% 76.878us 38.49% 6.076ms 84.392us 0.000us 0.00% 76.474us 1.062us 72
4093
+ aten::_local_scalar_dense 1.91% 301.114us 38.00% 5.999ms 83.325us 76.474us 1.25% 76.474us 1.062us 72
4094
+ aten::index_add_ 0.59% 93.433us 0.97% 153.683us 12.807us 71.618us 1.17% 71.618us 5.968us 12
4095
+ void at::native::indexFuncLargeIndex&lt;float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 71.618us 1.17% 71.618us 5.968us 12
4096
+ void at::native::vectorized_gather_kernel&lt;16, long&gt;(... 0.00% 0.000us 0.00% 0.000us 0.000us 66.305us 1.09% 66.305us 5.525us 12
4097
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4098
+ Self CPU time total: 15.786ms
4099
+ Self CUDA time total: 6.107ms
4100
 
4101
 
4102
 
 
4106
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4107
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4108
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4109
+ gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 12.389ms 148.13% 12.389ms 12.389ms 1
4110
+ gpt_oss_experts 11.41% 1.669ms 99.96% 14.621ms 14.621ms 0.000us 0.00% 8.369ms 8.369ms 1
4111
+ aten::matmul 0.15% 21.391us 2.94% 430.078us 35.840us 0.000us 0.00% 7.346ms 612.203us 12
4112
+ aten::mm 1.75% 256.389us 2.79% 408.687us 34.057us 7.346ms 87.84% 7.346ms 612.203us 12
4113
+ void cutlass::Kernel2&lt;cutlass_80_simt_sgemm_256x128_... 0.00% 0.000us 0.00% 0.000us 0.000us 4.488ms 53.66% 4.488ms 748.004us 6
4114
+ ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 1.464ms 17.50% 1.464ms 487.982us 3
4115
+ void cutlass::Kernel2&lt;cutlass_80_simt_sgemm_128x64_8... 0.00% 0.000us 0.00% 0.000us 0.000us 1.388ms 16.59% 1.388ms 462.616us 3
4116
+ aten::mul 1.00% 145.604us 1.75% 255.696us 10.654us 194.273us 2.32% 194.273us 8.095us 24
4117
+ aten::add 1.43% 208.704us 2.27% 331.465us 18.415us 186.050us 2.22% 186.050us 10.336us 18
4118
+ aten::index_add_ 0.32% 46.701us 0.54% 78.582us 13.097us 164.160us 1.96% 164.160us 27.360us 6
4119
+ void at::native::indexFuncLargeIndex&lt;float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 164.160us 1.96% 164.160us 27.360us 6
4120
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 147.425us 1.76% 147.425us 12.285us 12
4121
+ aten::index 1.21% 177.253us 2.08% 304.936us 25.411us 145.886us 1.74% 145.886us 12.157us 12
4122
+ void at::native::vectorized_gather_kernel&lt;16, long&gt;(... 0.00% 0.000us 0.00% 0.000us 0.000us 115.777us 1.38% 115.777us 19.296us 6
4123
+ aten::clamp 0.73% 106.215us 1.25% 183.083us 15.257us 109.858us 1.31% 109.858us 9.155us 12
4124
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 109.858us 1.31% 109.858us 9.155us 12
4125
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 103.393us 1.24% 103.393us 8.616us 12
4126
+ aten::nonzero 1.57% 229.936us 5.04% 737.613us 81.957us 69.954us 0.84% 81.378us 9.042us 9
4127
+ aten::where 0.04% 5.651us 4.11% 600.652us 100.109us 0.000us 0.00% 66.625us 11.104us 6
4128
+ aten::nonzero_numpy 0.07% 10.392us 4.07% 595.001us 99.167us 0.000us 0.00% 66.625us 11.104us 6
4129
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4130
+ Self CPU time total: 14.627ms
4131
+ Self CUDA time total: 8.364ms
4132
 
4133
 
4134
 
 
4138
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4139
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4140
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4141
+ gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 18.030ms 173.64% 18.030ms 18.030ms 1
4142
+ gpt_oss_experts 13.01% 2.655ms 99.97% 20.395ms 20.395ms 0.000us 0.00% 10.389ms 10.389ms 1
4143
+ aten::matmul 0.22% 44.301us 3.96% 808.849us 33.702us 0.000us 0.00% 9.112ms 379.676us 24
4144
+ aten::mm 2.30% 469.031us 3.75% 764.548us 31.856us 9.112ms 87.75% 9.112ms 379.676us 24
4145
+ ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 6.210ms 59.81% 6.210ms 345.012us 18
4146
+ void cutlass::Kernel2&lt;cutlass_80_simt_sgemm_128x64_8... 0.00% 0.000us 0.00% 0.000us 0.000us 2.889ms 27.82% 2.889ms 481.470us 6
4147
+ aten::mul 1.42% 289.963us 2.49% 508.435us 10.592us 229.763us 2.21% 229.763us 4.787us 48
4148
+ aten::add 1.72% 350.925us 2.89% 589.949us 16.387us 210.624us 2.03% 210.624us 5.851us 36
4149
+ aten::index 1.71% 348.756us 3.02% 616.348us 25.681us 206.625us 1.99% 206.625us 8.609us 24
4150
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 164.800us 1.59% 164.800us 6.867us 24
4151
+ aten::index_add_ 0.46% 94.741us 0.78% 158.583us 13.215us 154.948us 1.49% 154.948us 12.912us 12
4152
+ void at::native::indexFuncLargeIndex&lt;float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 154.948us 1.49% 154.948us 12.912us 12
4153
+ aten::nonzero 1.87% 380.973us 6.27% 1.279ms 85.299us 123.616us 1.19% 148.097us 9.873us 15
4154
+ void at::native::vectorized_gather_kernel&lt;16, long&gt;(... 0.00% 0.000us 0.00% 0.000us 0.000us 147.008us 1.42% 147.008us 12.251us 12
4155
+ aten::where 0.05% 10.520us 5.90% 1.205ms 100.384us 0.000us 0.00% 133.153us 11.096us 12
4156
+ aten::nonzero_numpy 0.10% 20.862us 5.85% 1.194ms 99.507us 0.000us 0.00% 133.153us 11.096us 12
4157
+ aten::clamp 1.12% 227.601us 1.88% 383.872us 15.995us 131.553us 1.27% 131.553us 5.481us 24
4158
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 131.553us 1.27% 131.553us 5.481us 24
4159
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 117.823us 1.13% 117.823us 4.909us 24
4160
+ Memcpy DtoH (Device -&gt; Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 108.771us 1.05% 108.771us 1.250us 87
4161
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4162
+ Self CPU time total: 20.401ms
4163
+ Self CUDA time total: 10.384ms
4164
 
4165
 
4166
 
 
4170
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4171
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4172
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4173
+ gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 20.818ms 119.74% 20.818ms 20.818ms 1
4174
+ gpt_oss_experts 7.44% 1.725ms 99.97% 23.178ms 23.178ms 0.000us 0.00% 17.396ms 17.396ms 1
4175
+ aten::matmul 0.10% 22.710us 1.92% 444.608us 37.051us 0.000us 0.00% 14.530ms 1.211ms 12
4176
+ aten::mm 1.15% 265.607us 1.82% 421.898us 35.158us 14.530ms 83.57% 14.530ms 1.211ms 12
4177
+ void cutlass::Kernel2&lt;cutlass_80_simt_sgemm_256x128_... 0.00% 0.000us 0.00% 0.000us 0.000us 8.913ms 51.26% 8.913ms 1.485ms 6
4178
+ ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 5.608ms 32.26% 5.608ms 934.678us 6
4179
+ aten::add 0.78% 180.710us 1.31% 303.585us 16.866us 773.156us 4.45% 773.156us 42.953us 18
4180
+ aten::mul 0.65% 149.642us 1.11% 257.853us 10.744us 660.963us 3.80% 660.963us 27.540us 24
4181
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 496.548us 2.86% 496.548us 41.379us 12
4182
+ aten::index_add_ 0.21% 47.690us 0.35% 80.102us 13.350us 447.875us 2.58% 447.875us 74.646us 6
4183
+ void at::native::indexFuncLargeIndex&lt;float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 447.875us 2.58% 447.875us 74.646us 6
4184
+ aten::clamp 0.46% 106.452us 0.78% 180.843us 15.070us 330.692us 1.90% 330.692us 27.558us 12
4185
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 330.692us 1.90% 330.692us 27.558us 12
4186
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 303.202us 1.74% 303.202us 50.534us 6
4187
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 276.608us 1.59% 276.608us 46.101us 6
4188
+ aten::index 0.79% 182.360us 1.33% 307.754us 25.646us 264.037us 1.52% 264.037us 22.003us 12
4189
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 255.650us 1.47% 255.650us 21.304us 12
4190
+ void at::native::vectorized_gather_kernel&lt;16, long&gt;(... 0.00% 0.000us 0.00% 0.000us 0.000us 230.532us 1.33% 230.532us 38.422us 6
4191
+ aten::sigmoid 0.15% 34.019us 0.26% 59.750us 9.958us 176.897us 1.02% 176.897us 29.483us 6
4192
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 176.897us 1.02% 176.897us 29.483us 6
4193
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4194
+ Self CPU time total: 23.184ms
4195
+ Self CUDA time total: 17.386ms
4196
 
4197
 
4198
 
 
4202
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4203
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4204
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4205
+ gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 24.095ms 139.71% 24.095ms 24.095ms 1
4206
+ gpt_oss_experts 10.26% 2.566ms 99.98% 25.007ms 25.007ms 0.000us 0.00% 17.256ms 17.256ms 1
4207
+ aten::matmul 0.18% 46.022us 3.50% 875.333us 36.472us 0.000us 0.00% 15.047ms 626.957us 24
4208
+ aten::mm 2.10% 524.786us 3.32% 829.311us 34.555us 15.047ms 87.25% 15.047ms 626.957us 24
4209
+ void cutlass::Kernel2&lt;cutlass_80_simt_sgemm_256x128_... 0.00% 0.000us 0.00% 0.000us 0.000us 9.083ms 52.67% 9.083ms 756.906us 12
4210
+ ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 3.100ms 17.97% 3.100ms 516.616us 6
4211
+ void cutlass::Kernel2&lt;cutlass_80_simt_sgemm_128x64_8... 0.00% 0.000us 0.00% 0.000us 0.000us 2.851ms 16.53% 2.851ms 475.118us 6
4212
+ aten::add 1.39% 348.094us 2.36% 591.130us 16.420us 420.966us 2.44% 420.966us 11.694us 36
4213
+ aten::mul 1.18% 295.904us 2.08% 520.297us 10.840us 412.933us 2.39% 412.933us 8.603us 48
4214
+ aten::index_add_ 0.37% 93.743us 0.64% 158.984us 13.249us 378.655us 2.20% 378.655us 31.555us 12
4215
+ void at::native::indexFuncLargeIndex&lt;float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 378.655us 2.20% 378.655us 31.555us 12
4216
+ aten::index 1.44% 360.181us 2.46% 616.468us 25.686us 341.602us 1.98% 341.602us 14.233us 24
4217
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 337.478us 1.96% 337.478us 14.062us 24
4218
+ void at::native::vectorized_gather_kernel&lt;16, long&gt;(... 0.00% 0.000us 0.00% 0.000us 0.000us 277.186us 1.61% 277.186us 23.099us 12
4219
+ aten::clamp 0.86% 215.346us 1.46% 365.788us 15.241us 227.201us 1.32% 227.201us 9.467us 24
4220
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 227.201us 1.32% 227.201us 9.467us 24
4221
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 218.148us 1.26% 218.148us 9.090us 24
4222
+ aten::nonzero 1.58% 395.427us 5.08% 1.271ms 84.763us 129.407us 0.75% 155.998us 10.400us 15
4223
+ aten::where 0.04% 10.161us 4.81% 1.203ms 100.233us 0.000us 0.00% 140.318us 11.693us 12
4224
+ aten::nonzero_numpy 0.09% 22.657us 4.77% 1.193ms 99.386us 0.000us 0.00% 140.318us 11.693us 12
4225
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4226
+ Self CPU time total: 25.012ms
4227
+ Self CUDA time total: 17.246ms
4228
 
4229
 
4230
 
 
4234
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4235
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4236
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4237
+ gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 40.042ms 109.28% 40.042ms 40.042ms 1
4238
+ gpt_oss_experts 4.23% 1.729ms 99.82% 40.817ms 40.817ms 0.000us 0.00% 36.674ms 36.674ms 1
4239
+ aten::matmul 0.05% 21.410us 1.03% 421.330us 35.111us 0.000us 0.00% 26.675ms 2.223ms 12
4240
+ aten::mm 0.68% 276.698us 0.98% 399.920us 33.327us 26.675ms 72.80% 26.675ms 2.223ms 12
4241
+ void cutlass::Kernel2&lt;cutlass_80_simt_sgemm_256x128_... 0.00% 0.000us 0.00% 0.000us 0.000us 26.671ms 72.79% 26.671ms 2.223ms 12
4242
+ aten::mul 0.37% 150.524us 0.64% 261.025us 10.876us 2.978ms 8.13% 2.978ms 124.096us 24
4243
+ aten::add 0.45% 185.051us 1.06% 431.657us 23.981us 2.397ms 6.54% 2.397ms 133.144us 18
4244
+ aten::clamp 0.27% 109.540us 0.45% 185.742us 15.479us 2.388ms 6.52% 2.388ms 199.031us 12
4245
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 2.388ms 6.52% 2.388ms 199.031us 12
4246
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 1.988ms 5.43% 1.988ms 165.705us 12
4247
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 1.624ms 4.43% 1.624ms 135.337us 12
4248
+ aten::index_add_ 0.12% 48.010us 0.20% 82.940us 13.823us 919.238us 2.51% 919.238us 153.206us 6
4249
+ void at::native::indexFuncLargeIndex&lt;float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 919.238us 2.51% 919.238us 153.206us 6
4250
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 772.550us 2.11% 772.550us 128.758us 6
4251
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 739.366us 2.02% 739.366us 123.228us 6
4252
+ aten::index 0.45% 182.853us 0.76% 309.646us 25.804us 710.532us 1.94% 710.532us 59.211us 12
4253
+ void at::native::vectorized_gather_kernel&lt;16, long&gt;(... 0.00% 0.000us 0.00% 0.000us 0.000us 676.741us 1.85% 676.741us 112.790us 6
4254
+ aten::sigmoid 0.10% 42.329us 0.17% 69.270us 11.545us 319.457us 0.87% 319.457us 53.243us 6
4255
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 319.457us 0.87% 319.457us 53.243us 6
4256
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 250.467us 0.68% 250.467us 41.744us 6
4257
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4258
+ Self CPU time total: 40.890ms
4259
+ Self CUDA time total: 36.640ms
4260
 
4261
 
4262
 
 
4266
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4267
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4268
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4269
+ gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 40.661ms 117.09% 40.661ms 40.661ms 1
4270
+ gpt_oss_experts 6.16% 2.556ms 99.99% 41.476ms 41.476ms 0.000us 0.00% 34.747ms 34.747ms 1
4271
+ aten::matmul 0.11% 44.399us 2.11% 876.925us 36.539us 0.000us 0.00% 28.768ms 1.199ms 24
4272
+ aten::mm 1.26% 521.881us 2.01% 832.526us 34.689us 28.768ms 82.84% 28.768ms 1.199ms 24
4273
+ void cutlass::Kernel2&lt;cutlass_80_simt_sgemm_256x128_... 0.00% 0.000us 0.00% 0.000us 0.000us 20.394ms 58.72% 20.394ms 1.360ms 15
4274
+ ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 8.357ms 24.06% 8.357ms 928.569us 9
4275
+ aten::add 0.86% 357.079us 1.47% 609.793us 16.939us 1.481ms 4.26% 1.481ms 41.126us 36
4276
+ aten::mul 0.72% 298.967us 1.26% 524.144us 10.920us 1.380ms 3.97% 1.380ms 28.743us 48
4277
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 929.416us 2.68% 929.416us 38.726us 24
4278
+ aten::index_add_ 0.23% 94.554us 0.39% 161.804us 13.484us 921.702us 2.65% 921.702us 76.809us 12
4279
+ void at::native::indexFuncLargeIndex&lt;float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 921.702us 2.65% 921.702us 76.809us 12
4280
+ aten::clamp 0.53% 218.042us 0.91% 375.616us 15.651us 772.487us 2.22% 772.487us 32.187us 24
4281
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 772.487us 2.22% 772.487us 32.187us 24
4282
+ aten::index 0.86% 357.217us 1.47% 607.740us 25.323us 652.838us 1.88% 652.838us 27.202us 24
4283
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 648.162us 1.87% 648.162us 54.013us 12
4284
+ void at::native::vectorized_gather_kernel&lt;16, long&gt;(... 0.00% 0.000us 0.00% 0.000us 0.000us 580.997us 1.67% 580.997us 48.416us 12
4285
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 551.108us 1.59% 551.108us 45.926us 12
4286
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 524.097us 1.51% 524.097us 21.837us 24
4287
+ aten::sigmoid 0.17% 69.444us 0.30% 123.064us 10.255us 357.924us 1.03% 357.924us 29.827us 12
4288
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 357.924us 1.03% 357.924us 29.827us 12
4289
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4290
+ Self CPU time total: 41.482ms
4291
+ Self CUDA time total: 34.727ms
4292
 
4293
 
4294
  impl wl p50(ms) ok
4295
+ gpt_oss_experts cuda_B1_S1024_E2 3.77 True
4296
+ gpt_oss_experts cuda_B1_S1024_E4 5.20 True
4297
+ gpt_oss_experts cuda_B1_S512_E2 2.61 True
4298
+ gpt_oss_experts cuda_B1_S512_E4 3.85 True
4299
+ gpt_oss_experts cuda_B4_S1024_E2 13.12 True
4300
+ gpt_oss_experts cuda_B4_S1024_E4 13.22 True
4301
+ gpt_oss_experts cuda_B4_S512_E2 6.64 True
4302
+ gpt_oss_experts cuda_B4_S512_E4 7.30 True
4303
  </pre></div>
4304
  <div class="uv-install-logs" id="uv-logs-benchmark">
4305
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4306
  <div class="uv-logs-content" style="display: none;">
4307
  Updating https://github.com/huggingface/kernels.git (HEAD)
4308
+ Updated https://github.com/huggingface/kernels.git (55b7c980e96bf5f747f0e4136be61c0b089ab76c)
4309
+ Building kernels @ git+https://github.com/huggingface/kernels.git@55b7c980e96bf5f747f0e4136be61c0b089ab76c
4310
+ Built kernels @ git+https://github.com/huggingface/kernels.git@55b7c980e96bf5f747f0e4136be61c0b089ab76c
4311
+ Installed 51 packages in 279ms
4312
  </div>
4313
  </div>
4314
+ <div class="cell-stderr">Fetching 6 files: 0%| | 0/6 [00:00&lt;?, ?it/s]Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.
4315
+
4316
+ Fetching 6 files: 17%|█▋ | 1/6 [00:00&lt;00:00, 5.68it/s]
4317
+ Fetching 6 files: 50%|█████ | 3/6 [00:00&lt;00:00, 5.21it/s]
4318
+ Fetching 6 files: 100%|██████████| 6/6 [00:00&lt;00:00, 10.50it/s]</div>
4319
  <div class="cell-artifacts">
4320
  <h4>Artifacts:</h4>
4321
  <a href="artifacts/benchmark/openai_moe.jsonl" class="artifact" target="_blank">openai_moe.jsonl</a>
openai_moe/results/artifacts/combine/latency.svg CHANGED

Git LFS Details

  • SHA256: b2bc6dda123451533c1a79e388cff2690a74060cba2a49b113394b35cfec34c2
  • Pointer size: 130 Bytes
  • Size of remote file: 21.9 kB

Git LFS Details

  • SHA256: 7db8d527515e46144ea7b8f5f5738602c070e5f806c682f79f8cd000058b9bc5
  • Pointer size: 130 Bytes
  • Size of remote file: 20.3 kB
openai_moe/results/combined_results.html CHANGED
@@ -3874,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
3874
  <div class="system-info">
3875
  <div class="system-info-header">Generated on:</div>
3876
  <div class="system-info-content">
3877
- Linux x86_64 | Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35
3878
  </div>
3879
  </div>
3880
 
@@ -3889,11 +3889,11 @@ body[data-tool="eraser"] .main-content {
3889
  <rdf:RDF>
3890
  <ns2:Work>
3891
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
3892
- <dc:date>2025-11-10T22:12:05.730920</dc:date>
3893
  <dc:format>image/svg+xml</dc:format>
3894
  <dc:creator>
3895
  <ns2:Agent>
3896
- <dc:title>Matplotlib v3.10.7, https://matplotlib.org/</dc:title>
3897
  </ns2:Agent>
3898
  </dc:creator>
3899
  </ns2:Work>
@@ -3908,320 +3908,294 @@ body[data-tool="eraser"] .main-content {
3908
  </g>
3909
  <g id="axes--1" class="axes">
3910
  <g id="patch_2">
3911
- <path d="M 57.26 468.317269 L 845.766818 468.317269 L 845.766818 26.88 L 57.26 26.88 L 57.26 468.317269 z " style="fill: none" />
3912
  </g>
3913
  <g id="matplotlib.axis_1">
3914
  <g id="xtick_1">
3915
  <g id="grid-x--1" class="grid grid-x">
3916
- <path d="M 93.101219 468.317269 L 93.101219 26.88 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3917
  </g>
3918
  <g id="line2d_1">
3919
  <defs>
3920
  <path id="mafb3703e5b" d="M 0 0 L 0 3.5 " style="stroke: #000000; stroke-width: 0.8" />
3921
  </defs>
3922
  <g>
3923
- <use ns4:href="#mafb3703e5b" x="93.101219" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
3924
  </g>
3925
  </g>
3926
  <g id="text_1">
3927
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(62.90334 544.791615) rotate(-45)">cuda_B1_S512_E2</text>
3928
  </g>
3929
  </g>
3930
  <g id="xtick_2">
3931
  <g id="grid-x--2" class="grid grid-x">
3932
- <path d="M 195.504702 468.317269 L 195.504702 26.88 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3933
  </g>
3934
  <g id="line2d_2">
3935
  <g>
3936
- <use ns4:href="#mafb3703e5b" x="195.504702" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
3937
  </g>
3938
  </g>
3939
  <g id="text_2">
3940
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(165.306823 544.791615) rotate(-45)">cuda_B1_S512_E4</text>
3941
  </g>
3942
  </g>
3943
  <g id="xtick_3">
3944
  <g id="grid-x--3" class="grid grid-x">
3945
- <path d="M 297.908185 468.317269 L 297.908185 26.88 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3946
  </g>
3947
  <g id="line2d_3">
3948
  <g>
3949
- <use ns4:href="#mafb3703e5b" x="297.908185" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
3950
  </g>
3951
  </g>
3952
  <g id="text_3">
3953
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(265.460822 549.290582) rotate(-45)">cuda_B1_S1024_E2</text>
3954
  </g>
3955
  </g>
3956
  <g id="xtick_4">
3957
  <g id="grid-x--4" class="grid grid-x">
3958
- <path d="M 400.311668 468.317269 L 400.311668 26.88 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3959
  </g>
3960
  <g id="line2d_4">
3961
  <g>
3962
- <use ns4:href="#mafb3703e5b" x="400.311668" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
3963
  </g>
3964
  </g>
3965
  <g id="text_4">
3966
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(367.864305 549.290582) rotate(-45)">cuda_B1_S1024_E4</text>
3967
  </g>
3968
  </g>
3969
  <g id="xtick_5">
3970
  <g id="grid-x--5" class="grid grid-x">
3971
- <path d="M 502.71515 468.317269 L 502.71515 26.88 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3972
  </g>
3973
  <g id="line2d_5">
3974
  <g>
3975
- <use ns4:href="#mafb3703e5b" x="502.71515" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
3976
  </g>
3977
  </g>
3978
  <g id="text_5">
3979
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(472.517271 544.791615) rotate(-45)">cuda_B4_S512_E2</text>
3980
  </g>
3981
  </g>
3982
  <g id="xtick_6">
3983
  <g id="grid-x--6" class="grid grid-x">
3984
- <path d="M 605.118633 468.317269 L 605.118633 26.88 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3985
  </g>
3986
  <g id="line2d_6">
3987
  <g>
3988
- <use ns4:href="#mafb3703e5b" x="605.118633" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
3989
  </g>
3990
  </g>
3991
  <g id="text_6">
3992
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(574.920754 544.791615) rotate(-45)">cuda_B4_S512_E4</text>
3993
  </g>
3994
  </g>
3995
  <g id="xtick_7">
3996
  <g id="grid-x--7" class="grid grid-x">
3997
- <path d="M 707.522116 468.317269 L 707.522116 26.88 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3998
  </g>
3999
  <g id="line2d_7">
4000
  <g>
4001
- <use ns4:href="#mafb3703e5b" x="707.522116" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
4002
  </g>
4003
  </g>
4004
  <g id="text_7">
4005
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(675.074754 549.290582) rotate(-45)">cuda_B4_S1024_E2</text>
4006
  </g>
4007
  </g>
4008
  <g id="xtick_8">
4009
  <g id="grid-x--8" class="grid grid-x">
4010
- <path d="M 809.925599 468.317269 L 809.925599 26.88 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4011
  </g>
4012
  <g id="line2d_8">
4013
  <g>
4014
- <use ns4:href="#mafb3703e5b" x="809.925599" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
4015
  </g>
4016
  </g>
4017
  <g id="text_8">
4018
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(777.478237 549.290582) rotate(-45)">cuda_B4_S1024_E4</text>
4019
  </g>
4020
  </g>
4021
  <g id="label--x" class="xlabel">
4022
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="451.513409" y="562.556245" transform="rotate(-0 451.513409 562.556245)">Workload</text>
4023
  </g>
4024
  </g>
4025
  <g id="matplotlib.axis_2">
4026
  <g id="ytick_1">
4027
  <g id="grid-y--2" class="grid grid-y">
4028
- <path d="M 57.26 448.9249 L 845.766818 448.9249 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4029
  </g>
4030
  <g id="line2d_9">
4031
  <defs>
4032
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4033
  </defs>
4034
  <g>
4035
- <use ns4:href="#m0fca2865ba" x="57.26" y="448.9249" style="stroke: #000000; stroke-width: 0.8" />
4036
  </g>
4037
  </g>
4038
  <g id="text_9">
4039
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="452.724118" transform="rotate(-0 50.26 452.724118)">0</text>
4040
  </g>
4041
  </g>
4042
  <g id="ytick_2">
4043
  <g id="grid-y--3" class="grid grid-y">
4044
- <path d="M 57.26 398.737122 L 845.766818 398.737122 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4045
  </g>
4046
  <g id="line2d_10">
4047
  <g>
4048
- <use ns4:href="#m0fca2865ba" x="57.26" y="398.737122" style="stroke: #000000; stroke-width: 0.8" />
4049
  </g>
4050
  </g>
4051
  <g id="text_10">
4052
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="402.536341" transform="rotate(-0 50.26 402.536341)">200</text>
4053
  </g>
4054
  </g>
4055
  <g id="ytick_3">
4056
  <g id="grid-y--4" class="grid grid-y">
4057
- <path d="M 57.26 348.549345 L 845.766818 348.549345 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4058
  </g>
4059
  <g id="line2d_11">
4060
  <g>
4061
- <use ns4:href="#m0fca2865ba" x="57.26" y="348.549345" style="stroke: #000000; stroke-width: 0.8" />
4062
  </g>
4063
  </g>
4064
  <g id="text_11">
4065
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="352.348564" transform="rotate(-0 50.26 352.348564)">400</text>
4066
  </g>
4067
  </g>
4068
  <g id="ytick_4">
4069
  <g id="grid-y--5" class="grid grid-y">
4070
- <path d="M 57.26 298.361568 L 845.766818 298.361568 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4071
  </g>
4072
  <g id="line2d_12">
4073
  <g>
4074
- <use ns4:href="#m0fca2865ba" x="57.26" y="298.361568" style="stroke: #000000; stroke-width: 0.8" />
4075
  </g>
4076
  </g>
4077
  <g id="text_12">
4078
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="302.160786" transform="rotate(-0 50.26 302.160786)">600</text>
4079
  </g>
4080
  </g>
4081
  <g id="ytick_5">
4082
  <g id="grid-y--6" class="grid grid-y">
4083
- <path d="M 57.26 248.17379 L 845.766818 248.17379 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4084
  </g>
4085
  <g id="line2d_13">
4086
  <g>
4087
- <use ns4:href="#m0fca2865ba" x="57.26" y="248.17379" style="stroke: #000000; stroke-width: 0.8" />
4088
  </g>
4089
  </g>
4090
  <g id="text_13">
4091
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="251.973009" transform="rotate(-0 50.26 251.973009)">800</text>
4092
  </g>
4093
  </g>
4094
  <g id="ytick_6">
4095
  <g id="grid-y--7" class="grid grid-y">
4096
- <path d="M 57.26 197.986013 L 845.766818 197.986013 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4097
  </g>
4098
  <g id="line2d_14">
4099
  <g>
4100
- <use ns4:href="#m0fca2865ba" x="57.26" y="197.986013" style="stroke: #000000; stroke-width: 0.8" />
4101
  </g>
4102
  </g>
4103
  <g id="text_14">
4104
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="201.785232" transform="rotate(-0 50.26 201.785232)">1000</text>
4105
  </g>
4106
  </g>
4107
  <g id="ytick_7">
4108
  <g id="grid-y--8" class="grid grid-y">
4109
- <path d="M 57.26 147.798236 L 845.766818 147.798236 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4110
  </g>
4111
  <g id="line2d_15">
4112
  <g>
4113
- <use ns4:href="#m0fca2865ba" x="57.26" y="147.798236" style="stroke: #000000; stroke-width: 0.8" />
4114
  </g>
4115
  </g>
4116
  <g id="text_15">
4117
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="151.597454" transform="rotate(-0 50.26 151.597454)">1200</text>
4118
- </g>
4119
- </g>
4120
- <g id="ytick_8">
4121
- <g id="grid-y--9" class="grid grid-y">
4122
- <path d="M 57.26 97.610458 L 845.766818 97.610458 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4123
- </g>
4124
- <g id="line2d_16">
4125
- <g>
4126
- <use ns4:href="#m0fca2865ba" x="57.26" y="97.610458" style="stroke: #000000; stroke-width: 0.8" />
4127
- </g>
4128
- </g>
4129
- <g id="text_16">
4130
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="101.409677" transform="rotate(-0 50.26 101.409677)">1400</text>
4131
- </g>
4132
- </g>
4133
- <g id="ytick_9">
4134
- <g id="grid-y--10" class="grid grid-y">
4135
- <path d="M 57.26 47.422681 L 845.766818 47.422681 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4136
- </g>
4137
- <g id="line2d_17">
4138
- <g>
4139
- <use ns4:href="#m0fca2865ba" x="57.26" y="47.422681" style="stroke: #000000; stroke-width: 0.8" />
4140
- </g>
4141
- </g>
4142
- <g id="text_17">
4143
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="51.2219" transform="rotate(-0 50.26 51.2219)">1600</text>
4144
  </g>
4145
  </g>
4146
  <g id="label--y" class="ylabel">
4147
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="18.730313" y="247.598635" transform="rotate(-90 18.730313 247.598635)">Latency P50 (ms)</text>
4148
  </g>
4149
  </g>
4150
  <g id="series--binned-torch" class="series">
4151
- <path d="M 93.101219 410.112417 L 195.504702 398.361411 L 297.908185 356.674378 L 400.311668 350.006645 L 502.71515 264.168686 L 605.118633 248.454434 L 707.522116 76.647891 L 809.925599 46.94533 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4152
  <defs>
4153
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4154
  </defs>
4155
- <g clip-path="url(#pef1bcf59f7)">
4156
- <use ns4:href="#md7efaf3aec" x="93.101219" y="410.112417" style="fill: #1f77b4; stroke: #1f77b4" />
4157
- <use ns4:href="#md7efaf3aec" x="195.504702" y="398.361411" style="fill: #1f77b4; stroke: #1f77b4" />
4158
- <use ns4:href="#md7efaf3aec" x="297.908185" y="356.674378" style="fill: #1f77b4; stroke: #1f77b4" />
4159
- <use ns4:href="#md7efaf3aec" x="400.311668" y="350.006645" style="fill: #1f77b4; stroke: #1f77b4" />
4160
- <use ns4:href="#md7efaf3aec" x="502.71515" y="264.168686" style="fill: #1f77b4; stroke: #1f77b4" />
4161
- <use ns4:href="#md7efaf3aec" x="605.118633" y="248.454434" style="fill: #1f77b4; stroke: #1f77b4" />
4162
- <use ns4:href="#md7efaf3aec" x="707.522116" y="76.647891" style="fill: #1f77b4; stroke: #1f77b4" />
4163
- <use ns4:href="#md7efaf3aec" x="809.925599" y="46.94533" style="fill: #1f77b4; stroke: #1f77b4" />
4164
  </g>
4165
  </g>
4166
  <g id="series--gpt-oss-experts" class="series">
4167
- <path d="M 93.101219 448.251939 L 195.504702 447.944461 L 297.908185 447.961691 L 400.311668 447.593987 L 502.71515 447.219515 L 605.118633 447.053852 L 707.522116 445.574187 L 809.925599 445.573696 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4168
  <defs>
4169
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4170
  </defs>
4171
- <g clip-path="url(#pef1bcf59f7)">
4172
- <use ns4:href="#m9b8c54d372" x="93.101219" y="448.251939" style="fill: #ff7f0e; stroke: #ff7f0e" />
4173
- <use ns4:href="#m9b8c54d372" x="195.504702" y="447.944461" style="fill: #ff7f0e; stroke: #ff7f0e" />
4174
- <use ns4:href="#m9b8c54d372" x="297.908185" y="447.961691" style="fill: #ff7f0e; stroke: #ff7f0e" />
4175
- <use ns4:href="#m9b8c54d372" x="400.311668" y="447.593987" style="fill: #ff7f0e; stroke: #ff7f0e" />
4176
- <use ns4:href="#m9b8c54d372" x="502.71515" y="447.219515" style="fill: #ff7f0e; stroke: #ff7f0e" />
4177
- <use ns4:href="#m9b8c54d372" x="605.118633" y="447.053852" style="fill: #ff7f0e; stroke: #ff7f0e" />
4178
- <use ns4:href="#m9b8c54d372" x="707.522116" y="445.574187" style="fill: #ff7f0e; stroke: #ff7f0e" />
4179
- <use ns4:href="#m9b8c54d372" x="809.925599" y="445.573696" style="fill: #ff7f0e; stroke: #ff7f0e" />
4180
  </g>
4181
  </g>
4182
  <g id="patch_3">
4183
- <path d="M 57.26 468.317269 L 57.26 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4184
  </g>
4185
  <g id="patch_4">
4186
  <path d="M 845.766818 468.317269 L 845.766818 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4187
  </g>
4188
  <g id="patch_5">
4189
- <path d="M 57.26 468.317269 L 845.766818 468.317269 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4190
  </g>
4191
  <g id="patch_6">
4192
- <path d="M 57.26 26.88 L 845.766818 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4193
  </g>
4194
- <g id="text_18">
4195
- <text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="451.513409" y="20.88" transform="rotate(-0 451.513409 20.88)">Attention Implementation Latency</text>
4196
  </g>
4197
  <g id="legend" class="legend">
4198
  <g id="patch_7">
4199
- <path d="M 64.26 64.7925 L 177.05375 64.7925 Q 179.05375 64.7925 179.05375 62.7925 L 179.05375 33.88 Q 179.05375 31.88 177.05375 31.88 L 64.26 31.88 Q 62.26 31.88 62.26 33.88 L 62.26 62.7925 Q 62.26 64.7925 64.26 64.7925 L 64.26 64.7925 z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
4200
  </g>
4201
- <g id="line2d_18">
4202
- <path d="M 66.26 39.978438 L 76.26 39.978438 L 86.26 39.978438 " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4203
  <g>
4204
- <use ns4:href="#md7efaf3aec" x="76.26" y="39.978438" style="fill: #1f77b4; stroke: #1f77b4" />
4205
  </g>
4206
  </g>
4207
  <g id="legend-label--binned-torch" class="legend">
4208
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="94.26" y="43.478438" transform="rotate(-0 94.26 43.478438)">binned_torch</text>
4209
  </g>
4210
- <g id="line2d_19">
4211
- <path d="M 66.26 54.934687 L 76.26 54.934687 L 86.26 54.934687 " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4212
  <g>
4213
- <use ns4:href="#m9b8c54d372" x="76.26" y="54.934687" style="fill: #ff7f0e; stroke: #ff7f0e" />
4214
  </g>
4215
  </g>
4216
  <g id="legend-label--gpt-oss-experts" class="legend">
4217
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="94.26" y="58.434687" transform="rotate(-0 94.26 58.434687)">gpt_oss_experts</text>
4218
  </g>
4219
  </g>
4220
  </g>
4221
  </g>
4222
  <defs>
4223
- <clipPath id="pef1bcf59f7">
4224
- <rect x="57.26" y="26.88" width="788.506818" height="441.437269" />
4225
  </clipPath>
4226
  </defs>
4227
  </svg>
@@ -4234,7 +4208,7 @@ body[data-tool="eraser"] .main-content {
4234
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
4235
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
4236
  </span> |
4237
- Cell: combine | 4.50s
4238
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
4239
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
4240
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
@@ -4323,22 +4297,22 @@ Summary: 2 found, 0 skipped, 0 missing
4323
  COMBINED BENCHMARK SUMMARY
4324
 
4325
  impl wl p50(ms) ok
4326
- binned_torch cuda_B1_S1024_E2 367.62 True
4327
- binned_torch cuda_B1_S1024_E4 394.19 True
4328
- binned_torch cuda_B1_S512_E2 154.67 True
4329
- binned_torch cuda_B1_S512_E4 201.50 True
4330
- binned_torch cuda_B4_S1024_E2 1483.54 True
4331
- binned_torch cuda_B4_S1024_E4 1601.90 True
4332
- binned_torch cuda_B4_S512_E2 736.26 True
4333
- binned_torch cuda_B4_S512_E4 798.88 True
4334
- gpt_oss_experts cuda_B1_S1024_E2 3.84 True
4335
- gpt_oss_experts cuda_B1_S1024_E4 5.30 True
4336
- gpt_oss_experts cuda_B1_S512_E2 2.68 True
4337
- gpt_oss_experts cuda_B1_S512_E4 3.91 True
4338
- gpt_oss_experts cuda_B4_S1024_E2 13.35 True
4339
- gpt_oss_experts cuda_B4_S1024_E4 13.35 True
4340
- gpt_oss_experts cuda_B4_S512_E2 6.80 True
4341
- gpt_oss_experts cuda_B4_S512_E4 7.46 True
4342
 
4343
  GENERATING COMBINED VISUALIZATION
4344
 
@@ -4358,7 +4332,7 @@ Implementations included:
4358
  <div class="uv-install-logs" id="uv-logs-combine">
4359
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4360
  <div class="uv-logs-content" style="display: none;">
4361
- Installed 37 packages in 287ms
4362
  </div>
4363
  </div>
4364
  <div class="cell-artifacts">
@@ -4371,11 +4345,11 @@ Installed 37 packages in 287ms
4371
  <rdf:RDF>
4372
  <ns2:Work>
4373
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4374
- <dc:date>2025-11-10T22:12:05.730920</dc:date>
4375
  <dc:format>image/svg+xml</dc:format>
4376
  <dc:creator>
4377
  <ns2:Agent>
4378
- <dc:title>Matplotlib v3.10.7, https://matplotlib.org/</dc:title>
4379
  </ns2:Agent>
4380
  </dc:creator>
4381
  </ns2:Work>
@@ -4390,320 +4364,294 @@ Installed 37 packages in 287ms
4390
  </g>
4391
  <g id="axes--1" class="axes">
4392
  <g id="patch_2">
4393
- <path d="M 57.26 468.317269 L 845.766818 468.317269 L 845.766818 26.88 L 57.26 26.88 L 57.26 468.317269 z " style="fill: none" />
4394
  </g>
4395
  <g id="matplotlib.axis_1">
4396
  <g id="xtick_1">
4397
  <g id="grid-x--1" class="grid grid-x">
4398
- <path d="M 93.101219 468.317269 L 93.101219 26.88 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4399
  </g>
4400
  <g id="line2d_1">
4401
  <defs>
4402
  <path id="mafb3703e5b" d="M 0 0 L 0 3.5 " style="stroke: #000000; stroke-width: 0.8" />
4403
  </defs>
4404
  <g>
4405
- <use ns4:href="#mafb3703e5b" x="93.101219" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
4406
  </g>
4407
  </g>
4408
  <g id="text_1">
4409
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(62.90334 544.791615) rotate(-45)">cuda_B1_S512_E2</text>
4410
  </g>
4411
  </g>
4412
  <g id="xtick_2">
4413
  <g id="grid-x--2" class="grid grid-x">
4414
- <path d="M 195.504702 468.317269 L 195.504702 26.88 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4415
  </g>
4416
  <g id="line2d_2">
4417
  <g>
4418
- <use ns4:href="#mafb3703e5b" x="195.504702" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
4419
  </g>
4420
  </g>
4421
  <g id="text_2">
4422
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(165.306823 544.791615) rotate(-45)">cuda_B1_S512_E4</text>
4423
  </g>
4424
  </g>
4425
  <g id="xtick_3">
4426
  <g id="grid-x--3" class="grid grid-x">
4427
- <path d="M 297.908185 468.317269 L 297.908185 26.88 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4428
  </g>
4429
  <g id="line2d_3">
4430
  <g>
4431
- <use ns4:href="#mafb3703e5b" x="297.908185" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
4432
  </g>
4433
  </g>
4434
  <g id="text_3">
4435
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(265.460822 549.290582) rotate(-45)">cuda_B1_S1024_E2</text>
4436
  </g>
4437
  </g>
4438
  <g id="xtick_4">
4439
  <g id="grid-x--4" class="grid grid-x">
4440
- <path d="M 400.311668 468.317269 L 400.311668 26.88 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4441
  </g>
4442
  <g id="line2d_4">
4443
  <g>
4444
- <use ns4:href="#mafb3703e5b" x="400.311668" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
4445
  </g>
4446
  </g>
4447
  <g id="text_4">
4448
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(367.864305 549.290582) rotate(-45)">cuda_B1_S1024_E4</text>
4449
  </g>
4450
  </g>
4451
  <g id="xtick_5">
4452
  <g id="grid-x--5" class="grid grid-x">
4453
- <path d="M 502.71515 468.317269 L 502.71515 26.88 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4454
  </g>
4455
  <g id="line2d_5">
4456
  <g>
4457
- <use ns4:href="#mafb3703e5b" x="502.71515" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
4458
  </g>
4459
  </g>
4460
  <g id="text_5">
4461
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(472.517271 544.791615) rotate(-45)">cuda_B4_S512_E2</text>
4462
  </g>
4463
  </g>
4464
  <g id="xtick_6">
4465
  <g id="grid-x--6" class="grid grid-x">
4466
- <path d="M 605.118633 468.317269 L 605.118633 26.88 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4467
  </g>
4468
  <g id="line2d_6">
4469
  <g>
4470
- <use ns4:href="#mafb3703e5b" x="605.118633" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
4471
  </g>
4472
  </g>
4473
  <g id="text_6">
4474
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(574.920754 544.791615) rotate(-45)">cuda_B4_S512_E4</text>
4475
  </g>
4476
  </g>
4477
  <g id="xtick_7">
4478
  <g id="grid-x--7" class="grid grid-x">
4479
- <path d="M 707.522116 468.317269 L 707.522116 26.88 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4480
  </g>
4481
  <g id="line2d_7">
4482
  <g>
4483
- <use ns4:href="#mafb3703e5b" x="707.522116" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
4484
  </g>
4485
  </g>
4486
  <g id="text_7">
4487
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(675.074754 549.290582) rotate(-45)">cuda_B4_S1024_E2</text>
4488
  </g>
4489
  </g>
4490
  <g id="xtick_8">
4491
  <g id="grid-x--8" class="grid grid-x">
4492
- <path d="M 809.925599 468.317269 L 809.925599 26.88 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4493
  </g>
4494
  <g id="line2d_8">
4495
  <g>
4496
- <use ns4:href="#mafb3703e5b" x="809.925599" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
4497
  </g>
4498
  </g>
4499
  <g id="text_8">
4500
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(777.478237 549.290582) rotate(-45)">cuda_B4_S1024_E4</text>
4501
  </g>
4502
  </g>
4503
  <g id="label--x" class="xlabel">
4504
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="451.513409" y="562.556245" transform="rotate(-0 451.513409 562.556245)">Workload</text>
4505
  </g>
4506
  </g>
4507
  <g id="matplotlib.axis_2">
4508
  <g id="ytick_1">
4509
  <g id="grid-y--2" class="grid grid-y">
4510
- <path d="M 57.26 448.9249 L 845.766818 448.9249 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4511
  </g>
4512
  <g id="line2d_9">
4513
  <defs>
4514
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4515
  </defs>
4516
  <g>
4517
- <use ns4:href="#m0fca2865ba" x="57.26" y="448.9249" style="stroke: #000000; stroke-width: 0.8" />
4518
  </g>
4519
  </g>
4520
  <g id="text_9">
4521
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="452.724118" transform="rotate(-0 50.26 452.724118)">0</text>
4522
  </g>
4523
  </g>
4524
  <g id="ytick_2">
4525
  <g id="grid-y--3" class="grid grid-y">
4526
- <path d="M 57.26 398.737122 L 845.766818 398.737122 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4527
  </g>
4528
  <g id="line2d_10">
4529
  <g>
4530
- <use ns4:href="#m0fca2865ba" x="57.26" y="398.737122" style="stroke: #000000; stroke-width: 0.8" />
4531
  </g>
4532
  </g>
4533
  <g id="text_10">
4534
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="402.536341" transform="rotate(-0 50.26 402.536341)">200</text>
4535
  </g>
4536
  </g>
4537
  <g id="ytick_3">
4538
  <g id="grid-y--4" class="grid grid-y">
4539
- <path d="M 57.26 348.549345 L 845.766818 348.549345 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4540
  </g>
4541
  <g id="line2d_11">
4542
  <g>
4543
- <use ns4:href="#m0fca2865ba" x="57.26" y="348.549345" style="stroke: #000000; stroke-width: 0.8" />
4544
  </g>
4545
  </g>
4546
  <g id="text_11">
4547
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="352.348564" transform="rotate(-0 50.26 352.348564)">400</text>
4548
  </g>
4549
  </g>
4550
  <g id="ytick_4">
4551
  <g id="grid-y--5" class="grid grid-y">
4552
- <path d="M 57.26 298.361568 L 845.766818 298.361568 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4553
  </g>
4554
  <g id="line2d_12">
4555
  <g>
4556
- <use ns4:href="#m0fca2865ba" x="57.26" y="298.361568" style="stroke: #000000; stroke-width: 0.8" />
4557
  </g>
4558
  </g>
4559
  <g id="text_12">
4560
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="302.160786" transform="rotate(-0 50.26 302.160786)">600</text>
4561
  </g>
4562
  </g>
4563
  <g id="ytick_5">
4564
  <g id="grid-y--6" class="grid grid-y">
4565
- <path d="M 57.26 248.17379 L 845.766818 248.17379 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4566
  </g>
4567
  <g id="line2d_13">
4568
  <g>
4569
- <use ns4:href="#m0fca2865ba" x="57.26" y="248.17379" style="stroke: #000000; stroke-width: 0.8" />
4570
  </g>
4571
  </g>
4572
  <g id="text_13">
4573
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="251.973009" transform="rotate(-0 50.26 251.973009)">800</text>
4574
  </g>
4575
  </g>
4576
  <g id="ytick_6">
4577
  <g id="grid-y--7" class="grid grid-y">
4578
- <path d="M 57.26 197.986013 L 845.766818 197.986013 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4579
  </g>
4580
  <g id="line2d_14">
4581
  <g>
4582
- <use ns4:href="#m0fca2865ba" x="57.26" y="197.986013" style="stroke: #000000; stroke-width: 0.8" />
4583
  </g>
4584
  </g>
4585
  <g id="text_14">
4586
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="201.785232" transform="rotate(-0 50.26 201.785232)">1000</text>
4587
  </g>
4588
  </g>
4589
  <g id="ytick_7">
4590
  <g id="grid-y--8" class="grid grid-y">
4591
- <path d="M 57.26 147.798236 L 845.766818 147.798236 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4592
  </g>
4593
  <g id="line2d_15">
4594
  <g>
4595
- <use ns4:href="#m0fca2865ba" x="57.26" y="147.798236" style="stroke: #000000; stroke-width: 0.8" />
4596
  </g>
4597
  </g>
4598
  <g id="text_15">
4599
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="151.597454" transform="rotate(-0 50.26 151.597454)">1200</text>
4600
- </g>
4601
- </g>
4602
- <g id="ytick_8">
4603
- <g id="grid-y--9" class="grid grid-y">
4604
- <path d="M 57.26 97.610458 L 845.766818 97.610458 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4605
- </g>
4606
- <g id="line2d_16">
4607
- <g>
4608
- <use ns4:href="#m0fca2865ba" x="57.26" y="97.610458" style="stroke: #000000; stroke-width: 0.8" />
4609
- </g>
4610
- </g>
4611
- <g id="text_16">
4612
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="101.409677" transform="rotate(-0 50.26 101.409677)">1400</text>
4613
- </g>
4614
- </g>
4615
- <g id="ytick_9">
4616
- <g id="grid-y--10" class="grid grid-y">
4617
- <path d="M 57.26 47.422681 L 845.766818 47.422681 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4618
- </g>
4619
- <g id="line2d_17">
4620
- <g>
4621
- <use ns4:href="#m0fca2865ba" x="57.26" y="47.422681" style="stroke: #000000; stroke-width: 0.8" />
4622
- </g>
4623
- </g>
4624
- <g id="text_17">
4625
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="51.2219" transform="rotate(-0 50.26 51.2219)">1600</text>
4626
  </g>
4627
  </g>
4628
  <g id="label--y" class="ylabel">
4629
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="18.730313" y="247.598635" transform="rotate(-90 18.730313 247.598635)">Latency P50 (ms)</text>
4630
  </g>
4631
  </g>
4632
  <g id="series--binned-torch" class="series">
4633
- <path d="M 93.101219 410.112417 L 195.504702 398.361411 L 297.908185 356.674378 L 400.311668 350.006645 L 502.71515 264.168686 L 605.118633 248.454434 L 707.522116 76.647891 L 809.925599 46.94533 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4634
  <defs>
4635
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4636
  </defs>
4637
- <g clip-path="url(#pef1bcf59f7)">
4638
- <use ns4:href="#md7efaf3aec" x="93.101219" y="410.112417" style="fill: #1f77b4; stroke: #1f77b4" />
4639
- <use ns4:href="#md7efaf3aec" x="195.504702" y="398.361411" style="fill: #1f77b4; stroke: #1f77b4" />
4640
- <use ns4:href="#md7efaf3aec" x="297.908185" y="356.674378" style="fill: #1f77b4; stroke: #1f77b4" />
4641
- <use ns4:href="#md7efaf3aec" x="400.311668" y="350.006645" style="fill: #1f77b4; stroke: #1f77b4" />
4642
- <use ns4:href="#md7efaf3aec" x="502.71515" y="264.168686" style="fill: #1f77b4; stroke: #1f77b4" />
4643
- <use ns4:href="#md7efaf3aec" x="605.118633" y="248.454434" style="fill: #1f77b4; stroke: #1f77b4" />
4644
- <use ns4:href="#md7efaf3aec" x="707.522116" y="76.647891" style="fill: #1f77b4; stroke: #1f77b4" />
4645
- <use ns4:href="#md7efaf3aec" x="809.925599" y="46.94533" style="fill: #1f77b4; stroke: #1f77b4" />
4646
  </g>
4647
  </g>
4648
  <g id="series--gpt-oss-experts" class="series">
4649
- <path d="M 93.101219 448.251939 L 195.504702 447.944461 L 297.908185 447.961691 L 400.311668 447.593987 L 502.71515 447.219515 L 605.118633 447.053852 L 707.522116 445.574187 L 809.925599 445.573696 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4650
  <defs>
4651
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4652
  </defs>
4653
- <g clip-path="url(#pef1bcf59f7)">
4654
- <use ns4:href="#m9b8c54d372" x="93.101219" y="448.251939" style="fill: #ff7f0e; stroke: #ff7f0e" />
4655
- <use ns4:href="#m9b8c54d372" x="195.504702" y="447.944461" style="fill: #ff7f0e; stroke: #ff7f0e" />
4656
- <use ns4:href="#m9b8c54d372" x="297.908185" y="447.961691" style="fill: #ff7f0e; stroke: #ff7f0e" />
4657
- <use ns4:href="#m9b8c54d372" x="400.311668" y="447.593987" style="fill: #ff7f0e; stroke: #ff7f0e" />
4658
- <use ns4:href="#m9b8c54d372" x="502.71515" y="447.219515" style="fill: #ff7f0e; stroke: #ff7f0e" />
4659
- <use ns4:href="#m9b8c54d372" x="605.118633" y="447.053852" style="fill: #ff7f0e; stroke: #ff7f0e" />
4660
- <use ns4:href="#m9b8c54d372" x="707.522116" y="445.574187" style="fill: #ff7f0e; stroke: #ff7f0e" />
4661
- <use ns4:href="#m9b8c54d372" x="809.925599" y="445.573696" style="fill: #ff7f0e; stroke: #ff7f0e" />
4662
  </g>
4663
  </g>
4664
  <g id="patch_3">
4665
- <path d="M 57.26 468.317269 L 57.26 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4666
  </g>
4667
  <g id="patch_4">
4668
  <path d="M 845.766818 468.317269 L 845.766818 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4669
  </g>
4670
  <g id="patch_5">
4671
- <path d="M 57.26 468.317269 L 845.766818 468.317269 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4672
  </g>
4673
  <g id="patch_6">
4674
- <path d="M 57.26 26.88 L 845.766818 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4675
  </g>
4676
- <g id="text_18">
4677
- <text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="451.513409" y="20.88" transform="rotate(-0 451.513409 20.88)">Attention Implementation Latency</text>
4678
  </g>
4679
  <g id="legend" class="legend">
4680
  <g id="patch_7">
4681
- <path d="M 64.26 64.7925 L 177.05375 64.7925 Q 179.05375 64.7925 179.05375 62.7925 L 179.05375 33.88 Q 179.05375 31.88 177.05375 31.88 L 64.26 31.88 Q 62.26 31.88 62.26 33.88 L 62.26 62.7925 Q 62.26 64.7925 64.26 64.7925 L 64.26 64.7925 z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
4682
  </g>
4683
- <g id="line2d_18">
4684
- <path d="M 66.26 39.978438 L 76.26 39.978438 L 86.26 39.978438 " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4685
  <g>
4686
- <use ns4:href="#md7efaf3aec" x="76.26" y="39.978438" style="fill: #1f77b4; stroke: #1f77b4" />
4687
  </g>
4688
  </g>
4689
  <g id="legend-label--binned-torch" class="legend">
4690
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="94.26" y="43.478438" transform="rotate(-0 94.26 43.478438)">binned_torch</text>
4691
  </g>
4692
- <g id="line2d_19">
4693
- <path d="M 66.26 54.934687 L 76.26 54.934687 L 86.26 54.934687 " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4694
  <g>
4695
- <use ns4:href="#m9b8c54d372" x="76.26" y="54.934687" style="fill: #ff7f0e; stroke: #ff7f0e" />
4696
  </g>
4697
  </g>
4698
  <g id="legend-label--gpt-oss-experts" class="legend">
4699
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="94.26" y="58.434687" transform="rotate(-0 94.26 58.434687)">gpt_oss_experts</text>
4700
  </g>
4701
  </g>
4702
  </g>
4703
  </g>
4704
  <defs>
4705
- <clipPath id="pef1bcf59f7">
4706
- <rect x="57.26" y="26.88" width="788.506818" height="441.437269" />
4707
  </clipPath>
4708
  </defs>
4709
  </svg>
 
3874
  <div class="system-info">
3875
  <div class="system-info-header">Generated on:</div>
3876
  <div class="system-info-content">
3877
+ Linux x86_64 | Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35
3878
  </div>
3879
  </div>
3880
 
 
3889
  <rdf:RDF>
3890
  <ns2:Work>
3891
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
3892
+ <dc:date>2025-12-19T19:10:00.094905</dc:date>
3893
  <dc:format>image/svg+xml</dc:format>
3894
  <dc:creator>
3895
  <ns2:Agent>
3896
+ <dc:title>Matplotlib v3.10.8, https://matplotlib.org/</dc:title>
3897
  </ns2:Agent>
3898
  </dc:creator>
3899
  </ns2:Work>
 
3908
  </g>
3909
  <g id="axes--1" class="axes">
3910
  <g id="patch_2">
3911
+ <path d="M 57.17 468.317269 L 845.766818 468.317269 L 845.766818 26.88 L 57.17 26.88 L 57.17 468.317269 z " style="fill: none" />
3912
  </g>
3913
  <g id="matplotlib.axis_1">
3914
  <g id="xtick_1">
3915
  <g id="grid-x--1" class="grid grid-x">
3916
+ <path d="M 93.01531 468.317269 L 93.01531 26.88 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3917
  </g>
3918
  <g id="line2d_1">
3919
  <defs>
3920
  <path id="mafb3703e5b" d="M 0 0 L 0 3.5 " style="stroke: #000000; stroke-width: 0.8" />
3921
  </defs>
3922
  <g>
3923
+ <use ns4:href="#mafb3703e5b" x="93.01531" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
3924
  </g>
3925
  </g>
3926
  <g id="text_1">
3927
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(62.817431 544.791615) rotate(-45)">cuda_B1_S512_E2</text>
3928
  </g>
3929
  </g>
3930
  <g id="xtick_2">
3931
  <g id="grid-x--2" class="grid grid-x">
3932
+ <path d="M 195.430481 468.317269 L 195.430481 26.88 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3933
  </g>
3934
  <g id="line2d_2">
3935
  <g>
3936
+ <use ns4:href="#mafb3703e5b" x="195.430481" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
3937
  </g>
3938
  </g>
3939
  <g id="text_2">
3940
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(165.232602 544.791615) rotate(-45)">cuda_B1_S512_E4</text>
3941
  </g>
3942
  </g>
3943
  <g id="xtick_3">
3944
  <g id="grid-x--3" class="grid grid-x">
3945
+ <path d="M 297.845652 468.317269 L 297.845652 26.88 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3946
  </g>
3947
  <g id="line2d_3">
3948
  <g>
3949
+ <use ns4:href="#mafb3703e5b" x="297.845652" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
3950
  </g>
3951
  </g>
3952
  <g id="text_3">
3953
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(265.39829 549.290582) rotate(-45)">cuda_B1_S1024_E2</text>
3954
  </g>
3955
  </g>
3956
  <g id="xtick_4">
3957
  <g id="grid-x--4" class="grid grid-x">
3958
+ <path d="M 400.260823 468.317269 L 400.260823 26.88 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3959
  </g>
3960
  <g id="line2d_4">
3961
  <g>
3962
+ <use ns4:href="#mafb3703e5b" x="400.260823" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
3963
  </g>
3964
  </g>
3965
  <g id="text_4">
3966
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(367.813461 549.290582) rotate(-45)">cuda_B1_S1024_E4</text>
3967
  </g>
3968
  </g>
3969
  <g id="xtick_5">
3970
  <g id="grid-x--5" class="grid grid-x">
3971
+ <path d="M 502.675995 468.317269 L 502.675995 26.88 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3972
  </g>
3973
  <g id="line2d_5">
3974
  <g>
3975
+ <use ns4:href="#mafb3703e5b" x="502.675995" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
3976
  </g>
3977
  </g>
3978
  <g id="text_5">
3979
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(472.478116 544.791615) rotate(-45)">cuda_B4_S512_E2</text>
3980
  </g>
3981
  </g>
3982
  <g id="xtick_6">
3983
  <g id="grid-x--6" class="grid grid-x">
3984
+ <path d="M 605.091166 468.317269 L 605.091166 26.88 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3985
  </g>
3986
  <g id="line2d_6">
3987
  <g>
3988
+ <use ns4:href="#mafb3703e5b" x="605.091166" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
3989
  </g>
3990
  </g>
3991
  <g id="text_6">
3992
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(574.893287 544.791615) rotate(-45)">cuda_B4_S512_E4</text>
3993
  </g>
3994
  </g>
3995
  <g id="xtick_7">
3996
  <g id="grid-x--7" class="grid grid-x">
3997
+ <path d="M 707.506337 468.317269 L 707.506337 26.88 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3998
  </g>
3999
  <g id="line2d_7">
4000
  <g>
4001
+ <use ns4:href="#mafb3703e5b" x="707.506337" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
4002
  </g>
4003
  </g>
4004
  <g id="text_7">
4005
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(675.058975 549.290582) rotate(-45)">cuda_B4_S1024_E2</text>
4006
  </g>
4007
  </g>
4008
  <g id="xtick_8">
4009
  <g id="grid-x--8" class="grid grid-x">
4010
+ <path d="M 809.921508 468.317269 L 809.921508 26.88 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4011
  </g>
4012
  <g id="line2d_8">
4013
  <g>
4014
+ <use ns4:href="#mafb3703e5b" x="809.921508" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
4015
  </g>
4016
  </g>
4017
  <g id="text_8">
4018
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(777.474146 549.290582) rotate(-45)">cuda_B4_S1024_E4</text>
4019
  </g>
4020
  </g>
4021
  <g id="label--x" class="xlabel">
4022
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="451.468409" y="562.556245" transform="rotate(-0 451.468409 562.556245)">Workload</text>
4023
  </g>
4024
  </g>
4025
  <g id="matplotlib.axis_2">
4026
  <g id="ytick_1">
4027
  <g id="grid-y--2" class="grid grid-y">
4028
+ <path d="M 57.17 448.88374 L 845.766818 448.88374 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4029
  </g>
4030
  <g id="line2d_9">
4031
  <defs>
4032
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4033
  </defs>
4034
  <g>
4035
+ <use ns4:href="#m0fca2865ba" x="57.17" y="448.88374" style="stroke: #000000; stroke-width: 0.8" />
4036
  </g>
4037
  </g>
4038
  <g id="text_9">
4039
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="452.682959" transform="rotate(-0 50.17 452.682959)">0</text>
4040
  </g>
4041
  </g>
4042
  <g id="ytick_2">
4043
  <g id="grid-y--3" class="grid grid-y">
4044
+ <path d="M 57.17 388.304965 L 845.766818 388.304965 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4045
  </g>
4046
  <g id="line2d_10">
4047
  <g>
4048
+ <use ns4:href="#m0fca2865ba" x="57.17" y="388.304965" style="stroke: #000000; stroke-width: 0.8" />
4049
  </g>
4050
  </g>
4051
  <g id="text_10">
4052
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="392.104184" transform="rotate(-0 50.17 392.104184)">250</text>
4053
  </g>
4054
  </g>
4055
  <g id="ytick_3">
4056
  <g id="grid-y--4" class="grid grid-y">
4057
+ <path d="M 57.17 327.726191 L 845.766818 327.726191 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4058
  </g>
4059
  <g id="line2d_11">
4060
  <g>
4061
+ <use ns4:href="#m0fca2865ba" x="57.17" y="327.726191" style="stroke: #000000; stroke-width: 0.8" />
4062
  </g>
4063
  </g>
4064
  <g id="text_11">
4065
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="331.52541" transform="rotate(-0 50.17 331.52541)">500</text>
4066
  </g>
4067
  </g>
4068
  <g id="ytick_4">
4069
  <g id="grid-y--5" class="grid grid-y">
4070
+ <path d="M 57.17 267.147416 L 845.766818 267.147416 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4071
  </g>
4072
  <g id="line2d_12">
4073
  <g>
4074
+ <use ns4:href="#m0fca2865ba" x="57.17" y="267.147416" style="stroke: #000000; stroke-width: 0.8" />
4075
  </g>
4076
  </g>
4077
  <g id="text_12">
4078
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="270.946635" transform="rotate(-0 50.17 270.946635)">750</text>
4079
  </g>
4080
  </g>
4081
  <g id="ytick_5">
4082
  <g id="grid-y--6" class="grid grid-y">
4083
+ <path d="M 57.17 206.568642 L 845.766818 206.568642 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4084
  </g>
4085
  <g id="line2d_13">
4086
  <g>
4087
+ <use ns4:href="#m0fca2865ba" x="57.17" y="206.568642" style="stroke: #000000; stroke-width: 0.8" />
4088
  </g>
4089
  </g>
4090
  <g id="text_13">
4091
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="210.367861" transform="rotate(-0 50.17 210.367861)">1000</text>
4092
  </g>
4093
  </g>
4094
  <g id="ytick_6">
4095
  <g id="grid-y--7" class="grid grid-y">
4096
+ <path d="M 57.17 145.989867 L 845.766818 145.989867 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4097
  </g>
4098
  <g id="line2d_14">
4099
  <g>
4100
+ <use ns4:href="#m0fca2865ba" x="57.17" y="145.989867" style="stroke: #000000; stroke-width: 0.8" />
4101
  </g>
4102
  </g>
4103
  <g id="text_14">
4104
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="149.789086" transform="rotate(-0 50.17 149.789086)">1250</text>
4105
  </g>
4106
  </g>
4107
  <g id="ytick_7">
4108
  <g id="grid-y--8" class="grid grid-y">
4109
+ <path d="M 57.17 85.411093 L 845.766818 85.411093 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4110
  </g>
4111
  <g id="line2d_15">
4112
  <g>
4113
+ <use ns4:href="#m0fca2865ba" x="57.17" y="85.411093" style="stroke: #000000; stroke-width: 0.8" />
4114
  </g>
4115
  </g>
4116
  <g id="text_15">
4117
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="89.210312" transform="rotate(-0 50.17 89.210312)">1500</text>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4118
  </g>
4119
  </g>
4120
  <g id="label--y" class="ylabel">
4121
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="18.640312" y="247.598635" transform="rotate(-90 18.640312 247.598635)">Latency P50 (ms)</text>
4122
  </g>
4123
  </g>
4124
  <g id="series--binned-torch" class="series">
4125
+ <path d="M 93.01531 410.663437 L 195.430481 399.252405 L 297.845652 356.001516 L 400.260823 346.76844 L 502.675995 261.404682 L 605.091166 245.335532 L 707.506337 82.088892 L 809.921508 46.94533 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4126
  <defs>
4127
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4128
  </defs>
4129
+ <g clip-path="url(#p5307ca50d8)">
4130
+ <use ns4:href="#md7efaf3aec" x="93.01531" y="410.663437" style="fill: #1f77b4; stroke: #1f77b4" />
4131
+ <use ns4:href="#md7efaf3aec" x="195.430481" y="399.252405" style="fill: #1f77b4; stroke: #1f77b4" />
4132
+ <use ns4:href="#md7efaf3aec" x="297.845652" y="356.001516" style="fill: #1f77b4; stroke: #1f77b4" />
4133
+ <use ns4:href="#md7efaf3aec" x="400.260823" y="346.76844" style="fill: #1f77b4; stroke: #1f77b4" />
4134
+ <use ns4:href="#md7efaf3aec" x="502.675995" y="261.404682" style="fill: #1f77b4; stroke: #1f77b4" />
4135
+ <use ns4:href="#md7efaf3aec" x="605.091166" y="245.335532" style="fill: #1f77b4; stroke: #1f77b4" />
4136
+ <use ns4:href="#md7efaf3aec" x="707.506337" y="82.088892" style="fill: #1f77b4; stroke: #1f77b4" />
4137
+ <use ns4:href="#md7efaf3aec" x="809.921508" y="46.94533" style="fill: #1f77b4; stroke: #1f77b4" />
4138
  </g>
4139
  </g>
4140
  <g id="series--gpt-oss-experts" class="series">
4141
+ <path d="M 93.01531 448.251939 L 195.430481 447.950554 L 297.845652 447.969402 L 400.260823 447.623067 L 502.675995 447.274263 L 605.091166 447.115423 L 707.506337 445.70441 L 809.921508 445.680583 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4142
  <defs>
4143
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4144
  </defs>
4145
+ <g clip-path="url(#p5307ca50d8)">
4146
+ <use ns4:href="#m9b8c54d372" x="93.01531" y="448.251939" style="fill: #ff7f0e; stroke: #ff7f0e" />
4147
+ <use ns4:href="#m9b8c54d372" x="195.430481" y="447.950554" style="fill: #ff7f0e; stroke: #ff7f0e" />
4148
+ <use ns4:href="#m9b8c54d372" x="297.845652" y="447.969402" style="fill: #ff7f0e; stroke: #ff7f0e" />
4149
+ <use ns4:href="#m9b8c54d372" x="400.260823" y="447.623067" style="fill: #ff7f0e; stroke: #ff7f0e" />
4150
+ <use ns4:href="#m9b8c54d372" x="502.675995" y="447.274263" style="fill: #ff7f0e; stroke: #ff7f0e" />
4151
+ <use ns4:href="#m9b8c54d372" x="605.091166" y="447.115423" style="fill: #ff7f0e; stroke: #ff7f0e" />
4152
+ <use ns4:href="#m9b8c54d372" x="707.506337" y="445.70441" style="fill: #ff7f0e; stroke: #ff7f0e" />
4153
+ <use ns4:href="#m9b8c54d372" x="809.921508" y="445.680583" style="fill: #ff7f0e; stroke: #ff7f0e" />
4154
  </g>
4155
  </g>
4156
  <g id="patch_3">
4157
+ <path d="M 57.17 468.317269 L 57.17 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4158
  </g>
4159
  <g id="patch_4">
4160
  <path d="M 845.766818 468.317269 L 845.766818 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4161
  </g>
4162
  <g id="patch_5">
4163
+ <path d="M 57.17 468.317269 L 845.766818 468.317269 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4164
  </g>
4165
  <g id="patch_6">
4166
+ <path d="M 57.17 26.88 L 845.766818 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4167
  </g>
4168
+ <g id="text_16">
4169
+ <text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="451.468409" y="20.88" transform="rotate(-0 451.468409 20.88)">Attention Implementation Latency</text>
4170
  </g>
4171
  <g id="legend" class="legend">
4172
  <g id="patch_7">
4173
+ <path d="M 64.17 64.7925 L 176.96375 64.7925 Q 178.96375 64.7925 178.96375 62.7925 L 178.96375 33.88 Q 178.96375 31.88 176.96375 31.88 L 64.17 31.88 Q 62.17 31.88 62.17 33.88 L 62.17 62.7925 Q 62.17 64.7925 64.17 64.7925 L 64.17 64.7925 z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
4174
  </g>
4175
+ <g id="line2d_16">
4176
+ <path d="M 66.17 39.978438 L 76.17 39.978438 L 86.17 39.978438 " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4177
  <g>
4178
+ <use ns4:href="#md7efaf3aec" x="76.17" y="39.978438" style="fill: #1f77b4; stroke: #1f77b4" />
4179
  </g>
4180
  </g>
4181
  <g id="legend-label--binned-torch" class="legend">
4182
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="94.17" y="43.478438" transform="rotate(-0 94.17 43.478438)">binned_torch</text>
4183
  </g>
4184
+ <g id="line2d_17">
4185
+ <path d="M 66.17 54.934687 L 76.17 54.934687 L 86.17 54.934687 " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4186
  <g>
4187
+ <use ns4:href="#m9b8c54d372" x="76.17" y="54.934687" style="fill: #ff7f0e; stroke: #ff7f0e" />
4188
  </g>
4189
  </g>
4190
  <g id="legend-label--gpt-oss-experts" class="legend">
4191
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="94.17" y="58.434687" transform="rotate(-0 94.17 58.434687)">gpt_oss_experts</text>
4192
  </g>
4193
  </g>
4194
  </g>
4195
  </g>
4196
  <defs>
4197
+ <clipPath id="p5307ca50d8">
4198
+ <rect x="57.17" y="26.88" width="788.596818" height="441.437269" />
4199
  </clipPath>
4200
  </defs>
4201
  </svg>
 
4208
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
4209
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
4210
  </span> |
4211
+ Cell: combine | 4.53s
4212
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
4213
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
4214
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
 
4297
  COMBINED BENCHMARK SUMMARY
4298
 
4299
  impl wl p50(ms) ok
4300
+ binned_torch cuda_B1_S1024_E2 383.31 True
4301
+ binned_torch cuda_B1_S1024_E4 421.42 True
4302
+ binned_torch cuda_B1_S512_E2 157.73 True
4303
+ binned_torch cuda_B1_S512_E4 204.82 True
4304
+ binned_torch cuda_B4_S1024_E2 1513.71 True
4305
+ binned_torch cuda_B4_S1024_E4 1658.74 True
4306
+ binned_torch cuda_B4_S512_E2 773.70 True
4307
+ binned_torch cuda_B4_S512_E4 840.01 True
4308
+ gpt_oss_experts cuda_B1_S1024_E2 3.77 True
4309
+ gpt_oss_experts cuda_B1_S1024_E4 5.20 True
4310
+ gpt_oss_experts cuda_B1_S512_E2 2.61 True
4311
+ gpt_oss_experts cuda_B1_S512_E4 3.85 True
4312
+ gpt_oss_experts cuda_B4_S1024_E2 13.12 True
4313
+ gpt_oss_experts cuda_B4_S1024_E4 13.22 True
4314
+ gpt_oss_experts cuda_B4_S512_E2 6.64 True
4315
+ gpt_oss_experts cuda_B4_S512_E4 7.30 True
4316
 
4317
  GENERATING COMBINED VISUALIZATION
4318
 
 
4332
  <div class="uv-install-logs" id="uv-logs-combine">
4333
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4334
  <div class="uv-logs-content" style="display: none;">
4335
+ Installed 37 packages in 270ms
4336
  </div>
4337
  </div>
4338
  <div class="cell-artifacts">
 
4345
  <rdf:RDF>
4346
  <ns2:Work>
4347
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4348
+ <dc:date>2025-12-19T19:10:00.094905</dc:date>
4349
  <dc:format>image/svg+xml</dc:format>
4350
  <dc:creator>
4351
  <ns2:Agent>
4352
+ <dc:title>Matplotlib v3.10.8, https://matplotlib.org/</dc:title>
4353
  </ns2:Agent>
4354
  </dc:creator>
4355
  </ns2:Work>
 
4364
  </g>
4365
  <g id="axes--1" class="axes">
4366
  <g id="patch_2">
4367
+ <path d="M 57.17 468.317269 L 845.766818 468.317269 L 845.766818 26.88 L 57.17 26.88 L 57.17 468.317269 z " style="fill: none" />
4368
  </g>
4369
  <g id="matplotlib.axis_1">
4370
  <g id="xtick_1">
4371
  <g id="grid-x--1" class="grid grid-x">
4372
+ <path d="M 93.01531 468.317269 L 93.01531 26.88 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4373
  </g>
4374
  <g id="line2d_1">
4375
  <defs>
4376
  <path id="mafb3703e5b" d="M 0 0 L 0 3.5 " style="stroke: #000000; stroke-width: 0.8" />
4377
  </defs>
4378
  <g>
4379
+ <use ns4:href="#mafb3703e5b" x="93.01531" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
4380
  </g>
4381
  </g>
4382
  <g id="text_1">
4383
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(62.817431 544.791615) rotate(-45)">cuda_B1_S512_E2</text>
4384
  </g>
4385
  </g>
4386
  <g id="xtick_2">
4387
  <g id="grid-x--2" class="grid grid-x">
4388
+ <path d="M 195.430481 468.317269 L 195.430481 26.88 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4389
  </g>
4390
  <g id="line2d_2">
4391
  <g>
4392
+ <use ns4:href="#mafb3703e5b" x="195.430481" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
4393
  </g>
4394
  </g>
4395
  <g id="text_2">
4396
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(165.232602 544.791615) rotate(-45)">cuda_B1_S512_E4</text>
4397
  </g>
4398
  </g>
4399
  <g id="xtick_3">
4400
  <g id="grid-x--3" class="grid grid-x">
4401
+ <path d="M 297.845652 468.317269 L 297.845652 26.88 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4402
  </g>
4403
  <g id="line2d_3">
4404
  <g>
4405
+ <use ns4:href="#mafb3703e5b" x="297.845652" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
4406
  </g>
4407
  </g>
4408
  <g id="text_3">
4409
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(265.39829 549.290582) rotate(-45)">cuda_B1_S1024_E2</text>
4410
  </g>
4411
  </g>
4412
  <g id="xtick_4">
4413
  <g id="grid-x--4" class="grid grid-x">
4414
+ <path d="M 400.260823 468.317269 L 400.260823 26.88 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4415
  </g>
4416
  <g id="line2d_4">
4417
  <g>
4418
+ <use ns4:href="#mafb3703e5b" x="400.260823" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
4419
  </g>
4420
  </g>
4421
  <g id="text_4">
4422
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(367.813461 549.290582) rotate(-45)">cuda_B1_S1024_E4</text>
4423
  </g>
4424
  </g>
4425
  <g id="xtick_5">
4426
  <g id="grid-x--5" class="grid grid-x">
4427
+ <path d="M 502.675995 468.317269 L 502.675995 26.88 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4428
  </g>
4429
  <g id="line2d_5">
4430
  <g>
4431
+ <use ns4:href="#mafb3703e5b" x="502.675995" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
4432
  </g>
4433
  </g>
4434
  <g id="text_5">
4435
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(472.478116 544.791615) rotate(-45)">cuda_B4_S512_E2</text>
4436
  </g>
4437
  </g>
4438
  <g id="xtick_6">
4439
  <g id="grid-x--6" class="grid grid-x">
4440
+ <path d="M 605.091166 468.317269 L 605.091166 26.88 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4441
  </g>
4442
  <g id="line2d_6">
4443
  <g>
4444
+ <use ns4:href="#mafb3703e5b" x="605.091166" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
4445
  </g>
4446
  </g>
4447
  <g id="text_6">
4448
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(574.893287 544.791615) rotate(-45)">cuda_B4_S512_E4</text>
4449
  </g>
4450
  </g>
4451
  <g id="xtick_7">
4452
  <g id="grid-x--7" class="grid grid-x">
4453
+ <path d="M 707.506337 468.317269 L 707.506337 26.88 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4454
  </g>
4455
  <g id="line2d_7">
4456
  <g>
4457
+ <use ns4:href="#mafb3703e5b" x="707.506337" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
4458
  </g>
4459
  </g>
4460
  <g id="text_7">
4461
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(675.058975 549.290582) rotate(-45)">cuda_B4_S1024_E2</text>
4462
  </g>
4463
  </g>
4464
  <g id="xtick_8">
4465
  <g id="grid-x--8" class="grid grid-x">
4466
+ <path d="M 809.921508 468.317269 L 809.921508 26.88 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4467
  </g>
4468
  <g id="line2d_8">
4469
  <g>
4470
+ <use ns4:href="#mafb3703e5b" x="809.921508" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
4471
  </g>
4472
  </g>
4473
  <g id="text_8">
4474
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(777.474146 549.290582) rotate(-45)">cuda_B4_S1024_E4</text>
4475
  </g>
4476
  </g>
4477
  <g id="label--x" class="xlabel">
4478
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="451.468409" y="562.556245" transform="rotate(-0 451.468409 562.556245)">Workload</text>
4479
  </g>
4480
  </g>
4481
  <g id="matplotlib.axis_2">
4482
  <g id="ytick_1">
4483
  <g id="grid-y--2" class="grid grid-y">
4484
+ <path d="M 57.17 448.88374 L 845.766818 448.88374 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4485
  </g>
4486
  <g id="line2d_9">
4487
  <defs>
4488
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4489
  </defs>
4490
  <g>
4491
+ <use ns4:href="#m0fca2865ba" x="57.17" y="448.88374" style="stroke: #000000; stroke-width: 0.8" />
4492
  </g>
4493
  </g>
4494
  <g id="text_9">
4495
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="452.682959" transform="rotate(-0 50.17 452.682959)">0</text>
4496
  </g>
4497
  </g>
4498
  <g id="ytick_2">
4499
  <g id="grid-y--3" class="grid grid-y">
4500
+ <path d="M 57.17 388.304965 L 845.766818 388.304965 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4501
  </g>
4502
  <g id="line2d_10">
4503
  <g>
4504
+ <use ns4:href="#m0fca2865ba" x="57.17" y="388.304965" style="stroke: #000000; stroke-width: 0.8" />
4505
  </g>
4506
  </g>
4507
  <g id="text_10">
4508
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="392.104184" transform="rotate(-0 50.17 392.104184)">250</text>
4509
  </g>
4510
  </g>
4511
  <g id="ytick_3">
4512
  <g id="grid-y--4" class="grid grid-y">
4513
+ <path d="M 57.17 327.726191 L 845.766818 327.726191 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4514
  </g>
4515
  <g id="line2d_11">
4516
  <g>
4517
+ <use ns4:href="#m0fca2865ba" x="57.17" y="327.726191" style="stroke: #000000; stroke-width: 0.8" />
4518
  </g>
4519
  </g>
4520
  <g id="text_11">
4521
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="331.52541" transform="rotate(-0 50.17 331.52541)">500</text>
4522
  </g>
4523
  </g>
4524
  <g id="ytick_4">
4525
  <g id="grid-y--5" class="grid grid-y">
4526
+ <path d="M 57.17 267.147416 L 845.766818 267.147416 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4527
  </g>
4528
  <g id="line2d_12">
4529
  <g>
4530
+ <use ns4:href="#m0fca2865ba" x="57.17" y="267.147416" style="stroke: #000000; stroke-width: 0.8" />
4531
  </g>
4532
  </g>
4533
  <g id="text_12">
4534
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="270.946635" transform="rotate(-0 50.17 270.946635)">750</text>
4535
  </g>
4536
  </g>
4537
  <g id="ytick_5">
4538
  <g id="grid-y--6" class="grid grid-y">
4539
+ <path d="M 57.17 206.568642 L 845.766818 206.568642 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4540
  </g>
4541
  <g id="line2d_13">
4542
  <g>
4543
+ <use ns4:href="#m0fca2865ba" x="57.17" y="206.568642" style="stroke: #000000; stroke-width: 0.8" />
4544
  </g>
4545
  </g>
4546
  <g id="text_13">
4547
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="210.367861" transform="rotate(-0 50.17 210.367861)">1000</text>
4548
  </g>
4549
  </g>
4550
  <g id="ytick_6">
4551
  <g id="grid-y--7" class="grid grid-y">
4552
+ <path d="M 57.17 145.989867 L 845.766818 145.989867 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4553
  </g>
4554
  <g id="line2d_14">
4555
  <g>
4556
+ <use ns4:href="#m0fca2865ba" x="57.17" y="145.989867" style="stroke: #000000; stroke-width: 0.8" />
4557
  </g>
4558
  </g>
4559
  <g id="text_14">
4560
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="149.789086" transform="rotate(-0 50.17 149.789086)">1250</text>
4561
  </g>
4562
  </g>
4563
  <g id="ytick_7">
4564
  <g id="grid-y--8" class="grid grid-y">
4565
+ <path d="M 57.17 85.411093 L 845.766818 85.411093 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4566
  </g>
4567
  <g id="line2d_15">
4568
  <g>
4569
+ <use ns4:href="#m0fca2865ba" x="57.17" y="85.411093" style="stroke: #000000; stroke-width: 0.8" />
4570
  </g>
4571
  </g>
4572
  <g id="text_15">
4573
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="89.210312" transform="rotate(-0 50.17 89.210312)">1500</text>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4574
  </g>
4575
  </g>
4576
  <g id="label--y" class="ylabel">
4577
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="18.640312" y="247.598635" transform="rotate(-90 18.640312 247.598635)">Latency P50 (ms)</text>
4578
  </g>
4579
  </g>
4580
  <g id="series--binned-torch" class="series">
4581
+ <path d="M 93.01531 410.663437 L 195.430481 399.252405 L 297.845652 356.001516 L 400.260823 346.76844 L 502.675995 261.404682 L 605.091166 245.335532 L 707.506337 82.088892 L 809.921508 46.94533 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4582
  <defs>
4583
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4584
  </defs>
4585
+ <g clip-path="url(#p5307ca50d8)">
4586
+ <use ns4:href="#md7efaf3aec" x="93.01531" y="410.663437" style="fill: #1f77b4; stroke: #1f77b4" />
4587
+ <use ns4:href="#md7efaf3aec" x="195.430481" y="399.252405" style="fill: #1f77b4; stroke: #1f77b4" />
4588
+ <use ns4:href="#md7efaf3aec" x="297.845652" y="356.001516" style="fill: #1f77b4; stroke: #1f77b4" />
4589
+ <use ns4:href="#md7efaf3aec" x="400.260823" y="346.76844" style="fill: #1f77b4; stroke: #1f77b4" />
4590
+ <use ns4:href="#md7efaf3aec" x="502.675995" y="261.404682" style="fill: #1f77b4; stroke: #1f77b4" />
4591
+ <use ns4:href="#md7efaf3aec" x="605.091166" y="245.335532" style="fill: #1f77b4; stroke: #1f77b4" />
4592
+ <use ns4:href="#md7efaf3aec" x="707.506337" y="82.088892" style="fill: #1f77b4; stroke: #1f77b4" />
4593
+ <use ns4:href="#md7efaf3aec" x="809.921508" y="46.94533" style="fill: #1f77b4; stroke: #1f77b4" />
4594
  </g>
4595
  </g>
4596
  <g id="series--gpt-oss-experts" class="series">
4597
+ <path d="M 93.01531 448.251939 L 195.430481 447.950554 L 297.845652 447.969402 L 400.260823 447.623067 L 502.675995 447.274263 L 605.091166 447.115423 L 707.506337 445.70441 L 809.921508 445.680583 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4598
  <defs>
4599
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4600
  </defs>
4601
+ <g clip-path="url(#p5307ca50d8)">
4602
+ <use ns4:href="#m9b8c54d372" x="93.01531" y="448.251939" style="fill: #ff7f0e; stroke: #ff7f0e" />
4603
+ <use ns4:href="#m9b8c54d372" x="195.430481" y="447.950554" style="fill: #ff7f0e; stroke: #ff7f0e" />
4604
+ <use ns4:href="#m9b8c54d372" x="297.845652" y="447.969402" style="fill: #ff7f0e; stroke: #ff7f0e" />
4605
+ <use ns4:href="#m9b8c54d372" x="400.260823" y="447.623067" style="fill: #ff7f0e; stroke: #ff7f0e" />
4606
+ <use ns4:href="#m9b8c54d372" x="502.675995" y="447.274263" style="fill: #ff7f0e; stroke: #ff7f0e" />
4607
+ <use ns4:href="#m9b8c54d372" x="605.091166" y="447.115423" style="fill: #ff7f0e; stroke: #ff7f0e" />
4608
+ <use ns4:href="#m9b8c54d372" x="707.506337" y="445.70441" style="fill: #ff7f0e; stroke: #ff7f0e" />
4609
+ <use ns4:href="#m9b8c54d372" x="809.921508" y="445.680583" style="fill: #ff7f0e; stroke: #ff7f0e" />
4610
  </g>
4611
  </g>
4612
  <g id="patch_3">
4613
+ <path d="M 57.17 468.317269 L 57.17 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4614
  </g>
4615
  <g id="patch_4">
4616
  <path d="M 845.766818 468.317269 L 845.766818 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4617
  </g>
4618
  <g id="patch_5">
4619
+ <path d="M 57.17 468.317269 L 845.766818 468.317269 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4620
  </g>
4621
  <g id="patch_6">
4622
+ <path d="M 57.17 26.88 L 845.766818 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4623
  </g>
4624
+ <g id="text_16">
4625
+ <text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="451.468409" y="20.88" transform="rotate(-0 451.468409 20.88)">Attention Implementation Latency</text>
4626
  </g>
4627
  <g id="legend" class="legend">
4628
  <g id="patch_7">
4629
+ <path d="M 64.17 64.7925 L 176.96375 64.7925 Q 178.96375 64.7925 178.96375 62.7925 L 178.96375 33.88 Q 178.96375 31.88 176.96375 31.88 L 64.17 31.88 Q 62.17 31.88 62.17 33.88 L 62.17 62.7925 Q 62.17 64.7925 64.17 64.7925 L 64.17 64.7925 z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
4630
  </g>
4631
+ <g id="line2d_16">
4632
+ <path d="M 66.17 39.978438 L 76.17 39.978438 L 86.17 39.978438 " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4633
  <g>
4634
+ <use ns4:href="#md7efaf3aec" x="76.17" y="39.978438" style="fill: #1f77b4; stroke: #1f77b4" />
4635
  </g>
4636
  </g>
4637
  <g id="legend-label--binned-torch" class="legend">
4638
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="94.17" y="43.478438" transform="rotate(-0 94.17 43.478438)">binned_torch</text>
4639
  </g>
4640
+ <g id="line2d_17">
4641
+ <path d="M 66.17 54.934687 L 76.17 54.934687 L 86.17 54.934687 " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4642
  <g>
4643
+ <use ns4:href="#m9b8c54d372" x="76.17" y="54.934687" style="fill: #ff7f0e; stroke: #ff7f0e" />
4644
  </g>
4645
  </g>
4646
  <g id="legend-label--gpt-oss-experts" class="legend">
4647
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="94.17" y="58.434687" transform="rotate(-0 94.17 58.434687)">gpt_oss_experts</text>
4648
  </g>
4649
  </g>
4650
  </g>
4651
  </g>
4652
  <defs>
4653
+ <clipPath id="p5307ca50d8">
4654
+ <rect x="57.17" y="26.88" width="788.596818" height="441.437269" />
4655
  </clipPath>
4656
  </defs>
4657
  </svg>
rotary/impls/artifacts/benchmark/rotary.jsonl CHANGED
@@ -1,24 +1,24 @@
1
- {"ts": "2025-11-10T21:58:55Z", "run": "3078eb3fdd0247809e806075fdf7ee85", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H8_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.07309099999019963, "p50": 0.07444199997053147, "p90": 0.07482099999833736, "mean": 0.07456319998482286, "iqr": 0.00039000002516331733, "raw_times": [0.07443099997317404, 0.0760309999918718, 0.07482099999833736, 0.07444199997053147, 0.07309099999019963], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08203099997672325, "peak_bytes": 3178496, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.590452924915553e-08, "mae_k": 1.5487040982975486e-08, "mse_q": 2.5241010080938753e-15, "mse_k": 2.364223539299626e-15, "ref": "rotary_torch"}, "err": null}
2
- {"ts": "2025-11-10T21:58:55Z", "run": "3078eb3fdd0247809e806075fdf7ee85", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H8_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09060200000021723, "p50": 0.09103200000026845, "p90": 0.09151199998314041, "mean": 0.09118959999341314, "iqr": 0.0008709999974598759, "raw_times": [0.09060200000021723, 0.09216099999775906, 0.09103200000026845, 0.09151199998314041, 0.09064099998568054], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09373199998208293, "peak_bytes": 6356992, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5508486939097565e-08, "mae_k": 1.567566698668088e-08, "mse_q": 2.3630110116356316e-15, "mse_k": 2.416562128626943e-15, "ref": "rotary_torch"}, "err": null}
3
- {"ts": "2025-11-10T21:58:55Z", "run": "3078eb3fdd0247809e806075fdf7ee85", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H32_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08705100003680855, "p50": 0.0875320000091051, "p90": 0.08769100003291896, "mean": 0.08876720002035654, "iqr": 0.0002699999868127634, "raw_times": [0.0874210000461062, 0.09414099997684389, 0.08705100003680855, 0.08769100003291896, 0.0875320000091051], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09279099998593665, "peak_bytes": 12615680, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5856898016863852e-08, "mae_k": 1.572981211950264e-08, "mse_q": 2.4771055025978386e-15, "mse_k": 2.4544071371937915e-15, "ref": "rotary_torch"}, "err": null}
4
- {"ts": "2025-11-10T21:58:55Z", "run": "3078eb3fdd0247809e806075fdf7ee85", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H32_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08560100002341642, "p50": 0.08801100000255246, "p90": 0.08860200000526675, "mean": 0.08908540002039445, "iqr": 0.0012109999829590379, "raw_times": [0.08739100002230771, 0.08801100000255246, 0.08860200000526675, 0.09582200004842889, 0.08560100002341642], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09329199997409887, "peak_bytes": 25231360, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5617658277733426e-08, "mae_k": 1.5788685914230882e-08, "mse_q": 2.4549424620164562e-15, "mse_k": 2.492823469483563e-15, "ref": "rotary_torch"}, "err": null}
5
- {"ts": "2025-11-10T21:58:55Z", "run": "3078eb3fdd0247809e806075fdf7ee85", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H8_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08852199999864752, "p50": 0.08866100000659571, "p90": 0.08963200002654048, "mean": 0.08911940001326002, "iqr": 0.0010109999948326731, "raw_times": [0.08963200002654048, 0.08862100003170781, 0.08852199999864752, 0.08866100000659571, 0.09016100000280858], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0921010000070055, "peak_bytes": 12779520, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5962712041073246e-08, "mae_k": 1.5743363945830424e-08, "mse_q": 2.534145124782417e-15, "mse_k": 2.451281585618423e-15, "ref": "rotary_torch"}, "err": null}
6
- {"ts": "2025-11-10T21:58:55Z", "run": "3078eb3fdd0247809e806075fdf7ee85", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H8_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08753199995226169, "p50": 0.08826099997349957, "p90": 0.08928100004368389, "mean": 0.08895959999790648, "iqr": 0.001079000014669873, "raw_times": [0.08753199995226169, 0.08928100004368389, 0.08820200002901402, 0.09152199999107324, 0.08826099997349957], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0922809999792662, "peak_bytes": 25427968, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.578730035589615e-08, "mae_k": 1.5859711766097462e-08, "mse_q": 2.440287521479536e-15, "mse_k": 2.477901290051784e-15, "ref": "rotary_torch"}, "err": null}
7
- {"ts": "2025-11-10T21:58:55Z", "run": "3078eb3fdd0247809e806075fdf7ee85", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H32_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08700099999714439, "p50": 0.08810100001710452, "p90": 0.08876099997223719, "mean": 0.08815519998961463, "iqr": 0.0012099999935344385, "raw_times": [0.08700099999714439, 0.0893619999828843, 0.08876099997223719, 0.08810100001710452, 0.08755099997870275], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09151099999371581, "peak_bytes": 50462720, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5775295736375483e-08, "mae_k": 1.5847881229547056e-08, "mse_q": 2.471039476146077e-15, "mse_k": 2.472378635235686e-15, "ref": "rotary_torch"}, "err": null}
8
- {"ts": "2025-11-10T21:58:55Z", "run": "3078eb3fdd0247809e806075fdf7ee85", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H32_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08636100000103397, "p50": 0.08706099998789796, "p90": 0.0880219999999099, "mean": 0.08728360000986868, "iqr": 0.0015599999869664316, "raw_times": [0.08646200001294346, 0.0885120000475581, 0.0880219999999099, 0.08706099998789796, 0.08636100000103397], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09124100000690305, "peak_bytes": 100925440, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5959869870130206e-08, "mae_k": 1.588083975434529e-08, "mse_q": 2.510663677418633e-15, "mse_k": 2.502786271009168e-15, "ref": "rotary_torch"}, "err": null}
9
- {"ts": "2025-11-10T21:58:55Z", "run": "3078eb3fdd0247809e806075fdf7ee85", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H8_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08730199999718025, "p50": 0.08869100003039421, "p90": 0.09006199996974829, "mean": 0.08888559999604695, "iqr": 0.0023309999619414157, "raw_times": [0.08773100000780687, 0.09064199997510514, 0.09006199996974829, 0.08730199999718025, 0.08869100003039421], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09268100001236235, "peak_bytes": 51118080, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5894533689220225e-08, "mae_k": 1.5873395042831362e-08, "mse_q": 2.5093181655819197e-15, "mse_k": 2.488611809911578e-15, "ref": "rotary_torch"}, "err": null}
10
- {"ts": "2025-11-10T21:58:55Z", "run": "3078eb3fdd0247809e806075fdf7ee85", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H8_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08675099996935387, "p50": 0.08854100002508858, "p90": 0.08863200002906524, "mean": 0.08941120000827141, "iqr": 0.00029099999210302485, "raw_times": [0.09479099998088714, 0.08863200002906524, 0.08854100002508858, 0.08675099996935387, 0.08834100003696221], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09256099997401179, "peak_bytes": 101711872, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5936768349433805e-08, "mae_k": 1.5960043953100467e-08, "mse_q": 2.51039008577667e-15, "mse_k": 2.5111253103748867e-15, "ref": "rotary_torch"}, "err": null}
11
- {"ts": "2025-11-10T21:58:55Z", "run": "3078eb3fdd0247809e806075fdf7ee85", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H32_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08763099998532198, "p50": 0.08916199999475793, "p90": 0.08947200001330202, "mean": 0.08909940000876304, "iqr": 0.000891000013325538, "raw_times": [0.08947200001330202, 0.08763099998532198, 0.08916199999475793, 0.09065100005045679, 0.08858099999997648], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0931619999846589, "peak_bytes": 201850880, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 1.9073486328125e-06, "absmax_k": 9.5367431640625e-07, "mae_q": 1.586510300910504e-08, "mae_k": 1.5813935050346117e-08, "mse_q": 2.499836478770355e-15, "mse_k": 2.4755639026338358e-15, "ref": "rotary_torch"}, "err": null}
12
- {"ts": "2025-11-10T21:58:55Z", "run": "3078eb3fdd0247809e806075fdf7ee85", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H32_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2592540000136978, "p50": 0.2617740000232516, "p90": 0.2619539999955123, "mean": 0.2612200000157827, "iqr": 0.0011099999710495467, "raw_times": [0.2617740000232516, 0.2622740000219892, 0.2619539999955123, 0.26084400002446273, 0.2592540000136978], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2616440000338116, "peak_bytes": 403701760, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.581049247079136e-08, "mae_k": 1.5861061797295406e-08, "mse_q": 2.4735094242202705e-15, "mse_k": 2.486832828964107e-15, "ref": "rotary_torch"}, "err": null}
13
- {"ts": "2025-11-10T21:58:55Z", "run": "3078eb3fdd0247809e806075fdf7ee85", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H8_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08504199996650641, "p50": 0.08663200003411475, "p90": 0.0882019999721706, "mean": 0.08694359999026346, "iqr": 0.0022109999804342806, "raw_times": [0.08663200003411475, 0.0882019999721706, 0.08885099998678925, 0.08599099999173632, 0.08504199996650641], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08975200000804762, "peak_bytes": 137396224, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5824980437173508e-08, "mae_k": 1.5615324144846454e-08, "mse_q": 2.488090249374306e-15, "mse_k": 2.425079044911585e-15, "ref": "rotary_torch"}, "err": null}
14
- {"ts": "2025-11-10T21:58:56Z", "run": "3078eb3fdd0247809e806075fdf7ee85", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H8_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0891509999974005, "p50": 0.08992100003979431, "p90": 0.09012199996050185, "mean": 0.0899451999998746, "iqr": 0.0002709999762373627, "raw_times": [0.08985099998426449, 0.09068100001741186, 0.09012199996050185, 0.08992100003979431, 0.0891509999974005], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09275200000047334, "peak_bytes": 12648448, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5683587761827766e-08, "mae_k": 1.574532682013796e-08, "mse_q": 2.4310271220254415e-15, "mse_k": 2.4601385856313877e-15, "ref": "rotary_torch"}, "err": null}
15
- {"ts": "2025-11-10T21:58:56Z", "run": "3078eb3fdd0247809e806075fdf7ee85", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H32_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08789200001046993, "p50": 0.08992099998295089, "p90": 0.0902720000226509, "mean": 0.09012159999883806, "iqr": 0.0012010000318696257, "raw_times": [0.08789200001046993, 0.08907099999078127, 0.09345199998733733, 0.0902720000226509, 0.08992099998295089], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09202100000038627, "peak_bytes": 25198592, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5835009747888762e-08, "mae_k": 1.572560215379326e-08, "mse_q": 2.478222950813504e-15, "mse_k": 2.4541699679685603e-15, "ref": "rotary_torch"}, "err": null}
16
- {"ts": "2025-11-10T21:58:56Z", "run": "3078eb3fdd0247809e806075fdf7ee85", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H32_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08713099998658436, "p50": 0.08851199999071468, "p90": 0.08962200001860765, "mean": 0.09088959999417057, "iqr": 0.0023510000346504967, "raw_times": [0.08713099998658436, 0.08851199999071468, 0.08962200001860765, 0.08727099998395715, 0.10191199999098899], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08909199999607154, "peak_bytes": 50397184, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5876850056883995e-08, "mae_k": 1.5927410501603845e-08, "mse_q": 2.504224532953606e-15, "mse_k": 2.503892919554756e-15, "ref": "rotary_torch"}, "err": null}
17
- {"ts": "2025-11-10T21:58:56Z", "run": "3078eb3fdd0247809e806075fdf7ee85", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H8_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08660100002089166, "p50": 0.08889200000794517, "p90": 0.08962200001860765, "mean": 0.08841560002110782, "iqr": 0.002391000009538402, "raw_times": [0.08889200000794517, 0.08660100002089166, 0.08723100000906925, 0.08962200001860765, 0.08973200004902537], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.10579199999938282, "peak_bytes": 25362432, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5820052823301012e-08, "mae_k": 1.580205122309053e-08, "mse_q": 2.4876468276264184e-15, "mse_k": 2.4866062476507165e-15, "ref": "rotary_torch"}, "err": null}
18
- {"ts": "2025-11-10T21:58:56Z", "run": "3078eb3fdd0247809e806075fdf7ee85", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H8_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08826199996292416, "p50": 0.08903100001589337, "p90": 0.0892219999855115, "mean": 0.0892053999905329, "iqr": 0.0008609999895270448, "raw_times": [0.08826199996292416, 0.09115099999235099, 0.0892219999855115, 0.08836099999598446, 0.08903100001589337], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09266099999649668, "peak_bytes": 50593792, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5823172105911e-08, "mae_k": 1.582038855474366e-08, "mse_q": 2.464257071579175e-15, "mse_k": 2.4775099608301526e-15, "ref": "rotary_torch"}, "err": null}
19
- {"ts": "2025-11-10T21:58:56Z", "run": "3078eb3fdd0247809e806075fdf7ee85", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H32_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08734199997206815, "p50": 0.0891519999868251, "p90": 0.09024100000942781, "mean": 0.0889337999979034, "iqr": 0.002558999995017075, "raw_times": [0.08734199997206815, 0.09024100000942781, 0.09025200000678524, 0.0891519999868251, 0.08768200001441073], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09219200001098216, "peak_bytes": 100794368, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5888783622131086e-08, "mae_k": 1.5861886026868888e-08, "mse_q": 2.4766798685418433e-15, "mse_k": 2.475923891636419e-15, "ref": "rotary_torch"}, "err": null}
20
- {"ts": "2025-11-10T21:58:56Z", "run": "3078eb3fdd0247809e806075fdf7ee85", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H32_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08766199999854507, "p50": 0.0882019999721706, "p90": 0.08848099997749159, "mean": 0.08829339999465446, "iqr": 0.0005199999577598646, "raw_times": [0.08766199999854507, 0.08848099997749159, 0.08916100000533334, 0.08796100001973173, 0.0882019999721706], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0894309999921461, "peak_bytes": 201588736, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5826390864503992e-08, "mae_k": 1.5792682717119533e-08, "mse_q": 2.480465258783123e-15, "mse_k": 2.475580631534544e-15, "ref": "rotary_torch"}, "err": null}
21
- {"ts": "2025-11-10T21:58:56Z", "run": "3078eb3fdd0247809e806075fdf7ee85", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H8_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08695100001432365, "p50": 0.08826100003034298, "p90": 0.08882200000925877, "mean": 0.08816519999754746, "iqr": 0.001351000037175254, "raw_times": [0.08747099997208352, 0.08882200000925877, 0.08826100003034298, 0.08695100001432365, 0.08932099996172838], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09112100002539592, "peak_bytes": 101449728, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.592899323554775e-08, "mae_k": 1.5925031959795888e-08, "mse_q": 2.50783882253954e-15, "mse_k": 2.5015648494992274e-15, "ref": "rotary_torch"}, "err": null}
22
- {"ts": "2025-11-10T21:58:56Z", "run": "3078eb3fdd0247809e806075fdf7ee85", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H8_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08768100002498613, "p50": 0.08872199998677388, "p90": 0.08966100000407096, "mean": 0.09043360000760003, "iqr": 0.0018989999830409943, "raw_times": [0.09834200000113924, 0.08966100000407096, 0.08872199998677388, 0.08768100002498613, 0.08776200002102996], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08996200000410681, "peak_bytes": 202375168, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.590209919299923e-08, "mae_k": 1.590130160877834e-08, "mse_q": 2.4971026799330918e-15, "mse_k": 2.506967649153289e-15, "ref": "rotary_torch"}, "err": null}
23
- {"ts": "2025-11-10T21:58:56Z", "run": "3078eb3fdd0247809e806075fdf7ee85", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H32_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2556240000330945, "p50": 0.2579839999725664, "p90": 0.2584439999964161, "mean": 0.258233800002472, "iqr": 0.0005109999960950518, "raw_times": [0.2579839999725664, 0.2584439999964161, 0.2556240000330945, 0.2611840000099619, 0.25793300000032104], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2541540000038367, "peak_bytes": 403177472, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5847520629108658e-08, "mae_k": 1.5862454461057496e-08, "mse_q": 2.4917348203881045e-15, "mse_k": 2.491306009958557e-15, "ref": "rotary_torch"}, "err": null}
24
- {"ts": "2025-11-10T21:58:56Z", "run": "3078eb3fdd0247809e806075fdf7ee85", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H32_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.8455130000015743, "p50": 0.8465030000479601, "p90": 0.850922999973136, "mean": 0.8485591999942699, "iqr": 0.005059999978129781, "raw_times": [0.850922999973136, 0.8465030000479601, 0.8458629999950062, 0.8455130000015743, 0.8539939999536728], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.8586040000295725, "peak_bytes": 806354944, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.585225106737198e-08, "mae_k": 1.581303976649906e-08, "mse_q": 2.4866460581992374e-15, "mse_k": 2.4721545950211372e-15, "ref": "rotary_torch"}, "err": null}
 
1
+ {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H8_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.17331300000478223, "p50": 0.17603300000246236, "p90": 0.1797429999896849, "mean": 0.1784169999950791, "iqr": 0.0038800000083938357, "raw_times": [0.17603300000246236, 0.17586299998129107, 0.1797429999896849, 0.18713299999717492, 0.17331300000478223], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.18657300000768373, "peak_bytes": 1720320, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
2
+ {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H8_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21556299998337636, "p50": 0.2165239999953883, "p90": 0.21698299997297, "mean": 0.21635159998822928, "iqr": 0.0013189999776841432, "raw_times": [0.2165239999953883, 0.21698299997297, 0.21566399999528585, 0.21556299998337636, 0.21702399999412592], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21905399995603148, "peak_bytes": 3440640, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
3
+ {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H32_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21391299998185787, "p50": 0.21503299996084024, "p90": 0.21681300000864212, "mean": 0.21537540000053923, "iqr": 0.0027289999593449465, "raw_times": [0.21503299996084024, 0.21408400004929717, 0.21681300000864212, 0.21703400000205875, 0.21391299998185787], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2214840000078766, "peak_bytes": 6832128, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
4
+ {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H32_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21356299998842587, "p50": 0.2151840000124139, "p90": 0.2162740000244412, "mean": 0.21532140000317668, "iqr": 0.0011410000411160581, "raw_times": [0.2162740000244412, 0.21513299998332513, 0.2151840000124139, 0.2164530000072773, 0.21356299998842587], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2165939999940747, "peak_bytes": 13664256, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
5
+ {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H8_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21375400001488742, "p50": 0.21507399998199617, "p90": 0.21535299998731716, "mean": 0.21505959999785773, "iqr": 0.0006099999723119254, "raw_times": [0.21474300001500524, 0.21375400001488742, 0.21535299998731716, 0.21507399998199617, 0.21637399999008267], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2174030000219318, "peak_bytes": 6881280, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
6
+ {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H8_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2112430000238419, "p50": 0.21400400004267794, "p90": 0.21425299996735703, "mean": 0.21312160000661606, "iqr": 0.002878999964650575, "raw_times": [0.21137400000270645, 0.21425299996735703, 0.21400400004267794, 0.214733999996497, 0.2112430000238419], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22874399996908323, "peak_bytes": 13762560, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
7
+ {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H32_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21246400001473376, "p50": 0.2133630000002995, "p90": 0.21390399996334963, "mean": 0.2133594000042649, "iqr": 0.0008009999419300584, "raw_times": [0.21396300002152202, 0.21246400001473376, 0.21390399996334963, 0.21310300002141958, 0.2133630000002995], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2195139999798812, "peak_bytes": 27328512, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
8
+ {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H32_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21415399999114015, "p50": 0.21443299999646115, "p90": 0.2147029999832739, "mean": 0.2148253999962435, "iqr": 0.000368999963029637, "raw_times": [0.21650299999009803, 0.21415399999114015, 0.21443299999646115, 0.21433400002024428, 0.2147029999832739], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2173330000232454, "peak_bytes": 54657024, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
9
+ {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H8_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21359300001222437, "p50": 0.2138830000149028, "p90": 0.21400299999640993, "mean": 0.21457699999700708, "iqr": 0.00012000003835055395, "raw_times": [0.21400299999640993, 0.2138830000149028, 0.21359300001222437, 0.21752300000343894, 0.21388299995805937], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2191329999732261, "peak_bytes": 27525120, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
10
+ {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H8_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21296400001347138, "p50": 0.21389400001226022, "p90": 0.21517300001505646, "mean": 0.21466560000362733, "iqr": 0.0013790000252811296, "raw_times": [0.21296400001347138, 0.21517300001505646, 0.21750299998757328, 0.21379399998977533, 0.21389400001226022], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21542299998600356, "peak_bytes": 55050240, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
11
+ {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H32_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2098030000183826, "p50": 0.21347300003071723, "p90": 0.21457399998325855, "mean": 0.21505920001345658, "iqr": 0.0023309999619414157, "raw_times": [0.2098030000183826, 0.21457399998325855, 0.21347300003071723, 0.21224300002131713, 0.22520300001360738], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21741399996244581, "peak_bytes": 109314048, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
12
+ {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H32_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.22297399999615664, "p50": 0.22381400003723684, "p90": 0.22385300002270014, "mean": 0.2239618000203336, "iqr": 0.0007890000119914475, "raw_times": [0.2230640000107087, 0.22385300002270014, 0.22381400003723684, 0.22610400003486575, 0.22297399999615664], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22540300000173374, "peak_bytes": 218628096, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
13
+ {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H8_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21275400001741218, "p50": 0.21372400004793235, "p90": 0.21630299994512825, "mean": 0.22107159999222858, "iqr": 0.0030399999673136335, "raw_times": [0.21372400004793235, 0.24931399997285553, 0.21630299994512825, 0.21326299997781462, 0.21275400001741218], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21886299998641334, "peak_bytes": 68698112, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
14
+ {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H8_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21458399999119138, "p50": 0.21627299997817317, "p90": 0.21634299997685957, "mean": 0.21600339998713025, "iqr": 0.0007099999947968172, "raw_times": [0.21627299997817317, 0.21718400000736438, 0.21563299998206276, 0.21458399999119138, 0.21634299997685957], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2226130000053672, "peak_bytes": 6848512, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
15
+ {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H32_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2140940000003866, "p50": 0.215932999992674, "p90": 0.21619400001782196, "mean": 0.21597160000510485, "iqr": 0.0015699999948992627, "raw_times": [0.2140940000003866, 0.2146240000229227, 0.215932999992674, 0.21619400001782196, 0.21901299999171897], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2184540000484958, "peak_bytes": 13647872, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
16
+ {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H32_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21190300003581797, "p50": 0.21745400005102056, "p90": 0.21756400002459486, "mean": 0.21624960003236993, "iqr": 0.0009400000067216752, "raw_times": [0.21190300003581797, 0.21745400005102056, 0.21756400002459486, 0.2166240000178732, 0.21770300003254306], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.25062399998887486, "peak_bytes": 27295744, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
17
+ {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H8_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21696300001394775, "p50": 0.21815399998104112, "p90": 0.21820400002070528, "mean": 0.21879360000411907, "iqr": 0.0004510000053414842, "raw_times": [0.2177530000153638, 0.21815399998104112, 0.2228939999895374, 0.21696300001394775, 0.21820400002070528], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2236640000319312, "peak_bytes": 13697024, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
18
+ {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H8_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21379300000035073, "p50": 0.21643299999141163, "p90": 0.21674399999938032, "mean": 0.21709340001052624, "iqr": 0.00039999997625272954, "raw_times": [0.21379300000035073, 0.21674399999938032, 0.21643299999141163, 0.2163440000231276, 0.22215300003836091], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21868300001415264, "peak_bytes": 27394048, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
19
+ {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H32_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2133630000002995, "p50": 0.21632300001783733, "p90": 0.21671399997558183, "mean": 0.21582319999424726, "iqr": 0.0009309999882134434, "raw_times": [0.21693299999014926, 0.2133630000002995, 0.21632300001783733, 0.21671399997558183, 0.21578299998736838], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2180729999849973, "peak_bytes": 54591488, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
20
+ {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H32_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21294399999760572, "p50": 0.21446299996341622, "p90": 0.21984300002486634, "mean": 0.21647359999406035, "iqr": 0.006489000043075066, "raw_times": [0.21335399998179128, 0.2217640000026222, 0.21984300002486634, 0.21294399999760572, 0.21446299996341622], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21826299996519083, "peak_bytes": 109182976, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
21
+ {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H8_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21578299998736838, "p50": 0.21700399997826025, "p90": 0.2204729999562005, "mean": 0.21918559997402554, "iqr": 0.004118999981983507, "raw_times": [0.21700399997826025, 0.22631399997408153, 0.2204729999562005, 0.216353999974217, 0.21578299998736838], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22091399995360916, "peak_bytes": 54788096, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
22
+ {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H8_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2135729999963587, "p50": 0.2144540000017514, "p90": 0.2173039999888715, "mean": 0.21536960000503313, "iqr": 0.003270999968663091, "raw_times": [0.21403300002020842, 0.2144540000017514, 0.2135729999963587, 0.2173039999888715, 0.21748400001797563], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22203300000001036, "peak_bytes": 109576192, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
23
+ {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H32_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.22941399998899215, "p50": 0.23028300000760282, "p90": 0.23160400002097958, "mean": 0.23061779999125065, "iqr": 0.0017800000478018774, "raw_times": [0.231963999965501, 0.22941399998899215, 0.2298239999731777, 0.23160400002097958, 0.23028300000760282], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.23190299998532282, "peak_bytes": 218365952, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
24
+ {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H32_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.6428210000422041, "p50": 0.6484909999926458, "p90": 0.6486400000085268, "mean": 0.6472164000115299, "iqr": 0.0035200000070290116, "raw_times": [0.651010000012775, 0.6428210000422041, 0.6486400000085268, 0.6451200000014978, 0.6484909999926458], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.6451109999829896, "peak_bytes": 436731904, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
rotary/impls/cells/benchmark.py CHANGED
@@ -4,7 +4,6 @@
4
  # "numpy",
5
  # "torch==2.8.0",
6
  # "kernels-benchmark-tools",
7
- # "kernels",
8
  # ]
9
  #
10
  # [tool.uv.sources]
@@ -13,36 +12,46 @@
13
  import torch
14
  import sys
15
  from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
16
- from kernels import get_kernel
17
 
18
- # Load the rotary kernel
19
- rotary = get_kernel("kernels-community/rotary")
20
 
 
 
 
 
 
 
 
 
 
21
 
22
- def hf_kernels_rotary(query, key, cos, sin, conj=False):
 
23
  rotary_dim = cos.shape[-1]
24
 
25
- # Clone to avoid modifying inputs
26
  q_out = query.clone()
27
  k_out = key.clone()
28
 
29
  # Apply rotation to query
30
  q1 = q_out[..., :rotary_dim]
31
  q2 = q_out[..., rotary_dim : 2 * rotary_dim]
32
- rotary.apply_rotary(q1, q2, cos, sin, q1, q2, conj)
 
 
33
 
34
  # Apply rotation to key
35
  k1 = k_out[..., :rotary_dim]
36
  k2 = k_out[..., rotary_dim : 2 * rotary_dim]
37
- rotary.apply_rotary(k1, k2, cos, sin, k1, k2, conj)
 
 
38
 
39
  return q_out, k_out
40
 
41
 
42
  run_benchmark(
43
  kernel_type=KernelTypeEnum.ROTARY,
44
- impl_name="hf_kernels_rotary",
45
- impl_tags={"family": "hf-kernels", "backend": "cuda"},
46
- impl_func=hf_kernels_rotary,
47
- dtype="float32",
48
  )
 
4
  # "numpy",
5
  # "torch==2.8.0",
6
  # "kernels-benchmark-tools",
 
7
  # ]
8
  #
9
  # [tool.uv.sources]
 
12
  import torch
13
  import sys
14
  from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
 
15
 
 
 
16
 
17
+ def apply_rotary_torch(x1, x2, cos, sin, conj=False):
18
+ """Reference rotary implementation."""
19
+ if not conj:
20
+ out1 = x1 * cos - x2 * sin
21
+ out2 = x1 * sin + x2 * cos
22
+ else:
23
+ out1 = x1 * cos + x2 * sin
24
+ out2 = -x1 * sin + x2 * cos
25
+ return out1, out2
26
 
27
+
28
+ def torch_rotary(query, key, cos, sin, conj=False):
29
  rotary_dim = cos.shape[-1]
30
 
31
+ # Clone inputs to avoid modifying them
32
  q_out = query.clone()
33
  k_out = key.clone()
34
 
35
  # Apply rotation to query
36
  q1 = q_out[..., :rotary_dim]
37
  q2 = q_out[..., rotary_dim : 2 * rotary_dim]
38
+ q_out_1, q_out_2 = apply_rotary_torch(q1, q2, cos, sin, conj)
39
+ q_out[..., :rotary_dim] = q_out_1
40
+ q_out[..., rotary_dim : 2 * rotary_dim] = q_out_2
41
 
42
  # Apply rotation to key
43
  k1 = k_out[..., :rotary_dim]
44
  k2 = k_out[..., rotary_dim : 2 * rotary_dim]
45
+ k_out_1, k_out_2 = apply_rotary_torch(k1, k2, cos, sin, conj)
46
+ k_out[..., :rotary_dim] = k_out_1
47
+ k_out[..., rotary_dim : 2 * rotary_dim] = k_out_2
48
 
49
  return q_out, k_out
50
 
51
 
52
  run_benchmark(
53
  kernel_type=KernelTypeEnum.ROTARY,
54
+ impl_name="torch_eager",
55
+ impl_tags={"family": "pytorch", "backend": "eager"},
56
+ impl_func=torch_rotary,
 
57
  )
rotary/impls/hf_kernels_rotary.html CHANGED
The diff for this file is too large to render. See raw diff
 
rotary/impls/torch_rotary.html CHANGED
The diff for this file is too large to render. See raw diff
 
rotary/index.html CHANGED
@@ -3874,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
3874
  <div class="system-info">
3875
  <div class="system-info-header">Generated on:</div>
3876
  <div class="system-info-content">
3877
- Linux x86_64 | Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35
3878
  </div>
3879
  </div>
3880
 
 
3874
  <div class="system-info">
3875
  <div class="system-info-header">Generated on:</div>
3876
  <div class="system-info-content">
3877
+ Linux x86_64 | Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35
3878
  </div>
3879
  </div>
3880
 
rotary/results/artifacts/combine/latency.svg CHANGED

Git LFS Details

  • SHA256: 0a4f1b049a53cce2974edf15a23c4345c4b61ed101112698c1678c755429abdb
  • Pointer size: 130 Bytes
  • Size of remote file: 37.9 kB

Git LFS Details

  • SHA256: 04c99f3bdfb8e557a70edb1f042a847172d839fb06e1e2252c0b4df4cf0c1abe
  • Pointer size: 130 Bytes
  • Size of remote file: 37.9 kB
rotary/results/combined_results.html CHANGED
@@ -3874,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
3874
  <div class="system-info">
3875
  <div class="system-info-header">Generated on:</div>
3876
  <div class="system-info-content">
3877
- Linux x86_64 | Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35
3878
  </div>
3879
  </div>
3880
 
@@ -3889,11 +3889,11 @@ body[data-tool="eraser"] .main-content {
3889
  <rdf:RDF>
3890
  <ns2:Work>
3891
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
3892
- <dc:date>2025-11-10T22:11:51.846305</dc:date>
3893
  <dc:format>image/svg+xml</dc:format>
3894
  <dc:creator>
3895
  <ns2:Agent>
3896
- <dc:title>Matplotlib v3.10.7, https://matplotlib.org/</dc:title>
3897
  </ns2:Agent>
3898
  </dc:creator>
3899
  </ns2:Work>
@@ -4233,109 +4233,109 @@ body[data-tool="eraser"] .main-content {
4233
  <g id="matplotlib.axis_2">
4234
  <g id="ytick_1">
4235
  <g id="grid-y--2" class="grid grid-y">
4236
- <path d="M 47.72 393.137893 L 823.142937 393.137893 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4237
  </g>
4238
  <g id="line2d_25">
4239
  <defs>
4240
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4241
  </defs>
4242
  <g>
4243
- <use ns4:href="#m0fca2865ba" x="47.72" y="393.137893" style="stroke: #000000; stroke-width: 0.8" />
4244
  </g>
4245
  </g>
4246
  <g id="text_25">
4247
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="396.937111" transform="rotate(-0 40.72 396.937111)">0.1</text>
4248
  </g>
4249
  </g>
4250
  <g id="ytick_2">
4251
  <g id="grid-y--3" class="grid grid-y">
4252
- <path d="M 47.72 346.487139 L 823.142937 346.487139 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4253
  </g>
4254
  <g id="line2d_26">
4255
  <g>
4256
- <use ns4:href="#m0fca2865ba" x="47.72" y="346.487139" style="stroke: #000000; stroke-width: 0.8" />
4257
  </g>
4258
  </g>
4259
  <g id="text_26">
4260
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="350.286357" transform="rotate(-0 40.72 350.286357)">0.2</text>
4261
  </g>
4262
  </g>
4263
  <g id="ytick_3">
4264
  <g id="grid-y--4" class="grid grid-y">
4265
- <path d="M 47.72 299.836384 L 823.142937 299.836384 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4266
  </g>
4267
  <g id="line2d_27">
4268
  <g>
4269
- <use ns4:href="#m0fca2865ba" x="47.72" y="299.836384" style="stroke: #000000; stroke-width: 0.8" />
4270
  </g>
4271
  </g>
4272
  <g id="text_27">
4273
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="303.635603" transform="rotate(-0 40.72 303.635603)">0.3</text>
4274
  </g>
4275
  </g>
4276
  <g id="ytick_4">
4277
  <g id="grid-y--5" class="grid grid-y">
4278
- <path d="M 47.72 253.18563 L 823.142937 253.18563 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4279
  </g>
4280
  <g id="line2d_28">
4281
  <g>
4282
- <use ns4:href="#m0fca2865ba" x="47.72" y="253.18563" style="stroke: #000000; stroke-width: 0.8" />
4283
  </g>
4284
  </g>
4285
  <g id="text_28">
4286
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="256.984849" transform="rotate(-0 40.72 256.984849)">0.4</text>
4287
  </g>
4288
  </g>
4289
  <g id="ytick_5">
4290
  <g id="grid-y--6" class="grid grid-y">
4291
- <path d="M 47.72 206.534876 L 823.142937 206.534876 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4292
  </g>
4293
  <g id="line2d_29">
4294
  <g>
4295
- <use ns4:href="#m0fca2865ba" x="47.72" y="206.534876" style="stroke: #000000; stroke-width: 0.8" />
4296
  </g>
4297
  </g>
4298
  <g id="text_29">
4299
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="210.334095" transform="rotate(-0 40.72 210.334095)">0.5</text>
4300
  </g>
4301
  </g>
4302
  <g id="ytick_6">
4303
  <g id="grid-y--7" class="grid grid-y">
4304
- <path d="M 47.72 159.884122 L 823.142937 159.884122 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4305
  </g>
4306
  <g id="line2d_30">
4307
  <g>
4308
- <use ns4:href="#m0fca2865ba" x="47.72" y="159.884122" style="stroke: #000000; stroke-width: 0.8" />
4309
  </g>
4310
  </g>
4311
  <g id="text_30">
4312
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="163.683341" transform="rotate(-0 40.72 163.683341)">0.6</text>
4313
  </g>
4314
  </g>
4315
  <g id="ytick_7">
4316
  <g id="grid-y--8" class="grid grid-y">
4317
- <path d="M 47.72 113.233368 L 823.142937 113.233368 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4318
  </g>
4319
  <g id="line2d_31">
4320
  <g>
4321
- <use ns4:href="#m0fca2865ba" x="47.72" y="113.233368" style="stroke: #000000; stroke-width: 0.8" />
4322
  </g>
4323
  </g>
4324
  <g id="text_31">
4325
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="117.032587" transform="rotate(-0 40.72 117.032587)">0.7</text>
4326
  </g>
4327
  </g>
4328
  <g id="ytick_8">
4329
  <g id="grid-y--9" class="grid grid-y">
4330
- <path d="M 47.72 66.582614 L 823.142937 66.582614 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4331
  </g>
4332
  <g id="line2d_32">
4333
  <g>
4334
- <use ns4:href="#m0fca2865ba" x="47.72" y="66.582614" style="stroke: #000000; stroke-width: 0.8" />
4335
  </g>
4336
  </g>
4337
  <g id="text_32">
4338
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="70.381833" transform="rotate(-0 40.72 70.381833)">0.8</text>
4339
  </g>
4340
  </g>
4341
  <g id="label--y" class="ylabel">
@@ -4343,67 +4343,67 @@ body[data-tool="eraser"] .main-content {
4343
  </g>
4344
  </g>
4345
  <g id="series--hf-kernels-rotary" class="series">
4346
- <path d="M 82.966497 405.060892 L 113.615625 397.321532 L 144.264753 398.954309 L 174.913881 398.730851 L 205.563009 398.427622 L 236.212137 398.614225 L 266.861265 398.688866 L 297.510393 399.174034 L 328.159521 398.413626 L 358.808648 398.483602 L 389.457776 398.193901 L 420.106904 317.669102 L 450.756032 399.374165 L 481.40516 397.839822 L 512.054288 397.839822 L 542.703416 398.497131 L 573.352544 398.319858 L 604.001672 398.255014 L 634.6508 398.198566 L 665.299928 398.641749 L 695.949056 398.614225 L 726.598184 398.399165 L 757.247312 319.437165 L 787.896439 44.888614 " clip-path="url(#p088c925177)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4347
  <defs>
4348
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4349
  </defs>
4350
  <g clip-path="url(#p088c925177)">
4351
  <use ns4:href="#md7efaf3aec" x="82.966497" y="405.060892" style="fill: #1f77b4; stroke: #1f77b4" />
4352
- <use ns4:href="#md7efaf3aec" x="113.615625" y="397.321532" style="fill: #1f77b4; stroke: #1f77b4" />
4353
- <use ns4:href="#md7efaf3aec" x="144.264753" y="398.954309" style="fill: #1f77b4; stroke: #1f77b4" />
4354
- <use ns4:href="#md7efaf3aec" x="174.913881" y="398.730851" style="fill: #1f77b4; stroke: #1f77b4" />
4355
- <use ns4:href="#md7efaf3aec" x="205.563009" y="398.427622" style="fill: #1f77b4; stroke: #1f77b4" />
4356
- <use ns4:href="#md7efaf3aec" x="236.212137" y="398.614225" style="fill: #1f77b4; stroke: #1f77b4" />
4357
- <use ns4:href="#md7efaf3aec" x="266.861265" y="398.688866" style="fill: #1f77b4; stroke: #1f77b4" />
4358
- <use ns4:href="#md7efaf3aec" x="297.510393" y="399.174034" style="fill: #1f77b4; stroke: #1f77b4" />
4359
- <use ns4:href="#md7efaf3aec" x="328.159521" y="398.413626" style="fill: #1f77b4; stroke: #1f77b4" />
4360
- <use ns4:href="#md7efaf3aec" x="358.808648" y="398.483602" style="fill: #1f77b4; stroke: #1f77b4" />
4361
- <use ns4:href="#md7efaf3aec" x="389.457776" y="398.193901" style="fill: #1f77b4; stroke: #1f77b4" />
4362
- <use ns4:href="#md7efaf3aec" x="420.106904" y="317.669102" style="fill: #1f77b4; stroke: #1f77b4" />
4363
- <use ns4:href="#md7efaf3aec" x="450.756032" y="399.374165" style="fill: #1f77b4; stroke: #1f77b4" />
4364
- <use ns4:href="#md7efaf3aec" x="481.40516" y="397.839822" style="fill: #1f77b4; stroke: #1f77b4" />
4365
- <use ns4:href="#md7efaf3aec" x="512.054288" y="397.839822" style="fill: #1f77b4; stroke: #1f77b4" />
4366
- <use ns4:href="#md7efaf3aec" x="542.703416" y="398.497131" style="fill: #1f77b4; stroke: #1f77b4" />
4367
- <use ns4:href="#md7efaf3aec" x="573.352544" y="398.319858" style="fill: #1f77b4; stroke: #1f77b4" />
4368
- <use ns4:href="#md7efaf3aec" x="604.001672" y="398.255014" style="fill: #1f77b4; stroke: #1f77b4" />
4369
- <use ns4:href="#md7efaf3aec" x="634.6508" y="398.198566" style="fill: #1f77b4; stroke: #1f77b4" />
4370
- <use ns4:href="#md7efaf3aec" x="665.299928" y="398.641749" style="fill: #1f77b4; stroke: #1f77b4" />
4371
- <use ns4:href="#md7efaf3aec" x="695.949056" y="398.614225" style="fill: #1f77b4; stroke: #1f77b4" />
4372
- <use ns4:href="#md7efaf3aec" x="726.598184" y="398.399165" style="fill: #1f77b4; stroke: #1f77b4" />
4373
- <use ns4:href="#md7efaf3aec" x="757.247312" y="319.437165" style="fill: #1f77b4; stroke: #1f77b4" />
4374
  <use ns4:href="#md7efaf3aec" x="787.896439" y="44.888614" style="fill: #1f77b4; stroke: #1f77b4" />
4375
  </g>
4376
  </g>
4377
  <g id="series--torch-eager" class="series">
4378
- <path d="M 82.966497 355.036822 L 113.615625 330.908586 L 144.264753 334.739079 L 174.913881 338.396498 L 205.563009 332.592678 L 236.212137 334.314557 L 266.861265 335.452836 L 297.510393 335.401053 L 328.159521 334.869235 L 358.808648 336.725935 L 389.457776 336.395181 L 420.106904 334.766603 L 450.756032 336.861688 L 481.40516 335.536341 L 512.054288 335.867561 L 542.703416 335.462166 L 573.352544 334.402727 L 604.001672 337.314201 L 634.6508 334.710622 L 665.299928 335.835372 L 695.949056 335.587656 L 726.598184 335.364199 L 757.247312 332.345429 L 787.896439 138.835302 " clip-path="url(#p088c925177)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4379
  <defs>
4380
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4381
  </defs>
4382
  <g clip-path="url(#p088c925177)">
4383
- <use ns4:href="#m9b8c54d372" x="82.966497" y="355.036822" style="fill: #ff7f0e; stroke: #ff7f0e" />
4384
- <use ns4:href="#m9b8c54d372" x="113.615625" y="330.908586" style="fill: #ff7f0e; stroke: #ff7f0e" />
4385
- <use ns4:href="#m9b8c54d372" x="144.264753" y="334.739079" style="fill: #ff7f0e; stroke: #ff7f0e" />
4386
- <use ns4:href="#m9b8c54d372" x="174.913881" y="338.396498" style="fill: #ff7f0e; stroke: #ff7f0e" />
4387
- <use ns4:href="#m9b8c54d372" x="205.563009" y="332.592678" style="fill: #ff7f0e; stroke: #ff7f0e" />
4388
- <use ns4:href="#m9b8c54d372" x="236.212137" y="334.314557" style="fill: #ff7f0e; stroke: #ff7f0e" />
4389
- <use ns4:href="#m9b8c54d372" x="266.861265" y="335.452836" style="fill: #ff7f0e; stroke: #ff7f0e" />
4390
- <use ns4:href="#m9b8c54d372" x="297.510393" y="335.401053" style="fill: #ff7f0e; stroke: #ff7f0e" />
4391
- <use ns4:href="#m9b8c54d372" x="328.159521" y="334.869235" style="fill: #ff7f0e; stroke: #ff7f0e" />
4392
- <use ns4:href="#m9b8c54d372" x="358.808648" y="336.725935" style="fill: #ff7f0e; stroke: #ff7f0e" />
4393
- <use ns4:href="#m9b8c54d372" x="389.457776" y="336.395181" style="fill: #ff7f0e; stroke: #ff7f0e" />
4394
- <use ns4:href="#m9b8c54d372" x="420.106904" y="334.766603" style="fill: #ff7f0e; stroke: #ff7f0e" />
4395
- <use ns4:href="#m9b8c54d372" x="450.756032" y="336.861688" style="fill: #ff7f0e; stroke: #ff7f0e" />
4396
- <use ns4:href="#m9b8c54d372" x="481.40516" y="335.536341" style="fill: #ff7f0e; stroke: #ff7f0e" />
4397
- <use ns4:href="#m9b8c54d372" x="512.054288" y="335.867561" style="fill: #ff7f0e; stroke: #ff7f0e" />
4398
- <use ns4:href="#m9b8c54d372" x="542.703416" y="335.462166" style="fill: #ff7f0e; stroke: #ff7f0e" />
4399
- <use ns4:href="#m9b8c54d372" x="573.352544" y="334.402727" style="fill: #ff7f0e; stroke: #ff7f0e" />
4400
- <use ns4:href="#m9b8c54d372" x="604.001672" y="337.314201" style="fill: #ff7f0e; stroke: #ff7f0e" />
4401
- <use ns4:href="#m9b8c54d372" x="634.6508" y="334.710622" style="fill: #ff7f0e; stroke: #ff7f0e" />
4402
- <use ns4:href="#m9b8c54d372" x="665.299928" y="335.835372" style="fill: #ff7f0e; stroke: #ff7f0e" />
4403
- <use ns4:href="#m9b8c54d372" x="695.949056" y="335.587656" style="fill: #ff7f0e; stroke: #ff7f0e" />
4404
- <use ns4:href="#m9b8c54d372" x="726.598184" y="335.364199" style="fill: #ff7f0e; stroke: #ff7f0e" />
4405
- <use ns4:href="#m9b8c54d372" x="757.247312" y="332.345429" style="fill: #ff7f0e; stroke: #ff7f0e" />
4406
- <use ns4:href="#m9b8c54d372" x="787.896439" y="138.835302" style="fill: #ff7f0e; stroke: #ff7f0e" />
4407
  </g>
4408
  </g>
4409
  <g id="patch_3">
@@ -4461,7 +4461,7 @@ body[data-tool="eraser"] .main-content {
4461
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
4462
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
4463
  </span> |
4464
- Cell: combine | 4.57s
4465
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
4466
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
4467
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
@@ -4551,7 +4551,7 @@ impl wl p50(ms) ok
4551
  hf_kernels_rotary cuda_B1_S128_H32_D128_R64 0.09 True
4552
  hf_kernels_rotary cuda_B1_S128_H32_D64_R32 0.09 True
4553
  hf_kernels_rotary cuda_B1_S128_H8_D128_R64 0.09 True
4554
- hf_kernels_rotary cuda_B1_S128_H8_D64_R32 0.07 True
4555
  hf_kernels_rotary cuda_B1_S2048_H32_D128_R64 0.26 True
4556
  hf_kernels_rotary cuda_B1_S2048_H32_D64_R32 0.09 True
4557
  hf_kernels_rotary cuda_B1_S2048_H8_D128_R64 0.09 True
@@ -4573,29 +4573,29 @@ hf_kernels_rotary cuda_B2_S512_H32_D64_R32 0.09 True
4573
  hf_kernels_rotary cuda_B2_S512_H8_D128_R64 0.09 True
4574
  hf_kernels_rotary cuda_B2_S512_H8_D64_R32 0.09 True
4575
  torch_eager cuda_B1_S128_H32_D128_R64 0.22 True
4576
- torch_eager cuda_B1_S128_H32_D64_R32 0.23 True
4577
- torch_eager cuda_B1_S128_H8_D128_R64 0.23 True
4578
  torch_eager cuda_B1_S128_H8_D64_R32 0.18 True
4579
- torch_eager cuda_B1_S2048_H32_D128_R64 0.23 True
4580
- torch_eager cuda_B1_S2048_H32_D64_R32 0.22 True
4581
- torch_eager cuda_B1_S2048_H8_D128_R64 0.22 True
4582
- torch_eager cuda_B1_S2048_H8_D64_R32 0.22 True
4583
- torch_eager cuda_B1_S512_H32_D128_R64 0.22 True
4584
- torch_eager cuda_B1_S512_H32_D64_R32 0.22 True
4585
- torch_eager cuda_B1_S512_H8_D128_R64 0.23 True
4586
- torch_eager cuda_B1_S512_H8_D64_R32 0.23 True
4587
  torch_eager cuda_B2_S128_H32_D128_R64 0.22 True
4588
  torch_eager cuda_B2_S128_H32_D64_R32 0.22 True
4589
  torch_eager cuda_B2_S128_H8_D128_R64 0.22 True
4590
- torch_eager cuda_B2_S128_H8_D64_R32 0.22 True
4591
  torch_eager cuda_B2_S2048_H32_D128_R64 0.65 True
4592
  torch_eager cuda_B2_S2048_H32_D64_R32 0.23 True
4593
- torch_eager cuda_B2_S2048_H8_D128_R64 0.22 True
4594
  torch_eager cuda_B2_S2048_H8_D64_R32 0.22 True
4595
- torch_eager cuda_B2_S512_H32_D128_R64 0.22 True
4596
- torch_eager cuda_B2_S512_H32_D64_R32 0.23 True
4597
  torch_eager cuda_B2_S512_H8_D128_R64 0.22 True
4598
- torch_eager cuda_B2_S512_H8_D64_R32 0.23 True
4599
 
4600
  GENERATING COMBINED VISUALIZATION
4601
 
@@ -4615,7 +4615,7 @@ Implementations included:
4615
  <div class="uv-install-logs" id="uv-logs-combine">
4616
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4617
  <div class="uv-logs-content" style="display: none;">
4618
- Installed 37 packages in 282ms
4619
  </div>
4620
  </div>
4621
  <div class="cell-artifacts">
@@ -4628,11 +4628,11 @@ Installed 37 packages in 282ms
4628
  <rdf:RDF>
4629
  <ns2:Work>
4630
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4631
- <dc:date>2025-11-10T22:11:51.846305</dc:date>
4632
  <dc:format>image/svg+xml</dc:format>
4633
  <dc:creator>
4634
  <ns2:Agent>
4635
- <dc:title>Matplotlib v3.10.7, https://matplotlib.org/</dc:title>
4636
  </ns2:Agent>
4637
  </dc:creator>
4638
  </ns2:Work>
@@ -4972,109 +4972,109 @@ Installed 37 packages in 282ms
4972
  <g id="matplotlib.axis_2">
4973
  <g id="ytick_1">
4974
  <g id="grid-y--2" class="grid grid-y">
4975
- <path d="M 47.72 393.137893 L 823.142937 393.137893 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4976
  </g>
4977
  <g id="line2d_25">
4978
  <defs>
4979
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4980
  </defs>
4981
  <g>
4982
- <use ns4:href="#m0fca2865ba" x="47.72" y="393.137893" style="stroke: #000000; stroke-width: 0.8" />
4983
  </g>
4984
  </g>
4985
  <g id="text_25">
4986
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="396.937111" transform="rotate(-0 40.72 396.937111)">0.1</text>
4987
  </g>
4988
  </g>
4989
  <g id="ytick_2">
4990
  <g id="grid-y--3" class="grid grid-y">
4991
- <path d="M 47.72 346.487139 L 823.142937 346.487139 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4992
  </g>
4993
  <g id="line2d_26">
4994
  <g>
4995
- <use ns4:href="#m0fca2865ba" x="47.72" y="346.487139" style="stroke: #000000; stroke-width: 0.8" />
4996
  </g>
4997
  </g>
4998
  <g id="text_26">
4999
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="350.286357" transform="rotate(-0 40.72 350.286357)">0.2</text>
5000
  </g>
5001
  </g>
5002
  <g id="ytick_3">
5003
  <g id="grid-y--4" class="grid grid-y">
5004
- <path d="M 47.72 299.836384 L 823.142937 299.836384 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
5005
  </g>
5006
  <g id="line2d_27">
5007
  <g>
5008
- <use ns4:href="#m0fca2865ba" x="47.72" y="299.836384" style="stroke: #000000; stroke-width: 0.8" />
5009
  </g>
5010
  </g>
5011
  <g id="text_27">
5012
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="303.635603" transform="rotate(-0 40.72 303.635603)">0.3</text>
5013
  </g>
5014
  </g>
5015
  <g id="ytick_4">
5016
  <g id="grid-y--5" class="grid grid-y">
5017
- <path d="M 47.72 253.18563 L 823.142937 253.18563 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
5018
  </g>
5019
  <g id="line2d_28">
5020
  <g>
5021
- <use ns4:href="#m0fca2865ba" x="47.72" y="253.18563" style="stroke: #000000; stroke-width: 0.8" />
5022
  </g>
5023
  </g>
5024
  <g id="text_28">
5025
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="256.984849" transform="rotate(-0 40.72 256.984849)">0.4</text>
5026
  </g>
5027
  </g>
5028
  <g id="ytick_5">
5029
  <g id="grid-y--6" class="grid grid-y">
5030
- <path d="M 47.72 206.534876 L 823.142937 206.534876 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
5031
  </g>
5032
  <g id="line2d_29">
5033
  <g>
5034
- <use ns4:href="#m0fca2865ba" x="47.72" y="206.534876" style="stroke: #000000; stroke-width: 0.8" />
5035
  </g>
5036
  </g>
5037
  <g id="text_29">
5038
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="210.334095" transform="rotate(-0 40.72 210.334095)">0.5</text>
5039
  </g>
5040
  </g>
5041
  <g id="ytick_6">
5042
  <g id="grid-y--7" class="grid grid-y">
5043
- <path d="M 47.72 159.884122 L 823.142937 159.884122 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
5044
  </g>
5045
  <g id="line2d_30">
5046
  <g>
5047
- <use ns4:href="#m0fca2865ba" x="47.72" y="159.884122" style="stroke: #000000; stroke-width: 0.8" />
5048
  </g>
5049
  </g>
5050
  <g id="text_30">
5051
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="163.683341" transform="rotate(-0 40.72 163.683341)">0.6</text>
5052
  </g>
5053
  </g>
5054
  <g id="ytick_7">
5055
  <g id="grid-y--8" class="grid grid-y">
5056
- <path d="M 47.72 113.233368 L 823.142937 113.233368 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
5057
  </g>
5058
  <g id="line2d_31">
5059
  <g>
5060
- <use ns4:href="#m0fca2865ba" x="47.72" y="113.233368" style="stroke: #000000; stroke-width: 0.8" />
5061
  </g>
5062
  </g>
5063
  <g id="text_31">
5064
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="117.032587" transform="rotate(-0 40.72 117.032587)">0.7</text>
5065
  </g>
5066
  </g>
5067
  <g id="ytick_8">
5068
  <g id="grid-y--9" class="grid grid-y">
5069
- <path d="M 47.72 66.582614 L 823.142937 66.582614 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
5070
  </g>
5071
  <g id="line2d_32">
5072
  <g>
5073
- <use ns4:href="#m0fca2865ba" x="47.72" y="66.582614" style="stroke: #000000; stroke-width: 0.8" />
5074
  </g>
5075
  </g>
5076
  <g id="text_32">
5077
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="70.381833" transform="rotate(-0 40.72 70.381833)">0.8</text>
5078
  </g>
5079
  </g>
5080
  <g id="label--y" class="ylabel">
@@ -5082,67 +5082,67 @@ Installed 37 packages in 282ms
5082
  </g>
5083
  </g>
5084
  <g id="series--hf-kernels-rotary" class="series">
5085
- <path d="M 82.966497 405.060892 L 113.615625 397.321532 L 144.264753 398.954309 L 174.913881 398.730851 L 205.563009 398.427622 L 236.212137 398.614225 L 266.861265 398.688866 L 297.510393 399.174034 L 328.159521 398.413626 L 358.808648 398.483602 L 389.457776 398.193901 L 420.106904 317.669102 L 450.756032 399.374165 L 481.40516 397.839822 L 512.054288 397.839822 L 542.703416 398.497131 L 573.352544 398.319858 L 604.001672 398.255014 L 634.6508 398.198566 L 665.299928 398.641749 L 695.949056 398.614225 L 726.598184 398.399165 L 757.247312 319.437165 L 787.896439 44.888614 " clip-path="url(#p088c925177)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
5086
  <defs>
5087
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
5088
  </defs>
5089
  <g clip-path="url(#p088c925177)">
5090
  <use ns4:href="#md7efaf3aec" x="82.966497" y="405.060892" style="fill: #1f77b4; stroke: #1f77b4" />
5091
- <use ns4:href="#md7efaf3aec" x="113.615625" y="397.321532" style="fill: #1f77b4; stroke: #1f77b4" />
5092
- <use ns4:href="#md7efaf3aec" x="144.264753" y="398.954309" style="fill: #1f77b4; stroke: #1f77b4" />
5093
- <use ns4:href="#md7efaf3aec" x="174.913881" y="398.730851" style="fill: #1f77b4; stroke: #1f77b4" />
5094
- <use ns4:href="#md7efaf3aec" x="205.563009" y="398.427622" style="fill: #1f77b4; stroke: #1f77b4" />
5095
- <use ns4:href="#md7efaf3aec" x="236.212137" y="398.614225" style="fill: #1f77b4; stroke: #1f77b4" />
5096
- <use ns4:href="#md7efaf3aec" x="266.861265" y="398.688866" style="fill: #1f77b4; stroke: #1f77b4" />
5097
- <use ns4:href="#md7efaf3aec" x="297.510393" y="399.174034" style="fill: #1f77b4; stroke: #1f77b4" />
5098
- <use ns4:href="#md7efaf3aec" x="328.159521" y="398.413626" style="fill: #1f77b4; stroke: #1f77b4" />
5099
- <use ns4:href="#md7efaf3aec" x="358.808648" y="398.483602" style="fill: #1f77b4; stroke: #1f77b4" />
5100
- <use ns4:href="#md7efaf3aec" x="389.457776" y="398.193901" style="fill: #1f77b4; stroke: #1f77b4" />
5101
- <use ns4:href="#md7efaf3aec" x="420.106904" y="317.669102" style="fill: #1f77b4; stroke: #1f77b4" />
5102
- <use ns4:href="#md7efaf3aec" x="450.756032" y="399.374165" style="fill: #1f77b4; stroke: #1f77b4" />
5103
- <use ns4:href="#md7efaf3aec" x="481.40516" y="397.839822" style="fill: #1f77b4; stroke: #1f77b4" />
5104
- <use ns4:href="#md7efaf3aec" x="512.054288" y="397.839822" style="fill: #1f77b4; stroke: #1f77b4" />
5105
- <use ns4:href="#md7efaf3aec" x="542.703416" y="398.497131" style="fill: #1f77b4; stroke: #1f77b4" />
5106
- <use ns4:href="#md7efaf3aec" x="573.352544" y="398.319858" style="fill: #1f77b4; stroke: #1f77b4" />
5107
- <use ns4:href="#md7efaf3aec" x="604.001672" y="398.255014" style="fill: #1f77b4; stroke: #1f77b4" />
5108
- <use ns4:href="#md7efaf3aec" x="634.6508" y="398.198566" style="fill: #1f77b4; stroke: #1f77b4" />
5109
- <use ns4:href="#md7efaf3aec" x="665.299928" y="398.641749" style="fill: #1f77b4; stroke: #1f77b4" />
5110
- <use ns4:href="#md7efaf3aec" x="695.949056" y="398.614225" style="fill: #1f77b4; stroke: #1f77b4" />
5111
- <use ns4:href="#md7efaf3aec" x="726.598184" y="398.399165" style="fill: #1f77b4; stroke: #1f77b4" />
5112
- <use ns4:href="#md7efaf3aec" x="757.247312" y="319.437165" style="fill: #1f77b4; stroke: #1f77b4" />
5113
  <use ns4:href="#md7efaf3aec" x="787.896439" y="44.888614" style="fill: #1f77b4; stroke: #1f77b4" />
5114
  </g>
5115
  </g>
5116
  <g id="series--torch-eager" class="series">
5117
- <path d="M 82.966497 355.036822 L 113.615625 330.908586 L 144.264753 334.739079 L 174.913881 338.396498 L 205.563009 332.592678 L 236.212137 334.314557 L 266.861265 335.452836 L 297.510393 335.401053 L 328.159521 334.869235 L 358.808648 336.725935 L 389.457776 336.395181 L 420.106904 334.766603 L 450.756032 336.861688 L 481.40516 335.536341 L 512.054288 335.867561 L 542.703416 335.462166 L 573.352544 334.402727 L 604.001672 337.314201 L 634.6508 334.710622 L 665.299928 335.835372 L 695.949056 335.587656 L 726.598184 335.364199 L 757.247312 332.345429 L 787.896439 138.835302 " clip-path="url(#p088c925177)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
5118
  <defs>
5119
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
5120
  </defs>
5121
  <g clip-path="url(#p088c925177)">
5122
- <use ns4:href="#m9b8c54d372" x="82.966497" y="355.036822" style="fill: #ff7f0e; stroke: #ff7f0e" />
5123
- <use ns4:href="#m9b8c54d372" x="113.615625" y="330.908586" style="fill: #ff7f0e; stroke: #ff7f0e" />
5124
- <use ns4:href="#m9b8c54d372" x="144.264753" y="334.739079" style="fill: #ff7f0e; stroke: #ff7f0e" />
5125
- <use ns4:href="#m9b8c54d372" x="174.913881" y="338.396498" style="fill: #ff7f0e; stroke: #ff7f0e" />
5126
- <use ns4:href="#m9b8c54d372" x="205.563009" y="332.592678" style="fill: #ff7f0e; stroke: #ff7f0e" />
5127
- <use ns4:href="#m9b8c54d372" x="236.212137" y="334.314557" style="fill: #ff7f0e; stroke: #ff7f0e" />
5128
- <use ns4:href="#m9b8c54d372" x="266.861265" y="335.452836" style="fill: #ff7f0e; stroke: #ff7f0e" />
5129
- <use ns4:href="#m9b8c54d372" x="297.510393" y="335.401053" style="fill: #ff7f0e; stroke: #ff7f0e" />
5130
- <use ns4:href="#m9b8c54d372" x="328.159521" y="334.869235" style="fill: #ff7f0e; stroke: #ff7f0e" />
5131
- <use ns4:href="#m9b8c54d372" x="358.808648" y="336.725935" style="fill: #ff7f0e; stroke: #ff7f0e" />
5132
- <use ns4:href="#m9b8c54d372" x="389.457776" y="336.395181" style="fill: #ff7f0e; stroke: #ff7f0e" />
5133
- <use ns4:href="#m9b8c54d372" x="420.106904" y="334.766603" style="fill: #ff7f0e; stroke: #ff7f0e" />
5134
- <use ns4:href="#m9b8c54d372" x="450.756032" y="336.861688" style="fill: #ff7f0e; stroke: #ff7f0e" />
5135
- <use ns4:href="#m9b8c54d372" x="481.40516" y="335.536341" style="fill: #ff7f0e; stroke: #ff7f0e" />
5136
- <use ns4:href="#m9b8c54d372" x="512.054288" y="335.867561" style="fill: #ff7f0e; stroke: #ff7f0e" />
5137
- <use ns4:href="#m9b8c54d372" x="542.703416" y="335.462166" style="fill: #ff7f0e; stroke: #ff7f0e" />
5138
- <use ns4:href="#m9b8c54d372" x="573.352544" y="334.402727" style="fill: #ff7f0e; stroke: #ff7f0e" />
5139
- <use ns4:href="#m9b8c54d372" x="604.001672" y="337.314201" style="fill: #ff7f0e; stroke: #ff7f0e" />
5140
- <use ns4:href="#m9b8c54d372" x="634.6508" y="334.710622" style="fill: #ff7f0e; stroke: #ff7f0e" />
5141
- <use ns4:href="#m9b8c54d372" x="665.299928" y="335.835372" style="fill: #ff7f0e; stroke: #ff7f0e" />
5142
- <use ns4:href="#m9b8c54d372" x="695.949056" y="335.587656" style="fill: #ff7f0e; stroke: #ff7f0e" />
5143
- <use ns4:href="#m9b8c54d372" x="726.598184" y="335.364199" style="fill: #ff7f0e; stroke: #ff7f0e" />
5144
- <use ns4:href="#m9b8c54d372" x="757.247312" y="332.345429" style="fill: #ff7f0e; stroke: #ff7f0e" />
5145
- <use ns4:href="#m9b8c54d372" x="787.896439" y="138.835302" style="fill: #ff7f0e; stroke: #ff7f0e" />
5146
  </g>
5147
  </g>
5148
  <g id="patch_3">
 
3874
  <div class="system-info">
3875
  <div class="system-info-header">Generated on:</div>
3876
  <div class="system-info-content">
3877
+ Linux x86_64 | Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35
3878
  </div>
3879
  </div>
3880
 
 
3889
  <rdf:RDF>
3890
  <ns2:Work>
3891
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
3892
+ <dc:date>2025-12-19T19:09:41.164726</dc:date>
3893
  <dc:format>image/svg+xml</dc:format>
3894
  <dc:creator>
3895
  <ns2:Agent>
3896
+ <dc:title>Matplotlib v3.10.8, https://matplotlib.org/</dc:title>
3897
  </ns2:Agent>
3898
  </dc:creator>
3899
  </ns2:Work>
 
4233
  <g id="matplotlib.axis_2">
4234
  <g id="ytick_1">
4235
  <g id="grid-y--2" class="grid grid-y">
4236
+ <path d="M 47.72 394.065769 L 823.142937 394.065769 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4237
  </g>
4238
  <g id="line2d_25">
4239
  <defs>
4240
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4241
  </defs>
4242
  <g>
4243
+ <use ns4:href="#m0fca2865ba" x="47.72" y="394.065769" style="stroke: #000000; stroke-width: 0.8" />
4244
  </g>
4245
  </g>
4246
  <g id="text_25">
4247
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="397.864988" transform="rotate(-0 40.72 397.864988)">0.1</text>
4248
  </g>
4249
  </g>
4250
  <g id="ytick_2">
4251
  <g id="grid-y--3" class="grid grid-y">
4252
+ <path d="M 47.72 347.214212 L 823.142937 347.214212 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4253
  </g>
4254
  <g id="line2d_26">
4255
  <g>
4256
+ <use ns4:href="#m0fca2865ba" x="47.72" y="347.214212" style="stroke: #000000; stroke-width: 0.8" />
4257
  </g>
4258
  </g>
4259
  <g id="text_26">
4260
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="351.013431" transform="rotate(-0 40.72 351.013431)">0.2</text>
4261
  </g>
4262
  </g>
4263
  <g id="ytick_3">
4264
  <g id="grid-y--4" class="grid grid-y">
4265
+ <path d="M 47.72 300.362656 L 823.142937 300.362656 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4266
  </g>
4267
  <g id="line2d_27">
4268
  <g>
4269
+ <use ns4:href="#m0fca2865ba" x="47.72" y="300.362656" style="stroke: #000000; stroke-width: 0.8" />
4270
  </g>
4271
  </g>
4272
  <g id="text_27">
4273
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="304.161875" transform="rotate(-0 40.72 304.161875)">0.3</text>
4274
  </g>
4275
  </g>
4276
  <g id="ytick_4">
4277
  <g id="grid-y--5" class="grid grid-y">
4278
+ <path d="M 47.72 253.511099 L 823.142937 253.511099 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4279
  </g>
4280
  <g id="line2d_28">
4281
  <g>
4282
+ <use ns4:href="#m0fca2865ba" x="47.72" y="253.511099" style="stroke: #000000; stroke-width: 0.8" />
4283
  </g>
4284
  </g>
4285
  <g id="text_28">
4286
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="257.310318" transform="rotate(-0 40.72 257.310318)">0.4</text>
4287
  </g>
4288
  </g>
4289
  <g id="ytick_5">
4290
  <g id="grid-y--6" class="grid grid-y">
4291
+ <path d="M 47.72 206.659543 L 823.142937 206.659543 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4292
  </g>
4293
  <g id="line2d_29">
4294
  <g>
4295
+ <use ns4:href="#m0fca2865ba" x="47.72" y="206.659543" style="stroke: #000000; stroke-width: 0.8" />
4296
  </g>
4297
  </g>
4298
  <g id="text_29">
4299
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="210.458761" transform="rotate(-0 40.72 210.458761)">0.5</text>
4300
  </g>
4301
  </g>
4302
  <g id="ytick_6">
4303
  <g id="grid-y--7" class="grid grid-y">
4304
+ <path d="M 47.72 159.807986 L 823.142937 159.807986 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4305
  </g>
4306
  <g id="line2d_30">
4307
  <g>
4308
+ <use ns4:href="#m0fca2865ba" x="47.72" y="159.807986" style="stroke: #000000; stroke-width: 0.8" />
4309
  </g>
4310
  </g>
4311
  <g id="text_30">
4312
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="163.607205" transform="rotate(-0 40.72 163.607205)">0.6</text>
4313
  </g>
4314
  </g>
4315
  <g id="ytick_7">
4316
  <g id="grid-y--8" class="grid grid-y">
4317
+ <path d="M 47.72 112.956429 L 823.142937 112.956429 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4318
  </g>
4319
  <g id="line2d_31">
4320
  <g>
4321
+ <use ns4:href="#m0fca2865ba" x="47.72" y="112.956429" style="stroke: #000000; stroke-width: 0.8" />
4322
  </g>
4323
  </g>
4324
  <g id="text_31">
4325
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="116.755648" transform="rotate(-0 40.72 116.755648)">0.7</text>
4326
  </g>
4327
  </g>
4328
  <g id="ytick_8">
4329
  <g id="grid-y--9" class="grid grid-y">
4330
+ <path d="M 47.72 66.104873 L 823.142937 66.104873 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4331
  </g>
4332
  <g id="line2d_32">
4333
  <g>
4334
+ <use ns4:href="#m0fca2865ba" x="47.72" y="66.104873" style="stroke: #000000; stroke-width: 0.8" />
4335
  </g>
4336
  </g>
4337
  <g id="text_32">
4338
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="69.904092" transform="rotate(-0 40.72 69.904092)">0.8</text>
4339
  </g>
4340
  </g>
4341
  <g id="label--y" class="ylabel">
 
4343
  </g>
4344
  </g>
4345
  <g id="series--hf-kernels-rotary" class="series">
4346
+ <path d="M 82.966497 405.060892 L 113.615625 399.43402 L 144.264753 400.029504 L 174.913881 399.696858 L 205.563009 399.059208 L 236.212137 400.038874 L 266.861265 399.640636 L 297.510393 400.038405 L 328.159521 400.006078 L 358.808648 399.691704 L 389.457776 399.640167 L 420.106904 318.131109 L 450.756032 400.455853 L 481.40516 400.197701 L 512.054288 399.907221 L 542.703416 399.860838 L 573.352544 400.20754 L 604.001672 400.567828 L 634.6508 399.780722 L 665.299928 400.403848 L 695.949056 399.312675 L 726.598184 400.328885 L 757.247312 320.371082 L 787.896439 44.888614 " clip-path="url(#p088c925177)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4347
  <defs>
4348
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4349
  </defs>
4350
  <g clip-path="url(#p088c925177)">
4351
  <use ns4:href="#md7efaf3aec" x="82.966497" y="405.060892" style="fill: #1f77b4; stroke: #1f77b4" />
4352
+ <use ns4:href="#md7efaf3aec" x="113.615625" y="399.43402" style="fill: #1f77b4; stroke: #1f77b4" />
4353
+ <use ns4:href="#md7efaf3aec" x="144.264753" y="400.029504" style="fill: #1f77b4; stroke: #1f77b4" />
4354
+ <use ns4:href="#md7efaf3aec" x="174.913881" y="399.696858" style="fill: #1f77b4; stroke: #1f77b4" />
4355
+ <use ns4:href="#md7efaf3aec" x="205.563009" y="399.059208" style="fill: #1f77b4; stroke: #1f77b4" />
4356
+ <use ns4:href="#md7efaf3aec" x="236.212137" y="400.038874" style="fill: #1f77b4; stroke: #1f77b4" />
4357
+ <use ns4:href="#md7efaf3aec" x="266.861265" y="399.640636" style="fill: #1f77b4; stroke: #1f77b4" />
4358
+ <use ns4:href="#md7efaf3aec" x="297.510393" y="400.038405" style="fill: #1f77b4; stroke: #1f77b4" />
4359
+ <use ns4:href="#md7efaf3aec" x="328.159521" y="400.006078" style="fill: #1f77b4; stroke: #1f77b4" />
4360
+ <use ns4:href="#md7efaf3aec" x="358.808648" y="399.691704" style="fill: #1f77b4; stroke: #1f77b4" />
4361
+ <use ns4:href="#md7efaf3aec" x="389.457776" y="399.640167" style="fill: #1f77b4; stroke: #1f77b4" />
4362
+ <use ns4:href="#md7efaf3aec" x="420.106904" y="318.131109" style="fill: #1f77b4; stroke: #1f77b4" />
4363
+ <use ns4:href="#md7efaf3aec" x="450.756032" y="400.455853" style="fill: #1f77b4; stroke: #1f77b4" />
4364
+ <use ns4:href="#md7efaf3aec" x="481.40516" y="400.197701" style="fill: #1f77b4; stroke: #1f77b4" />
4365
+ <use ns4:href="#md7efaf3aec" x="512.054288" y="399.907221" style="fill: #1f77b4; stroke: #1f77b4" />
4366
+ <use ns4:href="#md7efaf3aec" x="542.703416" y="399.860838" style="fill: #1f77b4; stroke: #1f77b4" />
4367
+ <use ns4:href="#md7efaf3aec" x="573.352544" y="400.20754" style="fill: #1f77b4; stroke: #1f77b4" />
4368
+ <use ns4:href="#md7efaf3aec" x="604.001672" y="400.567828" style="fill: #1f77b4; stroke: #1f77b4" />
4369
+ <use ns4:href="#md7efaf3aec" x="634.6508" y="399.780722" style="fill: #1f77b4; stroke: #1f77b4" />
4370
+ <use ns4:href="#md7efaf3aec" x="665.299928" y="400.403848" style="fill: #1f77b4; stroke: #1f77b4" />
4371
+ <use ns4:href="#md7efaf3aec" x="695.949056" y="399.312675" style="fill: #1f77b4; stroke: #1f77b4" />
4372
+ <use ns4:href="#md7efaf3aec" x="726.598184" y="400.328885" style="fill: #1f77b4; stroke: #1f77b4" />
4373
+ <use ns4:href="#md7efaf3aec" x="757.247312" y="320.371082" style="fill: #1f77b4; stroke: #1f77b4" />
4374
  <use ns4:href="#md7efaf3aec" x="787.896439" y="44.888614" style="fill: #1f77b4; stroke: #1f77b4" />
4375
  </g>
4376
  </g>
4377
  <g id="series--torch-eager" class="series">
4378
+ <path d="M 82.966497 358.443125 L 113.615625 339.472461 L 144.264753 340.171018 L 174.913881 340.100272 L 205.563009 340.151809 L 236.212137 340.65312 L 266.861265 340.953439 L 297.510393 340.452127 L 328.159521 340.709811 L 358.808648 340.704657 L 389.457776 340.901902 L 420.106904 336.056983 L 450.756032 340.784305 L 481.40516 339.590059 L 512.054288 339.749354 L 542.703416 339.036742 L 573.352544 338.708781 L 604.001672 339.515096 L 634.6508 339.566633 L 665.299928 340.438072 L 695.949056 339.247574 L 726.598184 340.442288 L 757.247312 333.026156 L 787.896439 137.089198 " clip-path="url(#p088c925177)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4379
  <defs>
4380
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4381
  </defs>
4382
  <g clip-path="url(#p088c925177)">
4383
+ <use ns4:href="#m9b8c54d372" x="82.966497" y="358.443125" style="fill: #ff7f0e; stroke: #ff7f0e" />
4384
+ <use ns4:href="#m9b8c54d372" x="113.615625" y="339.472461" style="fill: #ff7f0e; stroke: #ff7f0e" />
4385
+ <use ns4:href="#m9b8c54d372" x="144.264753" y="340.171018" style="fill: #ff7f0e; stroke: #ff7f0e" />
4386
+ <use ns4:href="#m9b8c54d372" x="174.913881" y="340.100272" style="fill: #ff7f0e; stroke: #ff7f0e" />
4387
+ <use ns4:href="#m9b8c54d372" x="205.563009" y="340.151809" style="fill: #ff7f0e; stroke: #ff7f0e" />
4388
+ <use ns4:href="#m9b8c54d372" x="236.212137" y="340.65312" style="fill: #ff7f0e; stroke: #ff7f0e" />
4389
+ <use ns4:href="#m9b8c54d372" x="266.861265" y="340.953439" style="fill: #ff7f0e; stroke: #ff7f0e" />
4390
+ <use ns4:href="#m9b8c54d372" x="297.510393" y="340.452127" style="fill: #ff7f0e; stroke: #ff7f0e" />
4391
+ <use ns4:href="#m9b8c54d372" x="328.159521" y="340.709811" style="fill: #ff7f0e; stroke: #ff7f0e" />
4392
+ <use ns4:href="#m9b8c54d372" x="358.808648" y="340.704657" style="fill: #ff7f0e; stroke: #ff7f0e" />
4393
+ <use ns4:href="#m9b8c54d372" x="389.457776" y="340.901902" style="fill: #ff7f0e; stroke: #ff7f0e" />
4394
+ <use ns4:href="#m9b8c54d372" x="420.106904" y="336.056983" style="fill: #ff7f0e; stroke: #ff7f0e" />
4395
+ <use ns4:href="#m9b8c54d372" x="450.756032" y="340.784305" style="fill: #ff7f0e; stroke: #ff7f0e" />
4396
+ <use ns4:href="#m9b8c54d372" x="481.40516" y="339.590059" style="fill: #ff7f0e; stroke: #ff7f0e" />
4397
+ <use ns4:href="#m9b8c54d372" x="512.054288" y="339.749354" style="fill: #ff7f0e; stroke: #ff7f0e" />
4398
+ <use ns4:href="#m9b8c54d372" x="542.703416" y="339.036742" style="fill: #ff7f0e; stroke: #ff7f0e" />
4399
+ <use ns4:href="#m9b8c54d372" x="573.352544" y="338.708781" style="fill: #ff7f0e; stroke: #ff7f0e" />
4400
+ <use ns4:href="#m9b8c54d372" x="604.001672" y="339.515096" style="fill: #ff7f0e; stroke: #ff7f0e" />
4401
+ <use ns4:href="#m9b8c54d372" x="634.6508" y="339.566633" style="fill: #ff7f0e; stroke: #ff7f0e" />
4402
+ <use ns4:href="#m9b8c54d372" x="665.299928" y="340.438072" style="fill: #ff7f0e; stroke: #ff7f0e" />
4403
+ <use ns4:href="#m9b8c54d372" x="695.949056" y="339.247574" style="fill: #ff7f0e; stroke: #ff7f0e" />
4404
+ <use ns4:href="#m9b8c54d372" x="726.598184" y="340.442288" style="fill: #ff7f0e; stroke: #ff7f0e" />
4405
+ <use ns4:href="#m9b8c54d372" x="757.247312" y="333.026156" style="fill: #ff7f0e; stroke: #ff7f0e" />
4406
+ <use ns4:href="#m9b8c54d372" x="787.896439" y="137.089198" style="fill: #ff7f0e; stroke: #ff7f0e" />
4407
  </g>
4408
  </g>
4409
  <g id="patch_3">
 
4461
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
4462
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
4463
  </span> |
4464
+ Cell: combine | 4.85s
4465
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
4466
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
4467
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
 
4551
  hf_kernels_rotary cuda_B1_S128_H32_D128_R64 0.09 True
4552
  hf_kernels_rotary cuda_B1_S128_H32_D64_R32 0.09 True
4553
  hf_kernels_rotary cuda_B1_S128_H8_D128_R64 0.09 True
4554
+ hf_kernels_rotary cuda_B1_S128_H8_D64_R32 0.08 True
4555
  hf_kernels_rotary cuda_B1_S2048_H32_D128_R64 0.26 True
4556
  hf_kernels_rotary cuda_B1_S2048_H32_D64_R32 0.09 True
4557
  hf_kernels_rotary cuda_B1_S2048_H8_D128_R64 0.09 True
 
4573
  hf_kernels_rotary cuda_B2_S512_H8_D128_R64 0.09 True
4574
  hf_kernels_rotary cuda_B2_S512_H8_D64_R32 0.09 True
4575
  torch_eager cuda_B1_S128_H32_D128_R64 0.22 True
4576
+ torch_eager cuda_B1_S128_H32_D64_R32 0.22 True
4577
+ torch_eager cuda_B1_S128_H8_D128_R64 0.22 True
4578
  torch_eager cuda_B1_S128_H8_D64_R32 0.18 True
4579
+ torch_eager cuda_B1_S2048_H32_D128_R64 0.22 True
4580
+ torch_eager cuda_B1_S2048_H32_D64_R32 0.21 True
4581
+ torch_eager cuda_B1_S2048_H8_D128_R64 0.21 True
4582
+ torch_eager cuda_B1_S2048_H8_D64_R32 0.21 True
4583
+ torch_eager cuda_B1_S512_H32_D128_R64 0.21 True
4584
+ torch_eager cuda_B1_S512_H32_D64_R32 0.21 True
4585
+ torch_eager cuda_B1_S512_H8_D128_R64 0.21 True
4586
+ torch_eager cuda_B1_S512_H8_D64_R32 0.22 True
4587
  torch_eager cuda_B2_S128_H32_D128_R64 0.22 True
4588
  torch_eager cuda_B2_S128_H32_D64_R32 0.22 True
4589
  torch_eager cuda_B2_S128_H8_D128_R64 0.22 True
4590
+ torch_eager cuda_B2_S128_H8_D64_R32 0.21 True
4591
  torch_eager cuda_B2_S2048_H32_D128_R64 0.65 True
4592
  torch_eager cuda_B2_S2048_H32_D64_R32 0.23 True
4593
+ torch_eager cuda_B2_S2048_H8_D128_R64 0.21 True
4594
  torch_eager cuda_B2_S2048_H8_D64_R32 0.22 True
4595
+ torch_eager cuda_B2_S512_H32_D128_R64 0.21 True
4596
+ torch_eager cuda_B2_S512_H32_D64_R32 0.22 True
4597
  torch_eager cuda_B2_S512_H8_D128_R64 0.22 True
4598
+ torch_eager cuda_B2_S512_H8_D64_R32 0.22 True
4599
 
4600
  GENERATING COMBINED VISUALIZATION
4601
 
 
4615
  <div class="uv-install-logs" id="uv-logs-combine">
4616
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4617
  <div class="uv-logs-content" style="display: none;">
4618
+ Installed 37 packages in 330ms
4619
  </div>
4620
  </div>
4621
  <div class="cell-artifacts">
 
4628
  <rdf:RDF>
4629
  <ns2:Work>
4630
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4631
+ <dc:date>2025-12-19T19:09:41.164726</dc:date>
4632
  <dc:format>image/svg+xml</dc:format>
4633
  <dc:creator>
4634
  <ns2:Agent>
4635
+ <dc:title>Matplotlib v3.10.8, https://matplotlib.org/</dc:title>
4636
  </ns2:Agent>
4637
  </dc:creator>
4638
  </ns2:Work>
 
4972
  <g id="matplotlib.axis_2">
4973
  <g id="ytick_1">
4974
  <g id="grid-y--2" class="grid grid-y">
4975
+ <path d="M 47.72 394.065769 L 823.142937 394.065769 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4976
  </g>
4977
  <g id="line2d_25">
4978
  <defs>
4979
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4980
  </defs>
4981
  <g>
4982
+ <use ns4:href="#m0fca2865ba" x="47.72" y="394.065769" style="stroke: #000000; stroke-width: 0.8" />
4983
  </g>
4984
  </g>
4985
  <g id="text_25">
4986
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="397.864988" transform="rotate(-0 40.72 397.864988)">0.1</text>
4987
  </g>
4988
  </g>
4989
  <g id="ytick_2">
4990
  <g id="grid-y--3" class="grid grid-y">
4991
+ <path d="M 47.72 347.214212 L 823.142937 347.214212 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4992
  </g>
4993
  <g id="line2d_26">
4994
  <g>
4995
+ <use ns4:href="#m0fca2865ba" x="47.72" y="347.214212" style="stroke: #000000; stroke-width: 0.8" />
4996
  </g>
4997
  </g>
4998
  <g id="text_26">
4999
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="351.013431" transform="rotate(-0 40.72 351.013431)">0.2</text>
5000
  </g>
5001
  </g>
5002
  <g id="ytick_3">
5003
  <g id="grid-y--4" class="grid grid-y">
5004
+ <path d="M 47.72 300.362656 L 823.142937 300.362656 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
5005
  </g>
5006
  <g id="line2d_27">
5007
  <g>
5008
+ <use ns4:href="#m0fca2865ba" x="47.72" y="300.362656" style="stroke: #000000; stroke-width: 0.8" />
5009
  </g>
5010
  </g>
5011
  <g id="text_27">
5012
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="304.161875" transform="rotate(-0 40.72 304.161875)">0.3</text>
5013
  </g>
5014
  </g>
5015
  <g id="ytick_4">
5016
  <g id="grid-y--5" class="grid grid-y">
5017
+ <path d="M 47.72 253.511099 L 823.142937 253.511099 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
5018
  </g>
5019
  <g id="line2d_28">
5020
  <g>
5021
+ <use ns4:href="#m0fca2865ba" x="47.72" y="253.511099" style="stroke: #000000; stroke-width: 0.8" />
5022
  </g>
5023
  </g>
5024
  <g id="text_28">
5025
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="257.310318" transform="rotate(-0 40.72 257.310318)">0.4</text>
5026
  </g>
5027
  </g>
5028
  <g id="ytick_5">
5029
  <g id="grid-y--6" class="grid grid-y">
5030
+ <path d="M 47.72 206.659543 L 823.142937 206.659543 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
5031
  </g>
5032
  <g id="line2d_29">
5033
  <g>
5034
+ <use ns4:href="#m0fca2865ba" x="47.72" y="206.659543" style="stroke: #000000; stroke-width: 0.8" />
5035
  </g>
5036
  </g>
5037
  <g id="text_29">
5038
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="210.458761" transform="rotate(-0 40.72 210.458761)">0.5</text>
5039
  </g>
5040
  </g>
5041
  <g id="ytick_6">
5042
  <g id="grid-y--7" class="grid grid-y">
5043
+ <path d="M 47.72 159.807986 L 823.142937 159.807986 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
5044
  </g>
5045
  <g id="line2d_30">
5046
  <g>
5047
+ <use ns4:href="#m0fca2865ba" x="47.72" y="159.807986" style="stroke: #000000; stroke-width: 0.8" />
5048
  </g>
5049
  </g>
5050
  <g id="text_30">
5051
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="163.607205" transform="rotate(-0 40.72 163.607205)">0.6</text>
5052
  </g>
5053
  </g>
5054
  <g id="ytick_7">
5055
  <g id="grid-y--8" class="grid grid-y">
5056
+ <path d="M 47.72 112.956429 L 823.142937 112.956429 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
5057
  </g>
5058
  <g id="line2d_31">
5059
  <g>
5060
+ <use ns4:href="#m0fca2865ba" x="47.72" y="112.956429" style="stroke: #000000; stroke-width: 0.8" />
5061
  </g>
5062
  </g>
5063
  <g id="text_31">
5064
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="116.755648" transform="rotate(-0 40.72 116.755648)">0.7</text>
5065
  </g>
5066
  </g>
5067
  <g id="ytick_8">
5068
  <g id="grid-y--9" class="grid grid-y">
5069
+ <path d="M 47.72 66.104873 L 823.142937 66.104873 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
5070
  </g>
5071
  <g id="line2d_32">
5072
  <g>
5073
+ <use ns4:href="#m0fca2865ba" x="47.72" y="66.104873" style="stroke: #000000; stroke-width: 0.8" />
5074
  </g>
5075
  </g>
5076
  <g id="text_32">
5077
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="69.904092" transform="rotate(-0 40.72 69.904092)">0.8</text>
5078
  </g>
5079
  </g>
5080
  <g id="label--y" class="ylabel">
 
5082
  </g>
5083
  </g>
5084
  <g id="series--hf-kernels-rotary" class="series">
5085
+ <path d="M 82.966497 405.060892 L 113.615625 399.43402 L 144.264753 400.029504 L 174.913881 399.696858 L 205.563009 399.059208 L 236.212137 400.038874 L 266.861265 399.640636 L 297.510393 400.038405 L 328.159521 400.006078 L 358.808648 399.691704 L 389.457776 399.640167 L 420.106904 318.131109 L 450.756032 400.455853 L 481.40516 400.197701 L 512.054288 399.907221 L 542.703416 399.860838 L 573.352544 400.20754 L 604.001672 400.567828 L 634.6508 399.780722 L 665.299928 400.403848 L 695.949056 399.312675 L 726.598184 400.328885 L 757.247312 320.371082 L 787.896439 44.888614 " clip-path="url(#p088c925177)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
5086
  <defs>
5087
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
5088
  </defs>
5089
  <g clip-path="url(#p088c925177)">
5090
  <use ns4:href="#md7efaf3aec" x="82.966497" y="405.060892" style="fill: #1f77b4; stroke: #1f77b4" />
5091
+ <use ns4:href="#md7efaf3aec" x="113.615625" y="399.43402" style="fill: #1f77b4; stroke: #1f77b4" />
5092
+ <use ns4:href="#md7efaf3aec" x="144.264753" y="400.029504" style="fill: #1f77b4; stroke: #1f77b4" />
5093
+ <use ns4:href="#md7efaf3aec" x="174.913881" y="399.696858" style="fill: #1f77b4; stroke: #1f77b4" />
5094
+ <use ns4:href="#md7efaf3aec" x="205.563009" y="399.059208" style="fill: #1f77b4; stroke: #1f77b4" />
5095
+ <use ns4:href="#md7efaf3aec" x="236.212137" y="400.038874" style="fill: #1f77b4; stroke: #1f77b4" />
5096
+ <use ns4:href="#md7efaf3aec" x="266.861265" y="399.640636" style="fill: #1f77b4; stroke: #1f77b4" />
5097
+ <use ns4:href="#md7efaf3aec" x="297.510393" y="400.038405" style="fill: #1f77b4; stroke: #1f77b4" />
5098
+ <use ns4:href="#md7efaf3aec" x="328.159521" y="400.006078" style="fill: #1f77b4; stroke: #1f77b4" />
5099
+ <use ns4:href="#md7efaf3aec" x="358.808648" y="399.691704" style="fill: #1f77b4; stroke: #1f77b4" />
5100
+ <use ns4:href="#md7efaf3aec" x="389.457776" y="399.640167" style="fill: #1f77b4; stroke: #1f77b4" />
5101
+ <use ns4:href="#md7efaf3aec" x="420.106904" y="318.131109" style="fill: #1f77b4; stroke: #1f77b4" />
5102
+ <use ns4:href="#md7efaf3aec" x="450.756032" y="400.455853" style="fill: #1f77b4; stroke: #1f77b4" />
5103
+ <use ns4:href="#md7efaf3aec" x="481.40516" y="400.197701" style="fill: #1f77b4; stroke: #1f77b4" />
5104
+ <use ns4:href="#md7efaf3aec" x="512.054288" y="399.907221" style="fill: #1f77b4; stroke: #1f77b4" />
5105
+ <use ns4:href="#md7efaf3aec" x="542.703416" y="399.860838" style="fill: #1f77b4; stroke: #1f77b4" />
5106
+ <use ns4:href="#md7efaf3aec" x="573.352544" y="400.20754" style="fill: #1f77b4; stroke: #1f77b4" />
5107
+ <use ns4:href="#md7efaf3aec" x="604.001672" y="400.567828" style="fill: #1f77b4; stroke: #1f77b4" />
5108
+ <use ns4:href="#md7efaf3aec" x="634.6508" y="399.780722" style="fill: #1f77b4; stroke: #1f77b4" />
5109
+ <use ns4:href="#md7efaf3aec" x="665.299928" y="400.403848" style="fill: #1f77b4; stroke: #1f77b4" />
5110
+ <use ns4:href="#md7efaf3aec" x="695.949056" y="399.312675" style="fill: #1f77b4; stroke: #1f77b4" />
5111
+ <use ns4:href="#md7efaf3aec" x="726.598184" y="400.328885" style="fill: #1f77b4; stroke: #1f77b4" />
5112
+ <use ns4:href="#md7efaf3aec" x="757.247312" y="320.371082" style="fill: #1f77b4; stroke: #1f77b4" />
5113
  <use ns4:href="#md7efaf3aec" x="787.896439" y="44.888614" style="fill: #1f77b4; stroke: #1f77b4" />
5114
  </g>
5115
  </g>
5116
  <g id="series--torch-eager" class="series">
5117
+ <path d="M 82.966497 358.443125 L 113.615625 339.472461 L 144.264753 340.171018 L 174.913881 340.100272 L 205.563009 340.151809 L 236.212137 340.65312 L 266.861265 340.953439 L 297.510393 340.452127 L 328.159521 340.709811 L 358.808648 340.704657 L 389.457776 340.901902 L 420.106904 336.056983 L 450.756032 340.784305 L 481.40516 339.590059 L 512.054288 339.749354 L 542.703416 339.036742 L 573.352544 338.708781 L 604.001672 339.515096 L 634.6508 339.566633 L 665.299928 340.438072 L 695.949056 339.247574 L 726.598184 340.442288 L 757.247312 333.026156 L 787.896439 137.089198 " clip-path="url(#p088c925177)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
5118
  <defs>
5119
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
5120
  </defs>
5121
  <g clip-path="url(#p088c925177)">
5122
+ <use ns4:href="#m9b8c54d372" x="82.966497" y="358.443125" style="fill: #ff7f0e; stroke: #ff7f0e" />
5123
+ <use ns4:href="#m9b8c54d372" x="113.615625" y="339.472461" style="fill: #ff7f0e; stroke: #ff7f0e" />
5124
+ <use ns4:href="#m9b8c54d372" x="144.264753" y="340.171018" style="fill: #ff7f0e; stroke: #ff7f0e" />
5125
+ <use ns4:href="#m9b8c54d372" x="174.913881" y="340.100272" style="fill: #ff7f0e; stroke: #ff7f0e" />
5126
+ <use ns4:href="#m9b8c54d372" x="205.563009" y="340.151809" style="fill: #ff7f0e; stroke: #ff7f0e" />
5127
+ <use ns4:href="#m9b8c54d372" x="236.212137" y="340.65312" style="fill: #ff7f0e; stroke: #ff7f0e" />
5128
+ <use ns4:href="#m9b8c54d372" x="266.861265" y="340.953439" style="fill: #ff7f0e; stroke: #ff7f0e" />
5129
+ <use ns4:href="#m9b8c54d372" x="297.510393" y="340.452127" style="fill: #ff7f0e; stroke: #ff7f0e" />
5130
+ <use ns4:href="#m9b8c54d372" x="328.159521" y="340.709811" style="fill: #ff7f0e; stroke: #ff7f0e" />
5131
+ <use ns4:href="#m9b8c54d372" x="358.808648" y="340.704657" style="fill: #ff7f0e; stroke: #ff7f0e" />
5132
+ <use ns4:href="#m9b8c54d372" x="389.457776" y="340.901902" style="fill: #ff7f0e; stroke: #ff7f0e" />
5133
+ <use ns4:href="#m9b8c54d372" x="420.106904" y="336.056983" style="fill: #ff7f0e; stroke: #ff7f0e" />
5134
+ <use ns4:href="#m9b8c54d372" x="450.756032" y="340.784305" style="fill: #ff7f0e; stroke: #ff7f0e" />
5135
+ <use ns4:href="#m9b8c54d372" x="481.40516" y="339.590059" style="fill: #ff7f0e; stroke: #ff7f0e" />
5136
+ <use ns4:href="#m9b8c54d372" x="512.054288" y="339.749354" style="fill: #ff7f0e; stroke: #ff7f0e" />
5137
+ <use ns4:href="#m9b8c54d372" x="542.703416" y="339.036742" style="fill: #ff7f0e; stroke: #ff7f0e" />
5138
+ <use ns4:href="#m9b8c54d372" x="573.352544" y="338.708781" style="fill: #ff7f0e; stroke: #ff7f0e" />
5139
+ <use ns4:href="#m9b8c54d372" x="604.001672" y="339.515096" style="fill: #ff7f0e; stroke: #ff7f0e" />
5140
+ <use ns4:href="#m9b8c54d372" x="634.6508" y="339.566633" style="fill: #ff7f0e; stroke: #ff7f0e" />
5141
+ <use ns4:href="#m9b8c54d372" x="665.299928" y="340.438072" style="fill: #ff7f0e; stroke: #ff7f0e" />
5142
+ <use ns4:href="#m9b8c54d372" x="695.949056" y="339.247574" style="fill: #ff7f0e; stroke: #ff7f0e" />
5143
+ <use ns4:href="#m9b8c54d372" x="726.598184" y="340.442288" style="fill: #ff7f0e; stroke: #ff7f0e" />
5144
+ <use ns4:href="#m9b8c54d372" x="757.247312" y="333.026156" style="fill: #ff7f0e; stroke: #ff7f0e" />
5145
+ <use ns4:href="#m9b8c54d372" x="787.896439" y="137.089198" style="fill: #ff7f0e; stroke: #ff7f0e" />
5146
  </g>
5147
  </g>
5148
  <g id="patch_3">