drbh HF Staff committed on
Commit
dac61af
·
verified ·
1 Parent(s): d87c146

Upload folder using huggingface_hub

Browse files
Files changed (49) hide show
  1. activation/impls/artifacts/benchmark/activation.jsonl +9 -9
  2. activation/impls/cells/benchmark.py +7 -13
  3. activation/impls/hf_kernels_swiglu.html +98 -97
  4. activation/impls/torch_swiglu.html +123 -129
  5. activation/index.html +1 -1
  6. activation/results_linux/artifacts/combine/latency.svg +3 -0
  7. activation/results_linux/cells/combine.py +26 -0
  8. activation/results_linux/combined_results.html +0 -0
  9. activation/results_linux/index.html +88 -0
  10. causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl +24 -24
  11. causal_conv1d/impls/cells/benchmark.py +9 -18
  12. causal_conv1d/impls/hf_kernels_causal_conv1d.html +0 -0
  13. causal_conv1d/impls/torch_causal_conv1d.html +0 -0
  14. causal_conv1d/results/artifacts/combine/latency.svg +2 -2
  15. causal_conv1d/results/combined_results.html +131 -131
  16. deformable_detr/impls/artifacts/benchmark/deformable_detr.jsonl +4 -4
  17. deformable_detr/impls/cells/benchmark.py +94 -18
  18. deformable_detr/impls/hf_kernels_deformable_detr.html +79 -80
  19. deformable_detr/impls/torch_deformable_detr.html +97 -103
  20. deformable_detr/results/artifacts/combine/latency.svg +2 -2
  21. deformable_detr/results/combined_results.html +126 -230
  22. flash_attn/impls/artifacts/benchmark/attention.jsonl +6 -6
  23. flash_attn/impls/cells/benchmark.py +9 -11
  24. flash_attn/impls/flash_attention.html +142 -142
  25. flash_attn/impls/hf_kernels_flash_attn.html +94 -94
  26. flash_attn/impls/hf_kernels_flash_attn3.html +83 -81
  27. flash_attn/impls/mem_efficient_attention.html +140 -134
  28. flash_attn/impls/sage_attention.html +12 -10
  29. flash_attn/impls/xformers.html +90 -90
  30. flash_attn/results/artifacts/combine/latency.svg +2 -2
  31. flash_attn/results/combined_results.html +143 -143
  32. index.html +27 -1
  33. layer_norm/impls/artifacts/benchmark/layer_norm.jsonl +4 -4
  34. layer_norm/impls/hf_kernels_layer_norm.html +58 -54
  35. layer_norm/impls/torch_layer_norm.html +100 -54
  36. layer_norm/results/artifacts/combine/latency.svg +2 -2
  37. layer_norm/results/combined_results.html +53 -53
  38. openai_moe/impls/artifacts/benchmark/openai_moe.jsonl +8 -8
  39. openai_moe/impls/binned_torch.html +194 -188
  40. openai_moe/impls/cells/benchmark.py +40 -95
  41. openai_moe/impls/gpt_oss_moe.html +192 -194
  42. openai_moe/results/artifacts/combine/latency.svg +2 -2
  43. openai_moe/results/combined_results.html +94 -94
  44. rotary/impls/artifacts/benchmark/rotary.jsonl +24 -24
  45. rotary/impls/cells/benchmark.py +12 -21
  46. rotary/impls/hf_kernels_rotary.html +0 -0
  47. rotary/impls/torch_rotary.html +0 -0
  48. rotary/results/artifacts/combine/latency.svg +2 -2
  49. rotary/results/combined_results.html +171 -171
activation/impls/artifacts/benchmark/activation.jsonl CHANGED
@@ -1,9 +1,9 @@
1
- {"ts": "2025-12-19T19:09:36Z", "run": "8f147eb9175a4bb285586e9acc883123", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D768", "num_tokens": 128, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.024160000066331122, "p50": 0.024919999987105257, "p90": 0.025289999939559493, "mean": 0.025252000023101573, "iqr": 0.0006499999471998308, "raw_times": [0.025289999939559493, 0.02725000013015233, 0.024639999992359662, 0.024919999987105257, 0.024160000066331122], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.030839999908494065, "peak_bytes": 1966080, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
2
- {"ts": "2025-12-19T19:09:36Z", "run": "8f147eb9175a4bb285586e9acc883123", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D1024", "num_tokens": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.027569999929255573, "p50": 0.029069999982311856, "p90": 0.029229999881863478, "mean": 0.029034399949523504, "iqr": 0.0008489998890581774, "raw_times": [0.027569999929255573, 0.030920999961381312, 0.0283809999928053, 0.029229999881863478, 0.029069999982311856], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03184999991390214, "peak_bytes": 2621440, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
3
- {"ts": "2025-12-19T19:09:36Z", "run": "8f147eb9175a4bb285586e9acc883123", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D2048", "num_tokens": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0278800000614865, "p50": 0.02896099999816215, "p90": 0.029151000035199104, "mean": 0.02896060000239231, "iqr": 0.0004910000370728085, "raw_times": [0.028659999998126295, 0.03015099991898751, 0.029151000035199104, 0.02896099999816215, 0.0278800000614865], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03205000007255876, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
4
- {"ts": "2025-12-19T19:09:36Z", "run": "8f147eb9175a4bb285586e9acc883123", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D768", "num_tokens": 256, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.027180999950360274, "p50": 0.028851000024587847, "p90": 0.029309999945326126, "mean": 0.02889839993258647, "iqr": 0.000470000031782547, "raw_times": [0.027180999950360274, 0.028851000024587847, 0.02883999991354358, 0.03030999982911453, 0.029309999945326126], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.030620000188719132, "peak_bytes": 3932160, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
5
- {"ts": "2025-12-19T19:09:36Z", "run": "8f147eb9175a4bb285586e9acc883123", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D1024", "num_tokens": 256, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.027590999934545835, "p50": 0.028819999897677917, "p90": 0.02953100010927301, "mean": 0.02878659997804789, "iqr": 0.0017000002117129043, "raw_times": [0.027590999934545835, 0.02953100010927301, 0.027830999897560105, 0.03016000005118258, 0.028819999897677917], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.031159999934970983, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
6
- {"ts": "2025-12-19T19:09:36Z", "run": "8f147eb9175a4bb285586e9acc883123", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D2048", "num_tokens": 256, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.026730000172392465, "p50": 0.028800999871236854, "p90": 0.02885999992940924, "mean": 0.028368599987516063, "iqr": 0.0005089998467155965, "raw_times": [0.026730000172392465, 0.02885999992940924, 0.02910099988184811, 0.028800999871236854, 0.028351000082693645], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.030940999977246975, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
7
- {"ts": "2025-12-19T19:09:36Z", "run": "8f147eb9175a4bb285586e9acc883123", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D768", "num_tokens": 512, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02737999989221862, "p50": 0.0283800000033807, "p90": 0.02853099999811093, "mean": 0.028162599983261316, "iqr": 0.0007899998308857903, "raw_times": [0.02737999989221862, 0.0283800000033807, 0.028780999855371192, 0.02774100016722514, 0.02853099999811093], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.034010999797828845, "peak_bytes": 7864320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
8
- {"ts": "2025-12-19T19:09:36Z", "run": "8f147eb9175a4bb285586e9acc883123", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D1024", "num_tokens": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02824100010911934, "p50": 0.028820000125051592, "p90": 0.02886099991883384, "mean": 0.029222400007711258, "iqr": 0.00022099993657320738, "raw_times": [0.02824100010911934, 0.02886099991883384, 0.028639999982260633, 0.03154999990329088, 0.028820000125051592], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.029901000061727245, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
9
- {"ts": "2025-12-19T19:09:36Z", "run": "8f147eb9175a4bb285586e9acc883123", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D2048", "num_tokens": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02627100002428051, "p50": 0.02855000002455199, "p90": 0.02863100007743924, "mean": 0.028174600083730184, "iqr": 0.0002599999788799323, "raw_times": [0.02627100002428051, 0.028371000098559307, 0.02863100007743924, 0.02905000019381987, 0.02855000002455199], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.02980999988722033, "peak_bytes": 20971520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
 
1
+ {"ts": "2025-12-19T19:54:24Z", "run": "9f2d48bd832042b09eca95e62da3c0de", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T128_D768", "num_tokens": 128, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04233100003148138, "p50": 0.043751000021075015, "p90": 0.044161999994685175, "mean": 0.04361539999990782, "iqr": 0.001740000016070553, "raw_times": [0.044161999994685175, 0.04541099997368292, 0.04242199997861462, 0.043751000021075015, 0.04233100003148138], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05063100002189458, "peak_bytes": 1966080, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
2
+ {"ts": "2025-12-19T19:54:24Z", "run": "9f2d48bd832042b09eca95e62da3c0de", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T128_D1024", "num_tokens": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.054010999974707374, "p50": 0.05540200004361395, "p90": 0.05709199990633351, "mean": 0.057631800018498325, "iqr": 0.0019599997358454857, "raw_times": [0.054010999974707374, 0.05513200017048803, 0.06652199999734876, 0.05540200004361395, 0.05709199990633351], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05926099993303069, "peak_bytes": 2621440, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
3
+ {"ts": "2025-12-19T19:54:24Z", "run": "9f2d48bd832042b09eca95e62da3c0de", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T128_D2048", "num_tokens": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05346099987946218, "p50": 0.054341999884854886, "p90": 0.05543199995372561, "mean": 0.054953799917711876, "iqr": 0.001390000079481979, "raw_times": [0.05346099987946218, 0.05543199995372561, 0.05749199999627308, 0.054341999884854886, 0.05404199987424363], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05924099991716503, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
4
+ {"ts": "2025-12-19T19:54:24Z", "run": "9f2d48bd832042b09eca95e62da3c0de", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T256_D768", "num_tokens": 256, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.053281999953469494, "p50": 0.054581999847869156, "p90": 0.05551200001718826, "mean": 0.054651799973726156, "iqr": 0.0014510001165035646, "raw_times": [0.05406099990068469, 0.05582200014941918, 0.05551200001718826, 0.054581999847869156, 0.053281999953469494], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05814099995404831, "peak_bytes": 3932160, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
5
+ {"ts": "2025-12-19T19:54:24Z", "run": "9f2d48bd832042b09eca95e62da3c0de", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T256_D1024", "num_tokens": 256, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05309099992700794, "p50": 0.05449100012810959, "p90": 0.05478200000652578, "mean": 0.05435540001599293, "iqr": 0.0010310000106983352, "raw_times": [0.05449100012810959, 0.055662000022493885, 0.05375099999582744, 0.05309099992700794, 0.05478200000652578], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.057451999964541756, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
6
+ {"ts": "2025-12-19T19:54:24Z", "run": "9f2d48bd832042b09eca95e62da3c0de", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T256_D2048", "num_tokens": 256, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.051550999842220335, "p50": 0.052460999995673774, "p90": 0.05307099991114228, "mean": 0.05247719996077649, "iqr": 0.000889999910214101, "raw_times": [0.051550999842220335, 0.05307099991114228, 0.05218100000092818, 0.052460999995673774, 0.05312200005391787], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.07207299995570793, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
7
+ {"ts": "2025-12-19T19:54:24Z", "run": "9f2d48bd832042b09eca95e62da3c0de", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T512_D768", "num_tokens": 512, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.052890999995725, "p50": 0.05325200004335784, "p90": 0.054772000112279784, "mean": 0.053839400061406195, "iqr": 0.001821000068957801, "raw_times": [0.05295100004332198, 0.054772000112279784, 0.05325200004335784, 0.052890999995725, 0.055331000112346373], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05688200008080457, "peak_bytes": 7864320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
8
+ {"ts": "2025-12-19T19:54:24Z", "run": "9f2d48bd832042b09eca95e62da3c0de", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T512_D1024", "num_tokens": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05233100000623381, "p50": 0.054522000027645845, "p90": 0.05475100010698952, "mean": 0.05385140002545086, "iqr": 0.0020300001324358163, "raw_times": [0.052720999974553706, 0.05475100010698952, 0.05233100000623381, 0.054932000011831406, 0.054522000027645845], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.056971000049088616, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
9
+ {"ts": "2025-12-19T19:54:24Z", "run": "9f2d48bd832042b09eca95e62da3c0de", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T512_D2048", "num_tokens": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.052550999953382416, "p50": 0.05365099991649913, "p90": 0.053941000032864395, "mean": 0.053534999960902496, "iqr": 0.0006200000370881753, "raw_times": [0.05421099990599032, 0.05365099991649913, 0.052550999953382416, 0.053941000032864395, 0.05332099999577622], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.058042000091518275, "peak_bytes": 20971520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
activation/impls/cells/benchmark.py CHANGED
@@ -4,7 +4,6 @@
4
  # "numpy",
5
  # "torch==2.8.0",
6
  # "kernels-benchmark-tools",
7
- # "kernels",
8
  # ]
9
  #
10
  # [tool.uv.sources]
@@ -13,22 +12,17 @@
13
  import torch
14
  import sys
15
  from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
16
- from kernels import get_kernel
17
 
18
- # Load the activation kernel
19
- activation = get_kernel("kernels-community/activation")
20
 
21
-
22
- def hf_kernels_swiglu(input_tensor):
23
- hidden_dim = input_tensor.shape[-1] // 2
24
- out_shape = input_tensor.shape[:-1] + (hidden_dim,)
25
- out = torch.empty(out_shape, dtype=input_tensor.dtype, device=input_tensor.device)
26
- return activation.silu_and_mul(out, input_tensor)
27
 
28
 
29
  run_benchmark(
30
  kernel_type=KernelTypeEnum.ACTIVATION,
31
- impl_name="hf_kernels_swiglu",
32
- impl_tags={"family": "hf-kernels", "backend": "cuda"},
33
- impl_func=hf_kernels_swiglu,
34
  )
 
4
  # "numpy",
5
  # "torch==2.8.0",
6
  # "kernels-benchmark-tools",
 
7
  # ]
8
  #
9
  # [tool.uv.sources]
 
12
  import torch
13
  import sys
14
  from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
15
+ import torch, torch.nn.functional as F
16
 
 
 
17
 
18
+ def swiglu_eager(x):
19
+ d = x.shape[-1] // 2
20
+ return F.silu(x[..., :d]) * x[..., d:]
 
 
 
21
 
22
 
23
  run_benchmark(
24
  kernel_type=KernelTypeEnum.ACTIVATION,
25
+ impl_name="torch_eager",
26
+ impl_tags={"family":"hf-kernels", "backend":"eager"},
27
+ impl_func=swiglu_eager,
28
  )
activation/impls/hf_kernels_swiglu.html CHANGED
@@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content {
3888
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3890
  </span> |
3891
- Cell: nv | 0.24s
3892
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3894
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3905,7 +3905,7 @@ Cell: nv | 0.24s
3905
  </div>
3906
  </div>
3907
  <div id="output-nv" class="cell-output">
3908
- <div class="cell-stdout"><pre class="stdout-text">Fri Dec 19 18:56:03 2025
3909
  +-----------------------------------------------------------------------------------------+
3910
  | NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 |
3911
  +-----------------------------------------+------------------------+----------------------+
@@ -3914,7 +3914,7 @@ Cell: nv | 0.24s
3914
  | | | MIG M. |
3915
  |=========================================+========================+======================|
3916
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3917
- | N/A 30C P0 77W / 350W | 0MiB / 46068MiB | 10% Default |
3918
  | | | N/A |
3919
  +-----------------------------------------+------------------------+----------------------+
3920
 
@@ -3938,7 +3938,7 @@ Cell: nv | 0.24s
3938
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3939
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3940
  </span> |
3941
- Cell: benchmark | 4.62s
3942
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3943
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3944
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3995,17 +3995,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D768
3995
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3996
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3997
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3998
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 72.704us 1747.69% 72.704us 72.704us 1
3999
- hf_kernels_swiglu 10.22% 211.154us 99.32% 2.053ms 2.053ms 0.000us 0.00% 5.600us 5.600us 1
4000
- _activation_23bf3fb::silu_and_mul 1.00% 20.580us 87.11% 1.800ms 600.140us 4.160us 100.00% 5.600us 1.867us 3
4001
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.160us 100.00% 4.160us 1.387us 3
4002
- Activity Buffer Request 84.13% 1.739ms 84.13% 1.739ms 1.739ms 1.440us 34.62% 1.440us 1.440us 1
4003
- aten::empty 1.99% 41.071us 1.99% 41.071us 13.690us 0.000us 0.00% 0.000us 0.000us 3
4004
- cudaLaunchKernel 1.99% 41.111us 1.99% 41.111us 13.704us 0.000us 0.00% 0.000us 0.000us 3
4005
- cudaDeviceSynchronize 0.68% 14.100us 0.68% 14.100us 14.100us 0.000us 0.00% 0.000us 0.000us 1
4006
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4007
- Self CPU time total: 2.067ms
4008
- Self CUDA time total: 4.160us
4009
 
4010
 
4011
 
@@ -4015,17 +4015,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D1024
4015
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4016
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4017
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4018
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 105.278us 2696.67% 105.278us 105.278us 1
4019
- hf_kernels_swiglu 7.13% 139.913us 99.69% 1.957ms 1.957ms 0.000us 0.00% 5.216us 5.216us 1
4020
- _activation_23bf3fb::silu_and_mul 1.22% 23.859us 91.38% 1.794ms 598.043us 3.904us 100.00% 5.216us 1.739us 3
4021
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 3.904us 100.00% 3.904us 1.301us 3
4022
- Activity Buffer Request 88.47% 1.737ms 88.47% 1.737ms 1.737ms 1.312us 33.61% 1.312us 1.312us 1
4023
- aten::empty 1.19% 23.420us 1.19% 23.420us 7.807us 0.000us 0.00% 0.000us 0.000us 3
4024
- cudaLaunchKernel 1.70% 33.281us 1.70% 33.281us 11.094us 0.000us 0.00% 0.000us 0.000us 3
4025
- cudaDeviceSynchronize 0.31% 6.000us 0.31% 6.000us 6.000us 0.000us 0.00% 0.000us 0.000us 1
4026
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4027
- Self CPU time total: 1.963ms
4028
- Self CUDA time total: 3.904us
4029
 
4030
 
4031
 
@@ -4035,17 +4035,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D2048
4035
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4036
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4037
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4038
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 62.849us 1275.09% 62.849us 62.849us 1
4039
- hf_kernels_swiglu 5.51% 105.232us 99.72% 1.903ms 1.903ms 0.000us 0.00% 6.594us 6.594us 1
4040
- _activation_23bf3fb::silu_and_mul 1.04% 19.839us 93.23% 1.779ms 593.100us 4.929us 100.00% 6.594us 2.198us 3
4041
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.929us 100.00% 4.929us 1.643us 3
4042
- Activity Buffer Request 90.86% 1.734ms 90.86% 1.734ms 1.734ms 1.665us 33.78% 1.665us 1.665us 1
4043
- aten::empty 0.98% 18.730us 0.98% 18.730us 6.243us 0.000us 0.00% 0.000us 0.000us 3
4044
- cudaLaunchKernel 1.33% 25.362us 1.33% 25.362us 8.454us 0.000us 0.00% 0.000us 0.000us 3
4045
- cudaDeviceSynchronize 0.28% 5.330us 0.28% 5.330us 5.330us 0.000us 0.00% 0.000us 0.000us 1
4046
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4047
- Self CPU time total: 1.909ms
4048
- Self CUDA time total: 4.929us
4049
 
4050
 
4051
 
@@ -4055,17 +4055,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D768
4055
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4056
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4057
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4058
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 64.512us 1515.79% 64.512us 64.512us 1
4059
- hf_kernels_swiglu 5.00% 107.783us 99.78% 2.152ms 2.152ms 0.000us 0.00% 5.696us 5.696us 1
4060
- _activation_23bf3fb::silu_and_mul 0.93% 20.060us 93.90% 2.025ms 675.114us 4.256us 100.00% 5.696us 1.899us 3
4061
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.256us 100.00% 4.256us 1.419us 3
4062
- Activity Buffer Request 82.83% 1.787ms 82.83% 1.787ms 1.787ms 1.440us 33.83% 1.440us 1.440us 1
4063
- aten::empty 0.89% 19.099us 0.89% 19.099us 6.366us 0.000us 0.00% 0.000us 0.000us 3
4064
- cudaLaunchKernel 10.14% 218.744us 10.14% 218.744us 72.915us 0.000us 0.00% 0.000us 0.000us 3
4065
- cudaDeviceSynchronize 0.22% 4.671us 0.22% 4.671us 4.671us 0.000us 0.00% 0.000us 0.000us 1
4066
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4067
- Self CPU time total: 2.157ms
4068
- Self CUDA time total: 4.256us
4069
 
4070
 
4071
 
@@ -4075,17 +4075,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D1024
4075
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4076
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4077
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4078
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 60.319us 1029.86% 60.319us 60.319us 1
4079
- hf_kernels_swiglu 13.59% 83.190us 99.22% 607.209us 607.209us 0.000us 0.00% 7.809us 7.809us 1
4080
- _activation_23bf3fb::silu_and_mul 3.33% 20.351us 82.60% 505.509us 168.503us 5.857us 100.00% 7.809us 2.603us 3
4081
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 5.857us 100.00% 5.857us 1.952us 3
4082
- Activity Buffer Request 46.13% 282.314us 46.13% 282.314us 282.314us 1.952us 33.33% 1.952us 1.952us 1
4083
- aten::empty 3.02% 18.510us 3.02% 18.510us 6.170us 0.000us 0.00% 0.000us 0.000us 3
4084
- cudaLaunchKernel 33.14% 202.844us 33.14% 202.844us 67.615us 0.000us 0.00% 0.000us 0.000us 3
4085
- cudaDeviceSynchronize 0.78% 4.791us 0.78% 4.791us 4.791us 0.000us 0.00% 0.000us 0.000us 1
4086
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4087
- Self CPU time total: 612.000us
4088
- Self CUDA time total: 5.857us
4089
 
4090
 
4091
 
@@ -4095,17 +4095,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D2048
4095
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4096
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4097
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4098
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 69.087us 899.57% 69.087us 69.087us 1
4099
- hf_kernels_swiglu 5.09% 105.021us 99.75% 2.059ms 2.059ms 0.000us 0.00% 10.240us 10.240us 1
4100
- _activation_23bf3fb::silu_and_mul 0.96% 19.861us 93.70% 1.934ms 644.594us 7.680us 100.00% 10.240us 3.413us 3
4101
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 7.680us 100.00% 7.680us 2.560us 3
4102
- Activity Buffer Request 83.16% 1.716ms 83.16% 1.716ms 1.716ms 2.560us 33.33% 2.560us 2.560us 1
4103
- aten::empty 0.96% 19.840us 0.96% 19.840us 6.613us 0.000us 0.00% 0.000us 0.000us 3
4104
- cudaLaunchKernel 9.57% 197.533us 9.57% 197.533us 65.844us 0.000us 0.00% 0.000us 0.000us 3
4105
- cudaDeviceSynchronize 0.25% 5.209us 0.25% 5.209us 5.209us 0.000us 0.00% 0.000us 0.000us 1
4106
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4107
- Self CPU time total: 2.064ms
4108
- Self CUDA time total: 7.680us
4109
 
4110
 
4111
 
@@ -4115,17 +4115,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D768
4115
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4116
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4117
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4118
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 63.615us 969.59% 63.615us 63.615us 1
4119
- hf_kernels_swiglu 4.67% 99.430us 99.78% 2.123ms 2.123ms 0.000us 0.00% 8.769us 8.769us 1
4120
- _activation_23bf3fb::silu_and_mul 0.94% 19.910us 94.25% 2.005ms 668.341us 6.561us 100.00% 8.769us 2.923us 3
4121
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 6.561us 100.00% 6.561us 2.187us 3
4122
- Activity Buffer Request 84.26% 1.793ms 84.26% 1.793ms 1.793ms 2.208us 33.65% 2.208us 2.208us 1
4123
- aten::empty 0.86% 18.221us 0.86% 18.221us 6.074us 0.000us 0.00% 0.000us 0.000us 3
4124
- cudaLaunchKernel 9.05% 192.544us 9.05% 192.544us 64.181us 0.000us 0.00% 0.000us 0.000us 3
4125
- cudaDeviceSynchronize 0.22% 4.771us 0.22% 4.771us 4.771us 0.000us 0.00% 0.000us 0.000us 1
4126
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4127
- Self CPU time total: 2.127ms
4128
- Self CUDA time total: 6.561us
4129
 
4130
 
4131
 
@@ -4135,17 +4135,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D1024
4135
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4136
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4137
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4138
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 58.655us 627.73% 58.655us 58.655us 1
4139
- hf_kernels_swiglu 14.96% 80.683us 99.03% 533.948us 533.948us 0.000us 0.00% 12.480us 12.480us 1
4140
- _activation_23bf3fb::silu_and_mul 3.95% 21.299us 80.75% 435.406us 145.135us 9.344us 100.00% 12.480us 4.160us 3
4141
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 9.344us 100.00% 9.344us 3.115us 3
4142
- Activity Buffer Request 41.04% 221.264us 41.04% 221.264us 221.264us 3.136us 33.56% 3.136us 3.136us 1
4143
- aten::empty 3.31% 17.859us 3.31% 17.859us 5.953us 0.000us 0.00% 0.000us 0.000us 3
4144
- cudaLaunchKernel 35.77% 192.843us 35.77% 192.843us 64.281us 0.000us 0.00% 0.000us 0.000us 3
4145
- cudaDeviceSynchronize 0.97% 5.240us 0.97% 5.240us 5.240us 0.000us 0.00% 0.000us 0.000us 1
4146
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4147
- Self CPU time total: 539.188us
4148
- Self CUDA time total: 9.344us
4149
 
4150
 
4151
 
@@ -4155,17 +4155,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D2048
4155
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4156
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4157
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4158
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 60.863us 469.62% 60.863us 60.863us 1
4159
- hf_kernels_swiglu 16.50% 95.821us 99.18% 576.059us 576.059us 0.000us 0.00% 17.312us 17.312us 1
4160
- _activation_23bf3fb::silu_and_mul 3.50% 20.301us 79.69% 462.858us 154.286us 12.960us 100.00% 17.312us 5.771us 3
4161
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 12.960us 100.00% 12.960us 4.320us 3
4162
- Activity Buffer Request 43.18% 250.794us 43.18% 250.794us 250.794us 4.352us 33.58% 4.352us 4.352us 1
4163
- aten::empty 2.99% 17.380us 2.99% 17.380us 5.793us 0.000us 0.00% 0.000us 0.000us 3
4164
- cudaLaunchKernel 33.01% 191.763us 33.01% 191.763us 63.921us 0.000us 0.00% 0.000us 0.000us 3
4165
- cudaDeviceSynchronize 0.82% 4.790us 0.82% 4.790us 4.790us 0.000us 0.00% 0.000us 0.000us 1
4166
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4167
- Self CPU time total: 580.849us
4168
- Self CUDA time total: 12.960us
4169
 
4170
 
4171
  impl wl p50(ms) ok
@@ -4182,12 +4182,13 @@ hf_kernels_swiglu cuda_T512_D768 0.03 True
4182
  <div class="uv-install-logs" id="uv-logs-benchmark">
4183
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4184
  <div class="uv-logs-content" style="display: none;">
4185
- Installed 14 packages in 12ms
4186
  </div>
4187
  </div>
4188
  <div class="cell-stderr">Fetching 7 files: 0%| | 0/7 [00:00&lt;?, ?it/s]
4189
- Fetching 7 files: 14%|█▍ | 1/7 [00:00&lt;00:02, 3.00it/s]
4190
- Fetching 7 files: 100%|██████████| 7/7 [00:00&lt;00:00, 17.15it/s]</div>
 
4191
  <div class="cell-artifacts">
4192
  <h4>Artifacts:</h4>
4193
  <a href="artifacts/benchmark/activation.jsonl" class="artifact" target="_blank">activation.jsonl</a>
 
3888
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3890
  </span> |
3891
+ Cell: nv | 0.29s
3892
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3894
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
 
3905
  </div>
3906
  </div>
3907
  <div id="output-nv" class="cell-output">
3908
+ <div class="cell-stdout"><pre class="stdout-text">Fri Dec 19 19:54:13 2025
3909
  +-----------------------------------------------------------------------------------------+
3910
  | NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 |
3911
  +-----------------------------------------+------------------------+----------------------+
 
3914
  | | | MIG M. |
3915
  |=========================================+========================+======================|
3916
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3917
+ | N/A 35C P0 120W / 350W | 0MiB / 46068MiB | 100% Default |
3918
  | | | N/A |
3919
  +-----------------------------------------+------------------------+----------------------+
3920
 
 
3938
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3939
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3940
  </span> |
3941
+ Cell: benchmark | 8.35s
3942
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3943
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3944
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3995
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3996
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3997
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3998
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 85.600us 2073.64% 85.600us 85.600us 1
3999
+ hf_kernels_swiglu 8.76% 183.666us 99.29% 2.081ms 2.081ms 0.000us 0.00% 5.568us 5.568us 1
4000
+ _activation_23bf3fb::silu_and_mul 0.98% 20.570us 88.50% 1.855ms 618.341us 4.128us 100.00% 5.568us 1.856us 3
4001
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.128us 100.00% 4.128us 1.376us 3
4002
+ Activity Buffer Request 85.39% 1.790ms 85.39% 1.790ms 1.790ms 1.440us 34.88% 1.440us 1.440us 1
4003
+ aten::empty 2.03% 42.471us 2.03% 42.471us 14.157us 0.000us 0.00% 0.000us 0.000us 3
4004
+ cudaLaunchKernel 2.13% 44.611us 2.13% 44.611us 14.870us 0.000us 0.00% 0.000us 0.000us 3
4005
+ cudaDeviceSynchronize 0.71% 14.820us 0.71% 14.820us 14.820us 0.000us 0.00% 0.000us 0.000us 1
4006
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4007
+ Self CPU time total: 2.096ms
4008
+ Self CUDA time total: 4.128us
4009
 
4010
 
4011
 
 
4015
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4016
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4017
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4018
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 66.111us 1666.52% 66.111us 66.111us 1
4019
+ hf_kernels_swiglu 4.94% 94.004us 99.69% 1.897ms 1.897ms 0.000us 0.00% 5.311us 5.311us 1
4020
+ _activation_23bf3fb::silu_and_mul 0.99% 18.841us 93.73% 1.783ms 594.417us 3.967us 100.00% 5.311us 1.770us 3
4021
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 3.967us 100.00% 3.967us 1.322us 3
4022
+ Activity Buffer Request 91.36% 1.738ms 91.36% 1.738ms 1.738ms 1.344us 33.88% 1.344us 1.344us 1
4023
+ aten::empty 1.01% 19.260us 1.01% 19.260us 6.420us 0.000us 0.00% 0.000us 0.000us 3
4024
+ cudaLaunchKernel 1.38% 26.230us 1.38% 26.230us 8.743us 0.000us 0.00% 0.000us 0.000us 3
4025
+ cudaDeviceSynchronize 0.31% 5.950us 0.31% 5.950us 5.950us 0.000us 0.00% 0.000us 0.000us 1
4026
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4027
+ Self CPU time total: 1.902ms
4028
+ Self CUDA time total: 3.967us
4029
 
4030
 
4031
 
 
4035
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4036
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4037
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4038
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 68.479us 1380.35% 68.479us 68.479us 1
4039
+ hf_kernels_swiglu 4.69% 88.684us 99.71% 1.886ms 1.886ms 0.000us 0.00% 6.625us 6.625us 1
4040
+ _activation_23bf3fb::silu_and_mul 0.99% 18.661us 94.04% 1.778ms 592.827us 4.961us 100.00% 6.625us 2.208us 3
4041
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.961us 100.00% 4.961us 1.654us 3
4042
+ Activity Buffer Request 91.53% 1.731ms 91.53% 1.731ms 1.731ms 1.664us 33.54% 1.664us 1.664us 1
4043
+ aten::empty 0.98% 18.610us 0.98% 18.610us 6.203us 0.000us 0.00% 0.000us 0.000us 3
4044
+ cudaLaunchKernel 1.52% 28.800us 1.52% 28.800us 9.600us 0.000us 0.00% 0.000us 0.000us 3
4045
+ cudaDeviceSynchronize 0.29% 5.500us 0.29% 5.500us 5.500us 0.000us 0.00% 0.000us 0.000us 1
4046
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4047
+ Self CPU time total: 1.891ms
4048
+ Self CUDA time total: 4.961us
4049
 
4050
 
4051
 
 
4055
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4056
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4057
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4058
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 66.368us 1547.76% 66.368us 66.368us 1
4059
+ hf_kernels_swiglu 4.25% 87.402us 99.76% 2.051ms 2.051ms 0.000us 0.00% 5.760us 5.760us 1
4060
+ _activation_23bf3fb::silu_and_mul 0.97% 19.981us 94.58% 1.945ms 648.228us 4.288us 100.00% 5.760us 1.920us 3
4061
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.288us 100.00% 4.288us 1.429us 3
4062
+ Activity Buffer Request 83.83% 1.724ms 83.83% 1.724ms 1.724ms 1.472us 34.33% 1.472us 1.472us 1
4063
+ aten::empty 0.93% 19.111us 0.93% 19.111us 6.370us 0.000us 0.00% 0.000us 0.000us 3
4064
+ cudaLaunchKernel 9.77% 200.885us 9.77% 200.885us 66.962us 0.000us 0.00% 0.000us 0.000us 3
4065
+ cudaDeviceSynchronize 0.24% 5.020us 0.24% 5.020us 5.020us 0.000us 0.00% 0.000us 0.000us 1
4066
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4067
+ Self CPU time total: 2.056ms
4068
+ Self CUDA time total: 4.288us
4069
 
4070
 
4071
 
 
4075
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4076
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4077
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4078
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 67.360us 1131.72% 67.360us 67.360us 1
4079
+ hf_kernels_swiglu 4.31% 89.293us 99.77% 2.067ms 2.067ms 0.000us 0.00% 7.968us 7.968us 1
4080
+ _activation_23bf3fb::silu_and_mul 0.98% 20.220us 94.55% 1.959ms 652.859us 5.952us 100.00% 7.968us 2.656us 3
4081
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 5.952us 100.00% 5.952us 1.984us 3
4082
+ Activity Buffer Request 85.78% 1.777ms 85.78% 1.777ms 1.777ms 2.016us 33.87% 2.016us 2.016us 1
4083
+ aten::empty 0.91% 18.861us 0.91% 18.861us 6.287us 0.000us 0.00% 0.000us 0.000us 3
4084
+ cudaLaunchKernel 7.79% 161.464us 7.79% 161.464us 53.821us 0.000us 0.00% 0.000us 0.000us 3
4085
+ cudaDeviceSynchronize 0.23% 4.820us 0.23% 4.820us 4.820us 0.000us 0.00% 0.000us 0.000us 1
4086
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4087
+ Self CPU time total: 2.072ms
4088
+ Self CUDA time total: 5.952us
4089
 
4090
 
4091
 
 
4095
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4096
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4097
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4098
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 64.574us 830.43% 64.574us 64.574us 1
4099
+ hf_kernels_swiglu 18.42% 86.111us 98.86% 462.073us 462.073us 0.000us 0.00% 10.367us 10.367us 1
4100
+ _activation_23bf3fb::silu_and_mul 4.27% 19.980us 76.48% 357.451us 119.150us 7.776us 100.00% 10.367us 3.456us 3
4101
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 7.776us 100.00% 7.776us 2.592us 3
4102
+ Activity Buffer Request 38.90% 181.805us 38.90% 181.805us 181.805us 2.591us 33.32% 2.591us 2.591us 1
4103
+ aten::empty 3.96% 18.511us 3.96% 18.511us 6.170us 0.000us 0.00% 0.000us 0.000us 3
4104
+ cudaLaunchKernel 33.30% 155.666us 33.30% 155.666us 51.889us 0.000us 0.00% 0.000us 0.000us 3
4105
+ cudaDeviceSynchronize 1.14% 5.330us 1.14% 5.330us 5.330us 0.000us 0.00% 0.000us 0.000us 1
4106
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4107
+ Self CPU time total: 467.403us
4108
+ Self CUDA time total: 7.776us
4109
 
4110
 
4111
 
 
4115
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4116
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4117
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4118
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 62.527us 943.95% 62.527us 62.527us 1
4119
+ hf_kernels_swiglu 18.86% 83.092us 98.85% 435.523us 435.523us 0.000us 0.00% 8.832us 8.832us 1
4120
+ _activation_23bf3fb::silu_and_mul 4.63% 20.380us 75.83% 334.080us 111.360us 6.624us 100.00% 8.832us 2.944us 3
4121
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 6.624us 100.00% 6.624us 2.208us 3
4122
+ Activity Buffer Request 36.44% 160.555us 36.44% 160.555us 160.555us 2.208us 33.33% 2.208us 2.208us 1
4123
+ aten::empty 4.17% 18.351us 4.17% 18.351us 6.117us 0.000us 0.00% 0.000us 0.000us 3
4124
+ cudaLaunchKernel 34.76% 153.145us 34.76% 153.145us 51.048us 0.000us 0.00% 0.000us 0.000us 3
4125
+ cudaDeviceSynchronize 1.15% 5.060us 1.15% 5.060us 5.060us 0.000us 0.00% 0.000us 0.000us 1
4126
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4127
+ Self CPU time total: 440.583us
4128
+ Self CUDA time total: 6.624us
4129
 
4130
 
4131
 
 
4135
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4136
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4137
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4138
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 69.184us 732.88% 69.184us 69.184us 1
4139
+ hf_kernels_swiglu 4.54% 90.562us 99.76% 1.988ms 1.988ms 0.000us 0.00% 12.608us 12.608us 1
4140
+ _activation_23bf3fb::silu_and_mul 1.02% 20.260us 94.19% 1.877ms 625.705us 9.440us 100.00% 12.608us 4.203us 3
4141
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 9.440us 100.00% 9.440us 3.147us 3
4142
+ Activity Buffer Request 85.41% 1.702ms 85.41% 1.702ms 1.702ms 3.168us 33.56% 3.168us 3.168us 1
4143
+ aten::empty 1.03% 20.450us 1.03% 20.450us 6.817us 0.000us 0.00% 0.000us 0.000us 3
4144
+ cudaLaunchKernel 7.76% 154.666us 7.76% 154.666us 51.555us 0.000us 0.00% 0.000us 0.000us 3
4145
+ cudaDeviceSynchronize 0.24% 4.870us 0.24% 4.870us 4.870us 0.000us 0.00% 0.000us 0.000us 1
4146
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4147
+ Self CPU time total: 1.993ms
4148
+ Self CUDA time total: 9.440us
4149
 
4150
 
4151
 
 
4155
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4156
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4157
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4158
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 65.376us 499.51% 65.376us 65.376us 1
4159
+ hf_kernels_swiglu 19.52% 83.334us 98.75% 421.512us 421.512us 0.000us 0.00% 17.472us 17.472us 1
4160
+ _activation_23bf3fb::silu_and_mul 4.53% 19.340us 74.78% 319.198us 106.399us 13.088us 100.00% 17.472us 5.824us 3
4161
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 13.088us 100.00% 13.088us 4.363us 3
4162
+ Activity Buffer Request 34.31% 146.444us 34.31% 146.444us 146.444us 4.384us 33.50% 4.384us 4.384us 1
4163
+ aten::empty 4.45% 18.980us 4.45% 18.980us 6.327us 0.000us 0.00% 0.000us 0.000us 3
4164
+ cudaLaunchKernel 35.94% 153.414us 35.94% 153.414us 51.138us 0.000us 0.00% 0.000us 0.000us 3
4165
+ cudaDeviceSynchronize 1.25% 5.350us 1.25% 5.350us 5.350us 0.000us 0.00% 0.000us 0.000us 1
4166
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4167
+ Self CPU time total: 426.862us
4168
+ Self CUDA time total: 13.088us
4169
 
4170
 
4171
  impl wl p50(ms) ok
 
4182
  <div class="uv-install-logs" id="uv-logs-benchmark">
4183
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4184
  <div class="uv-logs-content" style="display: none;">
4185
+ Installed 51 packages in 320ms
4186
  </div>
4187
  </div>
4188
  <div class="cell-stderr">Fetching 7 files: 0%| | 0/7 [00:00&lt;?, ?it/s]
4189
+ Fetching 7 files: 14%|█▍ | 1/7 [00:00&lt;00:01, 5.80it/s]
4190
+ Fetching 7 files: 71%|███████▏ | 5/7 [00:00&lt;00:00, 13.68it/s]
4191
+ Fetching 7 files: 100%|██████████| 7/7 [00:00&lt;00:00, 17.69it/s]</div>
4192
  <div class="cell-artifacts">
4193
  <h4>Artifacts:</h4>
4194
  <a href="artifacts/benchmark/activation.jsonl" class="artifact" target="_blank">activation.jsonl</a>
activation/impls/torch_swiglu.html CHANGED
@@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content {
3888
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3890
  </span> |
3891
- Cell: nv | 0.24s
3892
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3894
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3904,7 +3904,7 @@ Cell: nv | 0.24s
3904
  </div>
3905
  </div>
3906
  <div id="output-nv" class="cell-output">
3907
- <div class="cell-stdout"><pre class="stdout-text">Fri Dec 19 18:56:03 2025
3908
  +-----------------------------------------------------------------------------------------+
3909
  | NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 |
3910
  +-----------------------------------------+------------------------+----------------------+
@@ -3913,7 +3913,7 @@ Cell: nv | 0.24s
3913
  | | | MIG M. |
3914
  |=========================================+========================+======================|
3915
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3916
- | N/A 30C P0 77W / 350W | 0MiB / 46068MiB | 10% Default |
3917
  | | | N/A |
3918
  +-----------------------------------------+------------------------+----------------------+
3919
 
@@ -3935,9 +3935,9 @@ Cell: nv | 0.24s
3935
  <span class="collapse-indicators">
3936
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3937
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3938
- <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3939
  </span> |
3940
- Cell: benchmark | 7.23s
3941
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3942
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3943
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3987,20 +3987,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D768
3987
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3988
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3989
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3990
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 170.752us 1351.10% 170.752us 170.752us 1
3991
- torch_eager 8.36% 195.202us 99.35% 2.320ms 2.320ms 0.000us 0.00% 14.941us 14.941us 1
3992
- aten::silu 2.60% 60.811us 86.31% 2.016ms 671.908us 6.463us 51.14% 8.766us 2.922us 3
3993
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.463us 51.14% 6.463us 2.154us 3
3994
- aten::mul 1.36% 31.870us 2.27% 52.962us 17.654us 6.175us 48.86% 6.175us 2.058us 3
3995
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.175us 48.86% 6.175us 2.058us 3
3996
- Activity Buffer Request 81.78% 1.910ms 81.78% 1.910ms 1.910ms 2.303us 18.22% 2.303us 2.303us 1
3997
- aten::slice 1.97% 46.103us 2.42% 56.432us 9.405us 0.000us 0.00% 0.000us 0.000us 6
3998
- aten::as_strided 0.44% 10.329us 0.44% 10.329us 1.721us 0.000us 0.00% 0.000us 0.000us 6
3999
- cudaLaunchKernel 2.83% 66.203us 2.83% 66.203us 11.034us 0.000us 0.00% 0.000us 0.000us 6
4000
- cudaDeviceSynchronize 0.65% 15.081us 0.65% 15.081us 15.081us 0.000us 0.00% 0.000us 0.000us 1
4001
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4002
- Self CPU time total: 2.335ms
4003
- Self CUDA time total: 12.638us
4004
 
4005
 
4006
 
@@ -4010,20 +4010,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D1024
4010
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4011
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4012
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4013
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 142.911us 1157.08% 142.911us 142.911us 1
4014
- torch_eager 5.43% 102.941us 99.70% 1.891ms 1.891ms 0.000us 0.00% 14.495us 14.495us 1
4015
- aten::silu 2.14% 40.580us 90.39% 1.715ms 571.523us 6.399us 51.81% 8.543us 2.848us 3
4016
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.399us 51.81% 6.399us 2.133us 3
4017
- aten::mul 1.41% 26.703us 2.36% 44.783us 14.928us 5.952us 48.19% 5.952us 1.984us 3
4018
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.952us 48.19% 5.952us 1.984us 3
4019
- Activity Buffer Request 86.86% 1.648ms 86.86% 1.648ms 1.648ms 2.144us 17.36% 2.144us 2.144us 1
4020
- aten::slice 1.25% 23.641us 1.52% 28.820us 4.803us 0.000us 0.00% 0.000us 0.000us 6
4021
- aten::as_strided 0.27% 5.179us 0.27% 5.179us 0.863us 0.000us 0.00% 0.000us 0.000us 6
4022
- cudaLaunchKernel 2.34% 44.460us 2.34% 44.460us 7.410us 0.000us 0.00% 0.000us 0.000us 6
4023
- cudaDeviceSynchronize 0.30% 5.691us 0.30% 5.691us 5.691us 0.000us 0.00% 0.000us 0.000us 1
4024
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4025
- Self CPU time total: 1.897ms
4026
- Self CUDA time total: 12.351us
4027
 
4028
 
4029
 
@@ -4033,20 +4033,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D2048
4033
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4034
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4035
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4036
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 146.017us 1102.26% 146.017us 146.017us 1
4037
- torch_eager 5.52% 107.884us 99.72% 1.948ms 1.948ms 0.000us 0.00% 15.519us 15.519us 1
4038
- aten::silu 2.05% 40.061us 90.43% 1.767ms 588.983us 6.783us 51.20% 9.055us 3.018us 3
4039
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.783us 51.20% 6.783us 2.261us 3
4040
- aten::mul 1.30% 25.470us 2.24% 43.800us 14.600us 6.464us 48.80% 6.464us 2.155us 3
4041
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.464us 48.80% 6.464us 2.155us 3
4042
- Activity Buffer Request 87.02% 1.700ms 87.02% 1.700ms 1.700ms 2.272us 17.15% 2.272us 2.272us 1
4043
- aten::slice 1.26% 24.689us 1.53% 29.809us 4.968us 0.000us 0.00% 0.000us 0.000us 6
4044
- aten::as_strided 0.26% 5.120us 0.26% 5.120us 0.853us 0.000us 0.00% 0.000us 0.000us 6
4045
- cudaLaunchKernel 2.30% 44.851us 2.30% 44.851us 7.475us 0.000us 0.00% 0.000us 0.000us 6
4046
- cudaDeviceSynchronize 0.28% 5.500us 0.28% 5.500us 5.500us 0.000us 0.00% 0.000us 0.000us 1
4047
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4048
- Self CPU time total: 1.954ms
4049
- Self CUDA time total: 13.247us
4050
 
4051
 
4052
 
@@ -4056,20 +4056,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D768
4056
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4057
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4058
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4059
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 147.168us 1164.30% 147.168us 147.168us 1
4060
- torch_eager 6.37% 108.862us 99.70% 1.705ms 1.705ms 0.000us 0.00% 14.816us 14.816us 1
4061
- aten::silu 2.27% 38.759us 89.04% 1.523ms 507.511us 6.496us 51.39% 8.672us 2.891us 3
4062
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.496us 51.39% 6.496us 2.165us 3
4063
- aten::mul 1.56% 26.620us 2.60% 44.441us 14.814us 6.144us 48.61% 6.144us 2.048us 3
4064
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.144us 48.61% 6.144us 2.048us 3
4065
- Activity Buffer Request 74.65% 1.277ms 74.65% 1.277ms 1.277ms 2.176us 17.22% 2.176us 2.176us 1
4066
- aten::slice 1.39% 23.842us 1.70% 29.081us 4.847us 0.000us 0.00% 0.000us 0.000us 6
4067
- aten::as_strided 0.31% 5.239us 0.31% 5.239us 0.873us 0.000us 0.00% 0.000us 0.000us 6
4068
- cudaLaunchKernel 13.16% 225.035us 13.16% 225.035us 37.506us 0.000us 0.00% 0.000us 0.000us 6
4069
- cudaDeviceSynchronize 0.30% 5.120us 0.30% 5.120us 5.120us 0.000us 0.00% 0.000us 0.000us 1
4070
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4071
- Self CPU time total: 1.710ms
4072
- Self CUDA time total: 12.640us
4073
 
4074
 
4075
 
@@ -4079,20 +4079,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D1024
4079
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4080
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4081
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4082
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 146.271us 1106.86% 146.271us 146.271us 1
4083
- torch_eager 4.97% 106.601us 99.77% 2.139ms 2.139ms 0.000us 0.00% 15.486us 15.486us 1
4084
- aten::silu 1.88% 40.251us 91.37% 1.959ms 652.944us 6.751us 51.09% 9.022us 3.007us 3
4085
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.751us 51.09% 6.751us 2.250us 3
4086
- aten::mul 1.15% 24.611us 1.97% 42.221us 14.074us 6.464us 48.91% 6.464us 2.155us 3
4087
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.464us 48.91% 6.464us 2.155us 3
4088
- Activity Buffer Request 80.01% 1.715ms 80.01% 1.715ms 1.715ms 2.271us 17.19% 2.271us 2.271us 1
4089
- aten::slice 1.17% 25.129us 1.45% 31.071us 5.179us 0.000us 0.00% 0.000us 0.000us 6
4090
- aten::as_strided 0.28% 5.942us 0.28% 5.942us 0.990us 0.000us 0.00% 0.000us 0.000us 6
4091
- cudaLaunchKernel 10.31% 220.963us 10.31% 220.963us 36.827us 0.000us 0.00% 0.000us 0.000us 6
4092
- cudaDeviceSynchronize 0.23% 5.031us 0.23% 5.031us 5.031us 0.000us 0.00% 0.000us 0.000us 1
4093
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4094
- Self CPU time total: 2.144ms
4095
- Self CUDA time total: 13.215us
4096
 
4097
 
4098
 
@@ -4102,20 +4102,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D2048
4102
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4103
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4104
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4105
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 139.706us 900.17% 139.706us 139.706us 1
4106
- torch_eager 16.34% 103.162us 99.17% 626.050us 626.050us 0.000us 0.00% 18.208us 18.208us 1
4107
- aten::silu 6.36% 40.131us 71.62% 452.127us 150.709us 7.968us 51.34% 10.656us 3.552us 3
4108
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.968us 51.34% 7.968us 2.656us 3
4109
- aten::mul 3.68% 23.240us 6.43% 40.610us 13.537us 7.552us 48.66% 7.552us 2.517us 3
4110
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.552us 48.66% 7.552us 2.517us 3
4111
- Activity Buffer Request 33.83% 213.593us 33.83% 213.593us 213.593us 2.688us 17.32% 2.688us 2.688us 1
4112
- aten::slice 3.84% 24.240us 4.78% 30.151us 5.025us 0.000us 0.00% 0.000us 0.000us 6
4113
- aten::as_strided 0.94% 5.911us 0.94% 5.911us 0.985us 0.000us 0.00% 0.000us 0.000us 6
4114
- cudaLaunchKernel 34.18% 215.773us 34.18% 215.773us 35.962us 0.000us 0.00% 0.000us 0.000us 6
4115
- cudaDeviceSynchronize 0.83% 5.229us 0.83% 5.229us 5.229us 0.000us 0.00% 0.000us 0.000us 1
4116
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4117
- Self CPU time total: 631.279us
4118
- Self CUDA time total: 15.520us
4119
 
4120
 
4121
 
@@ -4125,20 +4125,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D768
4125
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4126
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4127
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4128
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 149.023us 1039.50% 149.023us 149.023us 1
4129
- torch_eager 4.97% 105.151us 99.76% 2.112ms 2.112ms 0.000us 0.00% 16.800us 16.800us 1
4130
- aten::silu 1.93% 40.940us 91.23% 1.932ms 643.947us 7.360us 51.34% 9.824us 3.275us 3
4131
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.360us 51.34% 7.360us 2.453us 3
4132
- aten::mul 1.20% 25.341us 2.15% 45.422us 15.141us 6.976us 48.66% 6.976us 2.325us 3
4133
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.976us 48.66% 6.976us 2.325us 3
4134
- Activity Buffer Request 80.00% 1.694ms 80.00% 1.694ms 1.694ms 2.464us 17.19% 2.464us 2.464us 1
4135
- aten::slice 1.16% 24.531us 1.41% 29.941us 4.990us 0.000us 0.00% 0.000us 0.000us 6
4136
- aten::as_strided 0.26% 5.410us 0.26% 5.410us 0.902us 0.000us 0.00% 0.000us 0.000us 6
4137
- cudaLaunchKernel 10.25% 217.014us 10.25% 217.014us 36.169us 0.000us 0.00% 0.000us 0.000us 6
4138
- cudaDeviceSynchronize 0.24% 5.140us 0.24% 5.140us 5.140us 0.000us 0.00% 0.000us 0.000us 1
4139
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4140
- Self CPU time total: 2.117ms
4141
- Self CUDA time total: 14.336us
4142
 
4143
 
4144
 
@@ -4148,20 +4148,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D1024
4148
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4149
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4150
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4151
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 229.537us 1472.90% 229.537us 229.537us 1
4152
- torch_eager 26.22% 183.030us 99.31% 693.152us 693.152us 0.000us 0.00% 18.272us 18.272us 1
4153
- aten::silu 5.68% 39.610us 61.61% 430.047us 143.349us 7.967us 51.12% 10.655us 3.552us 3
4154
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.967us 51.12% 7.967us 2.656us 3
4155
- aten::mul 3.79% 26.431us 6.97% 48.673us 16.224us 7.617us 48.88% 7.617us 2.539us 3
4156
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.617us 48.88% 7.617us 2.539us 3
4157
- Activity Buffer Request 27.95% 195.093us 27.95% 195.093us 195.093us 2.688us 17.25% 2.688us 2.688us 1
4158
- aten::slice 3.65% 25.463us 4.50% 31.402us 5.234us 0.000us 0.00% 0.000us 0.000us 6
4159
- aten::as_strided 0.85% 5.939us 0.85% 5.939us 0.990us 0.000us 0.00% 0.000us 0.000us 6
4160
- cudaLaunchKernel 31.17% 217.586us 31.17% 217.586us 36.264us 0.000us 0.00% 0.000us 0.000us 6
4161
- cudaDeviceSynchronize 0.69% 4.809us 0.69% 4.809us 4.809us 0.000us 0.00% 0.000us 0.000us 1
4162
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4163
- Self CPU time total: 697.961us
4164
- Self CUDA time total: 15.584us
4165
 
4166
 
4167
 
@@ -4171,24 +4171,24 @@ PROFILE TRACE: torch_eager | cuda_T512_D2048
4171
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4172
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4173
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4174
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 162.367us 719.68% 162.367us 162.367us 1
4175
- torch_eager 5.30% 112.718us 99.76% 2.123ms 2.123ms 0.000us 0.00% 26.497us 26.497us 1
4176
- aten::silu 1.99% 42.361us 90.94% 1.935ms 644.944us 11.584us 51.35% 15.520us 5.173us 3
4177
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 11.584us 51.35% 11.584us 3.861us 3
4178
- aten::mul 1.24% 26.291us 2.09% 44.551us 14.850us 10.977us 48.65% 10.977us 3.659us 3
4179
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.977us 48.65% 10.977us 3.659us 3
4180
- Activity Buffer Request 79.75% 1.697ms 79.75% 1.697ms 1.697ms 3.936us 17.45% 3.936us 3.936us 1
4181
- aten::slice 1.18% 25.032us 1.43% 30.473us 5.079us 0.000us 0.00% 0.000us 0.000us 6
4182
- aten::as_strided 0.26% 5.441us 0.26% 5.441us 0.907us 0.000us 0.00% 0.000us 0.000us 6
4183
- cudaLaunchKernel 10.06% 214.034us 10.06% 214.034us 35.672us 0.000us 0.00% 0.000us 0.000us 6
4184
- cudaDeviceSynchronize 0.24% 5.051us 0.24% 5.051us 5.051us 0.000us 0.00% 0.000us 0.000us 1
4185
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4186
- Self CPU time total: 2.128ms
4187
- Self CUDA time total: 22.561us
4188
 
4189
 
4190
  impl wl p50(ms) ok
4191
- torch_eager cuda_T128_D1024 0.05 True
4192
  torch_eager cuda_T128_D2048 0.05 True
4193
  torch_eager cuda_T128_D768 0.04 True
4194
  torch_eager cuda_T256_D1024 0.05 True
@@ -4198,12 +4198,6 @@ torch_eager cuda_T512_D1024 0.05 True
4198
  torch_eager cuda_T512_D2048 0.05 True
4199
  torch_eager cuda_T512_D768 0.05 True
4200
  </pre></div>
4201
- <div class="uv-install-logs" id="uv-logs-benchmark">
4202
- <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4203
- <div class="uv-logs-content" style="display: none;">
4204
- Installed 37 packages in 324ms
4205
- </div>
4206
- </div>
4207
  <div class="cell-artifacts">
4208
  <h4>Artifacts:</h4>
4209
  <a href="artifacts/benchmark/activation.jsonl" class="artifact" target="_blank">activation.jsonl</a>
 
3888
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3890
  </span> |
3891
+ Cell: nv | 0.29s
3892
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3894
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
 
3904
  </div>
3905
  </div>
3906
  <div id="output-nv" class="cell-output">
3907
+ <div class="cell-stdout"><pre class="stdout-text">Fri Dec 19 19:54:13 2025
3908
  +-----------------------------------------------------------------------------------------+
3909
  | NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 |
3910
  +-----------------------------------------+------------------------+----------------------+
 
3913
  | | | MIG M. |
3914
  |=========================================+========================+======================|
3915
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3916
+ | N/A 35C P0 120W / 350W | 0MiB / 46068MiB | 100% Default |
3917
  | | | N/A |
3918
  +-----------------------------------------+------------------------+----------------------+
3919
 
 
3935
  <span class="collapse-indicators">
3936
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3937
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3938
+ <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3939
  </span> |
3940
+ Cell: benchmark | 3.69s
3941
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3942
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3943
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3987
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3988
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3989
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3990
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 187.393us 1471.25% 187.393us 187.393us 1
3991
+ torch_eager 9.10% 197.603us 99.31% 2.157ms 2.157ms 0.000us 0.00% 15.073us 15.073us 1
3992
+ aten::silu 2.86% 62.203us 85.13% 1.849ms 616.358us 6.561us 51.51% 8.897us 2.966us 3
3993
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.561us 51.51% 6.561us 2.187us 3
3994
+ aten::mul 1.58% 34.212us 2.55% 55.432us 18.477us 6.176us 48.49% 6.176us 2.059us 3
3995
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.176us 48.49% 6.176us 2.059us 3
3996
+ Activity Buffer Request 80.18% 1.741ms 80.18% 1.741ms 1.741ms 2.336us 18.34% 2.336us 2.336us 1
3997
+ aten::slice 2.02% 43.964us 2.53% 55.013us 9.169us 0.000us 0.00% 0.000us 0.000us 6
3998
+ aten::as_strided 0.51% 11.049us 0.51% 11.049us 1.842us 0.000us 0.00% 0.000us 0.000us 6
3999
+ cudaLaunchKernel 3.07% 66.630us 3.07% 66.630us 11.105us 0.000us 0.00% 0.000us 0.000us 6
4000
+ cudaDeviceSynchronize 0.69% 14.920us 0.69% 14.920us 14.920us 0.000us 0.00% 0.000us 0.000us 1
4001
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4002
+ Self CPU time total: 2.172ms
4003
+ Self CUDA time total: 12.737us
4004
 
4005
 
4006
 
 
4010
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4011
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4012
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4013
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 159.648us 1289.36% 159.648us 159.648us 1
4014
+ torch_eager 6.57% 137.523us 99.70% 2.087ms 2.087ms 0.000us 0.00% 14.526us 14.526us 1
4015
+ aten::silu 2.02% 42.391us 89.22% 1.868ms 622.711us 6.399us 51.68% 8.543us 2.848us 3
4016
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.399us 51.68% 6.399us 2.133us 3
4017
+ aten::mul 1.43% 29.882us 2.35% 49.282us 16.427us 5.983us 48.32% 5.983us 1.994us 3
4018
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.983us 48.32% 5.983us 1.994us 3
4019
+ Activity Buffer Request 85.88% 1.798ms 85.88% 1.798ms 1.798ms 2.144us 17.32% 2.144us 2.144us 1
4020
+ aten::slice 1.30% 27.292us 1.55% 32.512us 5.419us 0.000us 0.00% 0.000us 0.000us 6
4021
+ aten::as_strided 0.25% 5.220us 0.25% 5.220us 0.870us 0.000us 0.00% 0.000us 0.000us 6
4022
+ cudaLaunchKernel 2.25% 47.061us 2.25% 47.061us 7.843us 0.000us 0.00% 0.000us 0.000us 6
4023
+ cudaDeviceSynchronize 0.30% 6.330us 0.30% 6.330us 6.330us 0.000us 0.00% 0.000us 0.000us 1
4024
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4025
+ Self CPU time total: 2.094ms
4026
+ Self CUDA time total: 12.382us
4027
 
4028
 
4029
 
 
4033
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4034
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4035
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4036
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 155.680us 1172.29% 155.680us 155.680us 1
4037
+ torch_eager 6.64% 129.562us 99.68% 1.946ms 1.946ms 0.000us 0.00% 15.584us 15.584us 1
4038
+ aten::silu 2.16% 42.182us 89.10% 1.739ms 579.704us 6.848us 51.57% 9.152us 3.051us 3
4039
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.848us 51.57% 6.848us 2.283us 3
4040
+ aten::mul 1.46% 28.592us 2.38% 46.553us 15.518us 6.432us 48.43% 6.432us 2.144us 3
4041
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.432us 48.43% 6.432us 2.144us 3
4042
+ Activity Buffer Request 85.60% 1.671ms 85.60% 1.671ms 1.671ms 2.304us 17.35% 2.304us 2.304us 1
4043
+ aten::slice 1.31% 25.640us 1.56% 30.540us 5.090us 0.000us 0.00% 0.000us 0.000us 6
4044
+ aten::as_strided 0.25% 4.900us 0.25% 4.900us 0.817us 0.000us 0.00% 0.000us 0.000us 6
4045
+ cudaLaunchKernel 2.26% 44.052us 2.26% 44.052us 7.342us 0.000us 0.00% 0.000us 0.000us 6
4046
+ cudaDeviceSynchronize 0.32% 6.150us 0.32% 6.150us 6.150us 0.000us 0.00% 0.000us 0.000us 1
4047
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4048
+ Self CPU time total: 1.952ms
4049
+ Self CUDA time total: 13.280us
4050
 
4051
 
4052
 
 
4056
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4057
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4058
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4059
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 160.289us 1264.91% 160.289us 160.289us 1
4060
+ torch_eager 6.06% 136.754us 99.75% 2.252ms 2.252ms 0.000us 0.00% 14.880us 14.880us 1
4061
+ aten::silu 1.87% 42.159us 90.17% 2.036ms 678.503us 6.560us 51.77% 8.768us 2.923us 3
4062
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.560us 51.77% 6.560us 2.187us 3
4063
+ aten::mul 1.25% 28.231us 2.20% 49.632us 16.544us 6.112us 48.23% 6.112us 2.037us 3
4064
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.112us 48.23% 6.112us 2.037us 3
4065
+ Activity Buffer Request 79.28% 1.790ms 79.28% 1.790ms 1.790ms 2.208us 17.42% 2.208us 2.208us 1
4066
+ aten::slice 1.09% 24.671us 1.32% 29.801us 4.967us 0.000us 0.00% 0.000us 0.000us 6
4067
+ aten::as_strided 0.23% 5.130us 0.23% 5.130us 0.855us 0.000us 0.00% 0.000us 0.000us 6
4068
+ cudaLaunchKernel 9.98% 225.208us 9.98% 225.208us 37.535us 0.000us 0.00% 0.000us 0.000us 6
4069
+ cudaDeviceSynchronize 0.25% 5.621us 0.25% 5.621us 5.621us 0.000us 0.00% 0.000us 0.000us 1
4070
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4071
+ Self CPU time total: 2.257ms
4072
+ Self CUDA time total: 12.672us
4073
 
4074
 
4075
 
 
4079
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4080
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4081
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4082
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 159.295us 1196.72% 159.295us 159.295us 1
4083
+ torch_eager 6.43% 135.135us 99.75% 2.096ms 2.096ms 0.000us 0.00% 15.615us 15.615us 1
4084
+ aten::silu 2.00% 41.931us 89.60% 1.883ms 627.518us 6.815us 51.20% 9.119us 3.040us 3
4085
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.815us 51.20% 6.815us 2.272us 3
4086
+ aten::mul 1.42% 29.749us 2.27% 47.691us 15.897us 6.496us 48.80% 6.496us 2.165us 3
4087
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.496us 48.80% 6.496us 2.165us 3
4088
+ Activity Buffer Request 79.61% 1.673ms 79.61% 1.673ms 1.673ms 2.304us 17.31% 2.304us 2.304us 1
4089
+ aten::slice 1.22% 25.650us 1.46% 30.630us 5.105us 0.000us 0.00% 0.000us 0.000us 6
4090
+ aten::as_strided 0.24% 4.980us 0.24% 4.980us 0.830us 0.000us 0.00% 0.000us 0.000us 6
4091
+ cudaLaunchKernel 8.84% 185.847us 8.84% 185.847us 30.974us 0.000us 0.00% 0.000us 0.000us 6
4092
+ cudaDeviceSynchronize 0.25% 5.161us 0.25% 5.161us 5.161us 0.000us 0.00% 0.000us 0.000us 1
4093
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4094
+ Self CPU time total: 2.101ms
4095
+ Self CUDA time total: 13.311us
4096
 
4097
 
4098
 
 
4102
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4103
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4104
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4105
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 158.432us 1014.55% 158.432us 158.432us 1
4106
+ torch_eager 6.38% 140.261us 99.75% 2.192ms 2.192ms 0.000us 0.00% 18.304us 18.304us 1
4107
+ aten::silu 1.93% 42.492us 89.81% 1.973ms 657.799us 8.000us 51.23% 10.688us 3.563us 3
4108
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 8.000us 51.23% 8.000us 2.667us 3
4109
+ aten::mul 1.27% 27.872us 2.10% 46.122us 15.374us 7.616us 48.77% 7.616us 2.539us 3
4110
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.616us 48.77% 7.616us 2.539us 3
4111
+ Activity Buffer Request 80.61% 1.771ms 80.61% 1.771ms 1.771ms 2.688us 17.21% 2.688us 2.688us 1
4112
+ aten::slice 1.22% 26.832us 1.46% 31.992us 5.332us 0.000us 0.00% 0.000us 0.000us 6
4113
+ aten::as_strided 0.23% 5.160us 0.23% 5.160us 0.860us 0.000us 0.00% 0.000us 0.000us 6
4114
+ cudaLaunchKernel 8.09% 177.845us 8.09% 177.845us 29.641us 0.000us 0.00% 0.000us 0.000us 6
4115
+ cudaDeviceSynchronize 0.25% 5.530us 0.25% 5.530us 5.530us 0.000us 0.00% 0.000us 0.000us 1
4116
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4117
+ Self CPU time total: 2.197ms
4118
+ Self CUDA time total: 15.616us
4119
 
4120
 
4121
 
 
4125
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4126
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4127
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4128
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 158.083us 1097.65% 158.083us 158.083us 1
4129
+ torch_eager 6.35% 128.334us 99.75% 2.015ms 2.015ms 0.000us 0.00% 16.898us 16.898us 1
4130
+ aten::silu 2.10% 42.419us 89.46% 1.807ms 602.407us 7.394us 51.34% 9.890us 3.297us 3
4131
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.394us 51.34% 7.394us 2.465us 3
4132
+ aten::mul 1.39% 28.141us 2.40% 48.382us 16.127us 7.008us 48.66% 7.008us 2.336us 3
4133
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.008us 48.66% 7.008us 2.336us 3
4134
+ Activity Buffer Request 79.49% 1.606ms 79.49% 1.606ms 1.606ms 2.496us 17.33% 2.496us 2.496us 1
4135
+ aten::slice 1.27% 25.691us 1.54% 31.081us 5.180us 0.000us 0.00% 0.000us 0.000us 6
4136
+ aten::as_strided 0.27% 5.390us 0.27% 5.390us 0.898us 0.000us 0.00% 0.000us 0.000us 6
4137
+ cudaLaunchKernel 8.88% 179.306us 8.88% 179.306us 29.884us 0.000us 0.00% 0.000us 0.000us 6
4138
+ cudaDeviceSynchronize 0.25% 5.100us 0.25% 5.100us 5.100us 0.000us 0.00% 0.000us 0.000us 1
4139
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4140
+ Self CPU time total: 2.020ms
4141
+ Self CUDA time total: 14.402us
4142
 
4143
 
4144
 
 
4148
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4149
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4150
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4151
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 158.658us 1020.18% 158.658us 158.658us 1
4152
+ torch_eager 5.52% 111.823us 99.73% 2.019ms 2.019ms 0.000us 0.00% 18.240us 18.240us 1
4153
+ aten::silu 2.12% 42.830us 90.25% 1.827ms 609.110us 7.936us 51.03% 10.624us 3.541us 3
4154
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.936us 51.03% 7.936us 2.645us 3
4155
+ aten::mul 1.37% 27.772us 2.44% 49.332us 16.444us 7.616us 48.97% 7.616us 2.539us 3
4156
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.616us 48.97% 7.616us 2.539us 3
4157
+ Activity Buffer Request 80.18% 1.623ms 80.18% 1.623ms 1.623ms 2.688us 17.28% 2.688us 2.688us 1
4158
+ aten::slice 1.25% 25.302us 1.51% 30.641us 5.107us 0.000us 0.00% 0.000us 0.000us 6
4159
+ aten::as_strided 0.26% 5.339us 0.26% 5.339us 0.890us 0.000us 0.00% 0.000us 0.000us 6
4160
+ cudaLaunchKernel 9.02% 182.624us 9.02% 182.624us 30.437us 0.000us 0.00% 0.000us 0.000us 6
4161
+ cudaDeviceSynchronize 0.27% 5.520us 0.27% 5.520us 5.520us 0.000us 0.00% 0.000us 0.000us 1
4162
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4163
+ Self CPU time total: 2.025ms
4164
+ Self CUDA time total: 15.552us
4165
 
4166
 
4167
 
 
4171
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4172
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4173
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4174
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 164.002us 726.96% 164.002us 164.002us 1
4175
+ torch_eager 5.39% 111.814us 99.74% 2.071ms 2.071ms 0.000us 0.00% 26.464us 26.464us 1
4176
+ aten::silu 2.07% 43.010us 90.61% 1.881ms 627.114us 11.616us 51.49% 15.520us 5.173us 3
4177
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 11.616us 51.49% 11.616us 3.872us 3
4178
+ aten::mul 1.37% 28.451us 2.32% 48.232us 16.077us 10.944us 48.51% 10.944us 3.648us 3
4179
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.944us 48.51% 10.944us 3.648us 3
4180
+ Activity Buffer Request 80.76% 1.677ms 80.76% 1.677ms 1.677ms 3.904us 17.30% 3.904us 3.904us 1
4181
+ aten::slice 1.14% 23.769us 1.41% 29.310us 4.885us 0.000us 0.00% 0.000us 0.000us 6
4182
+ aten::as_strided 0.27% 5.541us 0.27% 5.541us 0.923us 0.000us 0.00% 0.000us 0.000us 6
4183
+ cudaLaunchKernel 8.74% 181.415us 8.74% 181.415us 30.236us 0.000us 0.00% 0.000us 0.000us 6
4184
+ cudaDeviceSynchronize 0.26% 5.500us 0.26% 5.500us 5.500us 0.000us 0.00% 0.000us 0.000us 1
4185
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4186
+ Self CPU time total: 2.076ms
4187
+ Self CUDA time total: 22.560us
4188
 
4189
 
4190
  impl wl p50(ms) ok
4191
+ torch_eager cuda_T128_D1024 0.06 True
4192
  torch_eager cuda_T128_D2048 0.05 True
4193
  torch_eager cuda_T128_D768 0.04 True
4194
  torch_eager cuda_T256_D1024 0.05 True
 
4198
  torch_eager cuda_T512_D2048 0.05 True
4199
  torch_eager cuda_T512_D768 0.05 True
4200
  </pre></div>
 
 
 
 
 
 
4201
  <div class="cell-artifacts">
4202
  <h4>Artifacts:</h4>
4203
  <a href="artifacts/benchmark/activation.jsonl" class="artifact" target="_blank">activation.jsonl</a>
activation/index.html CHANGED
@@ -83,7 +83,7 @@
83
  <h1>Index of /activation</h1>
84
  <ul>
85
  <li><a href='impls/index.html' class='dir'>impls/</a></li>
86
- <li><a href='results/index.html' class='dir'>results/</a></li>
87
  </ul>
88
  </body>
89
  </html>
 
83
  <h1>Index of /activation</h1>
84
  <ul>
85
  <li><a href='impls/index.html' class='dir'>impls/</a></li>
86
+ <li><a href='results_linux/index.html' class='dir'>results_linux/</a></li>
87
  </ul>
88
  </body>
89
  </html>
activation/results_linux/artifacts/combine/latency.svg ADDED

Git LFS Details

  • SHA256: e0ebb1563a7889f083e3a946d975f77f909986c659d9bdfad99579689fd355e5
  • Pointer size: 130 Bytes
  • Size of remote file: 21.5 kB
activation/results_linux/cells/combine.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # /// script
2
+ # requires-python = ">=3.10"
3
+ # dependencies = [
4
+ # "numpy",
5
+ # "torch==2.8.0",
6
+ # "kernels-benchmark-tools",
7
+ # "matplotlib",
8
+ # ]
9
+ #
10
+ # [tool.uv.sources]
11
+ # kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
12
+ # ///
13
+ from kernels_benchmark_tools.core.visuals import generate_combined_results
14
+
15
+ # Map display names to uvnote environment variables
16
+ cache_env_map = {
17
+ "HF Kernels SwiGLU": "UVNOTE_FILE_HF_KERNELS_SWIGLU_BENCHMARK",
18
+ "PyTorch SwiGLU": "UVNOTE_FILE_TORCH_SWIGLU_BENCHMARK",
19
+ }
20
+
21
+ # Generate combined results with visualization
22
+ generate_combined_results(
23
+ cache_env_map=cache_env_map,
24
+ output_filename="activation.jsonl",
25
+ svg_filename="latency.svg"
26
+ )
activation/results_linux/combined_results.html ADDED
The diff for this file is too large to render. See raw diff
 
activation/results_linux/index.html ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html>
3
+ <head>
4
+ <meta charset='UTF-8'>
5
+ <meta name='viewport' content='width=device-width, initial-scale=1.0'>
6
+ <title>Index of /activation/results_linux</title>
7
+ <style>
8
+ :root {
9
+ --bg-primary: #0a0a0a;
10
+ --bg-secondary: #121212;
11
+ --bg-tertiary: #181818;
12
+ --text-primary: #e0e0e0;
13
+ --text-secondary: #888888;
14
+ --text-link: #64b5f6;
15
+ --border-primary: #2a2a2a;
16
+ }
17
+ body {
18
+ font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif;
19
+ background: var(--bg-primary);
20
+ color: var(--text-primary);
21
+ margin: 0;
22
+ padding: 16px;
23
+ max-width: 900px;
24
+ margin: 0 auto;
25
+ }
26
+ .controls {
27
+ display: flex;
28
+ justify-content: flex-end;
29
+ margin-bottom: 1rem;
30
+ }
31
+ .back-button {
32
+ background: var(--bg-secondary);
33
+ border: 1px solid var(--border-primary);
34
+ padding: 8px 12px;
35
+ border-radius: 4px;
36
+ color: var(--text-secondary);
37
+ cursor: pointer;
38
+ font-size: 0.9rem;
39
+ text-decoration: none;
40
+ display: inline-block;
41
+ }
42
+ .back-button:hover {
43
+ color: var(--text-primary);
44
+ background: var(--bg-tertiary);
45
+ }
46
+ h1 {
47
+ font-size: 1.5em;
48
+ margin: 1rem 0;
49
+ color: var(--text-primary);
50
+ border-bottom: 1px solid var(--border-primary);
51
+ padding-bottom: 0.5rem;
52
+ }
53
+ ul {
54
+ list-style-type: none;
55
+ padding: 0;
56
+ }
57
+ li {
58
+ margin: 0;
59
+ border-bottom: 1px solid var(--border-primary);
60
+ }
61
+ li:last-child {
62
+ border-bottom: none;
63
+ }
64
+ a {
65
+ display: block;
66
+ padding: 0.75rem 0.5rem;
67
+ text-decoration: none;
68
+ color: var(--text-link);
69
+ transition: background 0.2s ease;
70
+ }
71
+ a:hover {
72
+ background: var(--bg-secondary);
73
+ }
74
+ .dir {
75
+ font-weight: 500;
76
+ }
77
+ </style>
78
+ </head>
79
+ <body>
80
+ <div class='controls'>
81
+ <a href='../index.html' class='back-button'>← back</a>
82
+ </div>
83
+ <h1>Index of /activation/results_linux</h1>
84
+ <ul>
85
+ <li><a href='combined_results.html' class='file'>combined_results.html</a></li>
86
+ </ul>
87
+ </body>
88
+ </html>
causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl CHANGED
@@ -1,24 +1,24 @@
1
- {"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S128_W2", "batch": 2, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.07121199996618088, "p50": 0.07280099998752121, "p90": 0.07319100001268453, "mean": 0.07264140000415864, "iqr": 0.0009299999987888441, "raw_times": [0.07226100001389568, 0.07280099998752121, 0.07121199996618088, 0.07319100001268453, 0.0737420000405109], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.07994200001348872, "peak_bytes": 295936, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
2
- {"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S128_W4", "batch": 2, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08309200001122008, "p50": 0.08365100001128667, "p90": 0.08423100001664352, "mean": 0.08372920000283557, "iqr": 0.0009500000146545062, "raw_times": [0.08328100000198901, 0.08439099997303856, 0.08365100001128667, 0.08309200001122008, 0.08423100001664352], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08683200002224112, "peak_bytes": 296448, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
3
- {"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S512_W2", "batch": 2, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08177099999784332, "p50": 0.08268100003760992, "p90": 0.08301100001517625, "mean": 0.08273520002148871, "iqr": 0.0009299999987888441, "raw_times": [0.0820810000163874, 0.08413200004042665, 0.08301100001517625, 0.08268100003760992, 0.08177099999784332], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08564099999830432, "peak_bytes": 1180672, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
4
- {"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S512_W4", "batch": 2, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08037100002411535, "p50": 0.0813119999634182, "p90": 0.08183099998859689, "mean": 0.08147719998987668, "iqr": 0.0007300000106624793, "raw_times": [0.08037100002411535, 0.08183099998859689, 0.08277099999531856, 0.08110099997793441, 0.0813119999634182], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08382099997561454, "peak_bytes": 1181184, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
5
- {"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S2048_W2", "batch": 2, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.07959199996321331, "p50": 0.08119099999248647, "p90": 0.08157100000971695, "mean": 0.08080360000803921, "iqr": 0.0016889999869817984, "raw_times": [0.08157100000971695, 0.08178200005204417, 0.08119099999248647, 0.07988200002273516, 0.07959199996321331], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08668100002751089, "peak_bytes": 4719616, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
6
- {"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S2048_W4", "batch": 2, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0801809999870784, "p50": 0.08213100005605156, "p90": 0.08258200000454963, "mean": 0.08208560000184661, "iqr": 0.0013300000318849925, "raw_times": [0.0801809999870784, 0.08213100005605156, 0.08428199998888886, 0.08258200000454963, 0.08125199997266463], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08745099995621786, "peak_bytes": 4720128, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
7
- {"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S128_W2", "batch": 2, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.07887199996048366, "p50": 0.08016100002805615, "p90": 0.08020199999236866, "mean": 0.0800293999986934, "iqr": 7.099998811099795e-05, "raw_times": [0.08078100000830091, 0.08013100000425766, 0.08016100002805615, 0.08020199999236866, 0.07887199996048366], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0863010000102804, "peak_bytes": 9461760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
8
- {"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S128_W4", "batch": 2, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.07991100000026563, "p50": 0.0806710000347266, "p90": 0.08072099996070392, "mean": 0.08052299999690149, "iqr": 0.0007399999617518915, "raw_times": [0.0806710000347266, 0.07998099999895203, 0.08133099998985927, 0.08072099996070392, 0.07991100000026563], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08638100001689963, "peak_bytes": 9478144, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
9
- {"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S512_W2", "batch": 2, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.07996199997251097, "p50": 0.08190199997670788, "p90": 0.08266099996490084, "mean": 0.08240359998126223, "iqr": 0.0024089999897114467, "raw_times": [0.07996199997251097, 0.08724100001700208, 0.08266099996490084, 0.08025199997518939, 0.08190199997670788], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0852109999982531, "peak_bytes": 37773312, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
10
- {"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S512_W4", "batch": 2, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08023199995932373, "p50": 0.08207100000845458, "p90": 0.08239099997808808, "mean": 0.08726959998739403, "iqr": 0.0012489999789977446, "raw_times": [0.08239099997808808, 0.11051199999201344, 0.08023199995932373, 0.08114199999909033, 0.08207100000845458], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0862620000248171, "peak_bytes": 37789696, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
11
- {"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S2048_W2", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.1583530000175415, "p50": 0.15909299997929338, "p90": 0.15925299999253184, "mean": 0.15925679999782005, "iqr": 0.00036099999078942346, "raw_times": [0.15909299997929338, 0.15889200000174242, 0.16069299999799114, 0.15925299999253184, 0.1583530000175415], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.1604330000191112, "peak_bytes": 151019520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
12
- {"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S2048_W4", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.16316300002472417, "p50": 0.164162999965356, "p90": 0.16488300002492906, "mean": 0.16412500000342334, "iqr": 0.001400000030571391, "raw_times": [0.164162999965356, 0.16316300002472417, 0.16488300002492906, 0.1649330000077498, 0.16348299999435767], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.16465299995616078, "peak_bytes": 151035904, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
13
- {"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S128_W2", "batch": 4, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.07947099999228158, "p50": 0.08210099997540965, "p90": 0.08814200003826045, "mean": 0.08720339999399584, "iqr": 0.0077410000471900275, "raw_times": [0.08814200003826045, 0.08040099999107042, 0.10590199997295713, 0.08210099997540965, 0.07947099999228158], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08537200000091616, "peak_bytes": 33727488, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
14
- {"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S128_W4", "batch": 4, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.07920100000546881, "p50": 0.0799709999910192, "p90": 0.0812010000004193, "mean": 0.08044520000112243, "iqr": 0.0014700000292577897, "raw_times": [0.07973099997116151, 0.08212200003754333, 0.0812010000004193, 0.07920100000546881, 0.0799709999910192], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0854219999837369, "peak_bytes": 591360, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
15
- {"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S512_W2", "batch": 4, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.07912099999884958, "p50": 0.08021200000030149, "p90": 0.08024099997783196, "mean": 0.07987539999021465, "iqr": 0.0010200000133409048, "raw_times": [0.07912099999884958, 0.07922099996449106, 0.08021200000030149, 0.08024099997783196, 0.08058200000959914], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.084330999982285, "peak_bytes": 2360320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
16
- {"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S512_W4", "batch": 4, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08098199998585187, "p50": 0.08176099998991049, "p90": 0.08322100001123545, "mean": 0.08342759999777627, "iqr": 0.0014889999988554337, "raw_times": [0.08098199998585187, 0.08322100001123545, 0.08176099998991049, 0.08173200001238001, 0.08944199998950353], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08506099999294747, "peak_bytes": 2360832, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
17
- {"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S2048_W2", "batch": 4, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.07974099997909434, "p50": 0.0811310000017329, "p90": 0.08221100000582737, "mean": 0.08252540000057706, "iqr": 0.001659000020026724, "raw_times": [0.07974099997909434, 0.08221100000582737, 0.0811310000017329, 0.08899200003043006, 0.08055199998580065], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0852420000114762, "peak_bytes": 9438208, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
18
- {"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S2048_W4", "batch": 4, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08050200000297991, "p50": 0.08192199999257355, "p90": 0.08227199998600554, "mean": 0.08175159999836978, "iqr": 0.000530999955117295, "raw_times": [0.08174100003088824, 0.08050200000297991, 0.08192199999257355, 0.08227199998600554, 0.08232099997940168], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0854219999837369, "peak_bytes": 9438720, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
19
- {"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S128_W2", "batch": 4, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.07911099999091675, "p50": 0.08030099996858553, "p90": 0.08107100001097933, "mean": 0.08047899999610308, "iqr": 0.0018000000068241206, "raw_times": [0.08030099996858553, 0.0826410000058786, 0.08107100001097933, 0.07911099999091675, 0.07927100000415521], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0837320000073305, "peak_bytes": 18931712, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
20
- {"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S128_W4", "batch": 4, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0812719999885303, "p50": 0.08212100004811873, "p90": 0.08214100000714097, "mean": 0.08211120000396477, "iqr": 0.00043000000005122274, "raw_times": [0.08171100000708975, 0.08212100004811873, 0.08331099996894409, 0.08214100000714097, 0.0812719999885303], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0876720000064779, "peak_bytes": 18948096, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
21
- {"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S512_W2", "batch": 4, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09240200000704135, "p50": 0.09415100004162014, "p90": 0.09504199999810226, "mean": 0.09392180000986627, "iqr": 0.002310000013494573, "raw_times": [0.09240200000704135, 0.09273199998460768, 0.09504199999810226, 0.09528200001795994, 0.09415100004162014], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09613200001012956, "peak_bytes": 75522048, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
22
- {"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S512_W4", "batch": 4, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09898100000782506, "p50": 0.09988099998281541, "p90": 0.10064200000670098, "mean": 0.1000036000050386, "iqr": 0.000919999990856013, "raw_times": [0.09972200001584497, 0.09898100000782506, 0.09988099998281541, 0.10079200001200661, 0.10064200000670098], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.10166200002004189, "peak_bytes": 75538432, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
23
- {"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S2048_W2", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.4828179999663007, "p50": 0.4840080000008129, "p90": 0.4856980000340627, "mean": 0.48515199999883407, "iqr": 0.0026800000227922283, "raw_times": [0.4830180000112705, 0.4828179999663007, 0.4856980000340627, 0.49021799998172355, 0.4840080000008129], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.4863379999733297, "peak_bytes": 302014464, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
24
- {"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S2048_W4", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.49664799996662623, "p50": 0.4980280000381754, "p90": 0.5011090000266449, "mean": 0.4989826000155517, "iqr": 0.004230000001825829, "raw_times": [0.4980280000381754, 0.502249000021493, 0.49664799996662623, 0.4968790000248191, 0.5011090000266449], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.49949900000001435, "peak_bytes": 302030848, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
 
1
+ {"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S128_W2", "batch": 2, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04411100007928326, "p50": 0.04549100003714557, "p90": 0.046580999878642615, "mean": 0.045672999976886786, "iqr": 0.0013399999261309858, "raw_times": [0.04524099995251163, 0.04549100003714557, 0.04694099993685086, 0.046580999878642615, 0.04411100007928326], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05584099994848657, "peak_bytes": 295936, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
2
+ {"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S128_W4", "batch": 2, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05282100005388202, "p50": 0.05301199985296989, "p90": 0.053511999794864096, "mean": 0.053615399929185514, "iqr": 0.0006809998467360856, "raw_times": [0.05282100005388202, 0.05301199985296989, 0.05283099994812801, 0.055900999996083556, 0.053511999794864096], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05840199992235284, "peak_bytes": 296448, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
3
+ {"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S512_W2", "batch": 2, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05104100000608014, "p50": 0.05199099996389123, "p90": 0.052391000053830794, "mean": 0.05195319999984349, "iqr": 0.0011390000054234406, "raw_times": [0.05125200004840735, 0.05309099992700794, 0.05199099996389123, 0.05104100000608014, 0.052391000053830794], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05686200006493891, "peak_bytes": 1180672, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
4
+ {"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S512_W4", "batch": 2, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.049951999926634016, "p50": 0.052720999974553706, "p90": 0.05275099988466536, "mean": 0.05206719993111619, "iqr": 0.001500000053056283, "raw_times": [0.049951999926634016, 0.052720999974553706, 0.0536610000381188, 0.05275099988466536, 0.05125099983160908], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.056871999959184905, "peak_bytes": 1181184, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
5
+ {"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S2048_W2", "batch": 2, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04956099996888952, "p50": 0.05162100001143699, "p90": 0.05194100003791391, "mean": 0.05130520003149286, "iqr": 0.0009590000900061568, "raw_times": [0.05194100003791391, 0.052421000191316125, 0.05098199994790775, 0.04956099996888952, 0.05162100001143699], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05496100015989214, "peak_bytes": 4719616, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
6
+ {"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S2048_W4", "batch": 2, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05030099987379799, "p50": 0.05167099993741431, "p90": 0.05240099994807679, "mean": 0.05315919993336138, "iqr": 0.00083000008999079, "raw_times": [0.05030099987379799, 0.05985200004943181, 0.05240099994807679, 0.051570999858086, 0.05167099993741431], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.055181000107040745, "peak_bytes": 4720128, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
7
+ {"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S128_W2", "batch": 2, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04961100012224051, "p50": 0.05098099995848315, "p90": 0.05157100008545967, "mean": 0.05098120004731754, "iqr": 0.0007500000265281415, "raw_times": [0.04961100012224051, 0.051922000011472846, 0.05082100005893153, 0.05098099995848315, 0.05157100008545967], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.053892000096311676, "peak_bytes": 9461760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
8
+ {"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S128_W4", "batch": 2, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.049450999995315215, "p50": 0.049821000175143126, "p90": 0.05057099997429759, "mean": 0.050117000046157045, "iqr": 0.0009599998520570807, "raw_times": [0.049450999995315215, 0.049821000175143126, 0.05113099996378878, 0.05057099997429759, 0.04961100012224051], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0559909999537922, "peak_bytes": 9478144, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
9
+ {"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S512_W2", "batch": 2, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04905100013274932, "p50": 0.05060100011178292, "p90": 0.05113099996378878, "mean": 0.051737200055868016, "iqr": 0.0005899998996028444, "raw_times": [0.057362000006833114, 0.05113099996378878, 0.05060100011178292, 0.05054100006418594, 0.04905100013274932], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05664199989041663, "peak_bytes": 37773312, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
10
+ {"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S512_W4", "batch": 2, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05021099991608935, "p50": 0.051011000095968484, "p90": 0.05146100011188537, "mean": 0.05368720007936645, "iqr": 0.0007099999947968172, "raw_times": [0.05075100011708855, 0.05021099991608935, 0.06500200015580049, 0.051011000095968484, 0.05146100011188537], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05477099989548151, "peak_bytes": 37789696, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
11
+ {"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S2048_W2", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04905100013274932, "p50": 0.0506710000536259, "p90": 0.05074099999546888, "mean": 0.05034520004301157, "iqr": 0.0009190000582748326, "raw_times": [0.04905100013274932, 0.04982199993719405, 0.05074099999546888, 0.0506710000536259, 0.05144100009601971], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05602199985332845, "peak_bytes": 151019520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
12
+ {"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S2048_W4", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04956099996888952, "p50": 0.05081099993731186, "p90": 0.05169099995327997, "mean": 0.05088699999760138, "iqr": 0.001219999830937013, "raw_times": [0.04956099996888952, 0.05081099993731186, 0.05169099995327997, 0.05047100012234296, 0.051901000006182585], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09174200022243895, "peak_bytes": 151035904, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
13
+ {"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S128_W2", "batch": 4, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04988099999536644, "p50": 0.05099100008010282, "p90": 0.05105099990032613, "mean": 0.052221000032659504, "iqr": 0.0010199998996540671, "raw_times": [0.050031000000672066, 0.05105099990032613, 0.05915100018683006, 0.04988099999536644, 0.05099100008010282], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05531200008590531, "peak_bytes": 33727488, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
14
+ {"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S128_W4", "batch": 4, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05021099991608935, "p50": 0.051081000037811464, "p90": 0.05169199994270457, "mean": 0.0512771999638062, "iqr": 0.0008909998996387003, "raw_times": [0.05169199994270457, 0.05080100004306587, 0.051081000037811464, 0.05021099991608935, 0.05260099987935973], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05523100003301806, "peak_bytes": 591360, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
15
+ {"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S512_W2", "batch": 4, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05031099999541766, "p50": 0.05089100000077451, "p90": 0.05121100002725143, "mean": 0.05099300005895202, "iqr": 0.0007899998308857903, "raw_times": [0.05031099999541766, 0.05121100002725143, 0.05213100007495086, 0.05042100019636564, 0.05089100000077451], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05383200004871469, "peak_bytes": 2360320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
16
+ {"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S512_W4", "batch": 4, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04998199983674567, "p50": 0.05088100010652852, "p90": 0.05478100001710118, "mean": 0.05373740004870342, "iqr": 0.004679999847212457, "raw_times": [0.04998199983674567, 0.05478100001710118, 0.05010100016988872, 0.05088100010652852, 0.06294200011325302], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.055170999985421076, "peak_bytes": 2360832, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
17
+ {"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S2048_W2", "batch": 4, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0490120000904426, "p50": 0.050441999974282226, "p90": 0.0519509999321599, "mean": 0.05078939998384158, "iqr": 0.0017099998785852222, "raw_times": [0.0490120000904426, 0.05230099986874848, 0.050441999974282226, 0.05024100005357468, 0.0519509999321599], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.055800999916755245, "peak_bytes": 9438208, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
18
+ {"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S2048_W4", "batch": 4, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04950099992129253, "p50": 0.050521000048320275, "p90": 0.05074099999546888, "mean": 0.05043319997639628, "iqr": 0.0008590000106778461, "raw_times": [0.04988199998479104, 0.05152099993210868, 0.050521000048320275, 0.05074099999546888, 0.04950099992129253], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.054772000112279784, "peak_bytes": 9438720, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
19
+ {"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S128_W2", "batch": 4, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.049130999968838296, "p50": 0.05138200003784732, "p90": 0.05160199998499593, "mean": 0.05141159999766387, "iqr": 0.0003800000740739051, "raw_times": [0.049130999968838296, 0.05372100008571579, 0.05122199991092202, 0.05160199998499593, 0.05138200003784732], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05565100013882329, "peak_bytes": 18931712, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
20
+ {"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S128_W4", "batch": 4, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05034100013290299, "p50": 0.05126099995322875, "p90": 0.051630999905682984, "mean": 0.051170999995520106, "iqr": 0.0007999999525054591, "raw_times": [0.05126099995322875, 0.05034100013290299, 0.051630999905682984, 0.05179100003260828, 0.050830999953177525], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05449200011753419, "peak_bytes": 18948096, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
21
+ {"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S512_W2", "batch": 4, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05069099984211789, "p50": 0.05091200000606477, "p90": 0.05127099984747474, "mean": 0.07049399991956307, "iqr": 0.00038899997889529914, "raw_times": [0.05069099984211789, 0.1487140000335785, 0.05091200000606477, 0.05088199986857944, 0.05127099984747474], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.054521000038221246, "peak_bytes": 75522048, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
22
+ {"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S512_W4", "batch": 4, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.049631000138106174, "p50": 0.05295099981594831, "p90": 0.05334100001164188, "mean": 0.057009199963431456, "iqr": 0.0027099999897473026, "raw_times": [0.049631000138106174, 0.05295099981594831, 0.05334100001164188, 0.05063100002189458, 0.07849199982956634], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05502199996954005, "peak_bytes": 75538432, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
23
+ {"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S2048_W2", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0506710000536259, "p50": 0.05091100001664017, "p90": 0.051161999863325036, "mean": 0.0510071999997308, "iqr": 0.00031099989428184927, "raw_times": [0.05091100001664017, 0.0506710000536259, 0.05085099996904319, 0.05144100009601971, 0.051161999863325036], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.057221000133722555, "peak_bytes": 302014464, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
24
+ {"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S2048_W4", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04936100003760657, "p50": 0.050112000053559314, "p90": 0.05135099991093739, "mean": 0.050513199994384195, "iqr": 0.0015199998415482696, "raw_times": [0.04936100003760657, 0.05135099991093739, 0.04983100006938912, 0.05191099990042858, 0.050112000053559314], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05625199992209673, "peak_bytes": 302030848, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
causal_conv1d/impls/cells/benchmark.py CHANGED
@@ -4,37 +4,28 @@
4
  # "numpy",
5
  # "torch==2.8.0",
6
  # "kernels-benchmark-tools",
 
7
  # ]
8
  #
9
  # [tool.uv.sources]
10
  # kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
11
  # ///
12
  import torch
13
- import torch.nn.functional as F
14
  import sys
15
  from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
 
16
 
 
 
17
 
18
- def torch_causal_conv1d(input_tensor, weight, bias):
19
- # Convert to weight dtype for computation
20
- x = input_tensor.to(weight.dtype)
21
- dim = weight.shape[0]
22
- width = weight.shape[1]
23
- seqlen = input_tensor.shape[-1]
24
 
25
- # Depthwise causal conv1d using PyTorch
26
- out = F.conv1d(x, weight.unsqueeze(1), bias, padding=width - 1, groups=dim)
27
-
28
- # Truncate to original sequence length
29
- out = out[..., :seqlen]
30
-
31
- # Convert back to original dtype
32
- return out.to(input_tensor.dtype)
33
 
34
 
35
  run_benchmark(
36
  kernel_type=KernelTypeEnum.CAUSAL_CONV1D,
37
- impl_name="torch_eager",
38
- impl_tags={"family": "pytorch", "backend": "eager"},
39
- impl_func=torch_causal_conv1d,
40
  )
 
4
  # "numpy",
5
  # "torch==2.8.0",
6
  # "kernels-benchmark-tools",
7
+ # "kernels",
8
  # ]
9
  #
10
  # [tool.uv.sources]
11
  # kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
12
  # ///
13
  import torch
 
14
  import sys
15
  from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
16
+ from kernels import get_kernel
17
 
18
+ # Load the causal conv1d kernel
19
+ causal_conv1d = get_kernel("kernels-community/causal-conv1d")
20
 
 
 
 
 
 
 
21
 
22
+ def hf_kernels_causal_conv1d(input_tensor, weight, bias):
23
+ return causal_conv1d.causal_conv1d_fn(input_tensor, weight, bias)
 
 
 
 
 
 
24
 
25
 
26
  run_benchmark(
27
  kernel_type=KernelTypeEnum.CAUSAL_CONV1D,
28
+ impl_name="hf_kernels_causal_conv1d",
29
+ impl_tags={"family": "hf-kernels", "backend": "cuda"},
30
+ impl_func=hf_kernels_causal_conv1d,
31
  )
causal_conv1d/impls/hf_kernels_causal_conv1d.html CHANGED
The diff for this file is too large to render. See raw diff
 
causal_conv1d/impls/torch_causal_conv1d.html CHANGED
The diff for this file is too large to render. See raw diff
 
causal_conv1d/results/artifacts/combine/latency.svg CHANGED

Git LFS Details

  • SHA256: 69a09b74de90d8a70bffd10eab24eac79df8e4954f3e91129689ad6a56422eed
  • Pointer size: 130 Bytes
  • Size of remote file: 35.4 kB

Git LFS Details

  • SHA256: a2617076455d3985f32d3652d376c64caac8acd7513e105352d8ccd515d5c005
  • Pointer size: 130 Bytes
  • Size of remote file: 35.4 kB
causal_conv1d/results/combined_results.html CHANGED
@@ -3889,7 +3889,7 @@ body[data-tool="eraser"] .main-content {
3889
  <rdf:RDF>
3890
  <ns2:Work>
3891
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
3892
- <dc:date>2025-12-19T19:09:46.065014</dc:date>
3893
  <dc:format>image/svg+xml</dc:format>
3894
  <dc:creator>
3895
  <ns2:Agent>
@@ -4233,70 +4233,70 @@ body[data-tool="eraser"] .main-content {
4233
  <g id="matplotlib.axis_2">
4234
  <g id="ytick_1">
4235
  <g id="grid-y--2" class="grid grid-y">
4236
- <path d="M 47.72 375.695489 L 831.034248 375.695489 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4237
  </g>
4238
  <g id="line2d_25">
4239
  <defs>
4240
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4241
  </defs>
4242
  <g>
4243
- <use ns4:href="#m0fca2865ba" x="47.72" y="375.695489" style="stroke: #000000; stroke-width: 0.8" />
4244
  </g>
4245
  </g>
4246
  <g id="text_25">
4247
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="379.494708" transform="rotate(-0 40.72 379.494708)">0.1</text>
4248
  </g>
4249
  </g>
4250
  <g id="ytick_2">
4251
  <g id="grid-y--3" class="grid grid-y">
4252
- <path d="M 47.72 292.764994 L 831.034248 292.764994 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4253
  </g>
4254
  <g id="line2d_26">
4255
  <g>
4256
- <use ns4:href="#m0fca2865ba" x="47.72" y="292.764994" style="stroke: #000000; stroke-width: 0.8" />
4257
  </g>
4258
  </g>
4259
  <g id="text_26">
4260
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="296.564213" transform="rotate(-0 40.72 296.564213)">0.2</text>
4261
  </g>
4262
  </g>
4263
  <g id="ytick_3">
4264
  <g id="grid-y--4" class="grid grid-y">
4265
- <path d="M 47.72 209.834499 L 831.034248 209.834499 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4266
  </g>
4267
  <g id="line2d_27">
4268
  <g>
4269
- <use ns4:href="#m0fca2865ba" x="47.72" y="209.834499" style="stroke: #000000; stroke-width: 0.8" />
4270
  </g>
4271
  </g>
4272
  <g id="text_27">
4273
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="213.633718" transform="rotate(-0 40.72 213.633718)">0.3</text>
4274
  </g>
4275
  </g>
4276
  <g id="ytick_4">
4277
  <g id="grid-y--5" class="grid grid-y">
4278
- <path d="M 47.72 126.904004 L 831.034248 126.904004 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4279
  </g>
4280
  <g id="line2d_28">
4281
  <g>
4282
- <use ns4:href="#m0fca2865ba" x="47.72" y="126.904004" style="stroke: #000000; stroke-width: 0.8" />
4283
  </g>
4284
  </g>
4285
  <g id="text_28">
4286
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="130.703223" transform="rotate(-0 40.72 130.703223)">0.4</text>
4287
  </g>
4288
  </g>
4289
  <g id="ytick_5">
4290
  <g id="grid-y--6" class="grid grid-y">
4291
- <path d="M 47.72 43.973509 L 831.034248 43.973509 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4292
  </g>
4293
  <g id="line2d_29">
4294
  <g>
4295
- <use ns4:href="#m0fca2865ba" x="47.72" y="43.973509" style="stroke: #000000; stroke-width: 0.8" />
4296
  </g>
4297
  </g>
4298
  <g id="text_29">
4299
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="47.772728" transform="rotate(-0 40.72 47.772728)">0.5</text>
4300
  </g>
4301
  </g>
4302
  <g id="label--y" class="ylabel">
@@ -4304,66 +4304,66 @@ body[data-tool="eraser"] .main-content {
4304
  </g>
4305
  </g>
4306
  <g id="series--hf-kernels-causal-conv1d" class="series">
4307
- <path d="M 83.325193 420.186871 L 114.286231 415.418367 L 145.247268 416.273381 L 176.208306 416.636616 L 207.169343 417.035512 L 238.130381 417.110979 L 269.091418 417.475043 L 300.052455 417.781886 L 331.013493 417.616025 L 361.97453 417.516509 L 392.935568 417.674077 L 423.896605 417.018926 L 454.857643 417.358941 L 485.81868 417.823352 L 516.779718 416.969997 L 547.740755 416.968338 L 578.701793 417.640904 L 609.66283 417.300889 L 640.623868 417.38382 L 671.584905 417.980919 L 702.545943 416.836479 L 733.50698 417.110149 L 764.468018 417.450164 L 795.429055 417.582853 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4308
  <defs>
4309
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4310
  </defs>
4311
  <g clip-path="url(#pb49fc4c8d2)">
4312
  <use ns4:href="#md7efaf3aec" x="83.325193" y="420.186871" style="fill: #1f77b4; stroke: #1f77b4" />
4313
- <use ns4:href="#md7efaf3aec" x="114.286231" y="415.418367" style="fill: #1f77b4; stroke: #1f77b4" />
4314
- <use ns4:href="#md7efaf3aec" x="145.247268" y="416.273381" style="fill: #1f77b4; stroke: #1f77b4" />
4315
- <use ns4:href="#md7efaf3aec" x="176.208306" y="416.636616" style="fill: #1f77b4; stroke: #1f77b4" />
4316
- <use ns4:href="#md7efaf3aec" x="207.169343" y="417.035512" style="fill: #1f77b4; stroke: #1f77b4" />
4317
- <use ns4:href="#md7efaf3aec" x="238.130381" y="417.110979" style="fill: #1f77b4; stroke: #1f77b4" />
4318
- <use ns4:href="#md7efaf3aec" x="269.091418" y="417.475043" style="fill: #1f77b4; stroke: #1f77b4" />
4319
- <use ns4:href="#md7efaf3aec" x="300.052455" y="417.781886" style="fill: #1f77b4; stroke: #1f77b4" />
4320
- <use ns4:href="#md7efaf3aec" x="331.013493" y="417.616025" style="fill: #1f77b4; stroke: #1f77b4" />
4321
- <use ns4:href="#md7efaf3aec" x="361.97453" y="417.516509" style="fill: #1f77b4; stroke: #1f77b4" />
4322
- <use ns4:href="#md7efaf3aec" x="392.935568" y="417.674077" style="fill: #1f77b4; stroke: #1f77b4" />
4323
- <use ns4:href="#md7efaf3aec" x="423.896605" y="417.018926" style="fill: #1f77b4; stroke: #1f77b4" />
4324
- <use ns4:href="#md7efaf3aec" x="454.857643" y="417.358941" style="fill: #1f77b4; stroke: #1f77b4" />
4325
- <use ns4:href="#md7efaf3aec" x="485.81868" y="417.823352" style="fill: #1f77b4; stroke: #1f77b4" />
4326
- <use ns4:href="#md7efaf3aec" x="516.779718" y="416.969997" style="fill: #1f77b4; stroke: #1f77b4" />
4327
- <use ns4:href="#md7efaf3aec" x="547.740755" y="416.968338" style="fill: #1f77b4; stroke: #1f77b4" />
4328
- <use ns4:href="#md7efaf3aec" x="578.701793" y="417.640904" style="fill: #1f77b4; stroke: #1f77b4" />
4329
- <use ns4:href="#md7efaf3aec" x="609.66283" y="417.300889" style="fill: #1f77b4; stroke: #1f77b4" />
4330
- <use ns4:href="#md7efaf3aec" x="640.623868" y="417.38382" style="fill: #1f77b4; stroke: #1f77b4" />
4331
- <use ns4:href="#md7efaf3aec" x="671.584905" y="417.980919" style="fill: #1f77b4; stroke: #1f77b4" />
4332
- <use ns4:href="#md7efaf3aec" x="702.545943" y="416.836479" style="fill: #1f77b4; stroke: #1f77b4" />
4333
- <use ns4:href="#md7efaf3aec" x="733.50698" y="417.110149" style="fill: #1f77b4; stroke: #1f77b4" />
4334
- <use ns4:href="#md7efaf3aec" x="764.468018" y="417.450164" style="fill: #1f77b4; stroke: #1f77b4" />
4335
- <use ns4:href="#md7efaf3aec" x="795.429055" y="417.582853" style="fill: #1f77b4; stroke: #1f77b4" />
4336
  </g>
4337
  </g>
4338
  <g id="series--torch-eager" class="series">
4339
- <path d="M 83.325193 398.251755 L 114.286231 389.253796 L 145.247268 390.058222 L 176.208306 391.19354 L 207.169343 391.293886 L 238.130381 390.514339 L 269.091418 392.14807 L 300.052455 391.725125 L 331.013493 390.70425 L 361.97453 390.564098 L 392.935568 326.689372 L 423.896605 322.484796 L 454.857643 390.539219 L 485.81868 392.305638 L 516.779718 392.105776 L 547.740755 390.821182 L 578.701793 391.343644 L 609.66283 390.687664 L 640.623868 392.031968 L 671.584905 390.522632 L 702.545943 380.546094 L 733.50698 375.794177 L 764.468018 57.235754 L 795.429055 45.608899 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4340
  <defs>
4341
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4342
  </defs>
4343
  <g clip-path="url(#pb49fc4c8d2)">
4344
- <use ns4:href="#m9b8c54d372" x="83.325193" y="398.251755" style="fill: #ff7f0e; stroke: #ff7f0e" />
4345
- <use ns4:href="#m9b8c54d372" x="114.286231" y="389.253796" style="fill: #ff7f0e; stroke: #ff7f0e" />
4346
- <use ns4:href="#m9b8c54d372" x="145.247268" y="390.058222" style="fill: #ff7f0e; stroke: #ff7f0e" />
4347
- <use ns4:href="#m9b8c54d372" x="176.208306" y="391.19354" style="fill: #ff7f0e; stroke: #ff7f0e" />
4348
- <use ns4:href="#m9b8c54d372" x="207.169343" y="391.293886" style="fill: #ff7f0e; stroke: #ff7f0e" />
4349
- <use ns4:href="#m9b8c54d372" x="238.130381" y="390.514339" style="fill: #ff7f0e; stroke: #ff7f0e" />
4350
- <use ns4:href="#m9b8c54d372" x="269.091418" y="392.14807" style="fill: #ff7f0e; stroke: #ff7f0e" />
4351
- <use ns4:href="#m9b8c54d372" x="300.052455" y="391.725125" style="fill: #ff7f0e; stroke: #ff7f0e" />
4352
- <use ns4:href="#m9b8c54d372" x="331.013493" y="390.70425" style="fill: #ff7f0e; stroke: #ff7f0e" />
4353
- <use ns4:href="#m9b8c54d372" x="361.97453" y="390.564098" style="fill: #ff7f0e; stroke: #ff7f0e" />
4354
- <use ns4:href="#m9b8c54d372" x="392.935568" y="326.689372" style="fill: #ff7f0e; stroke: #ff7f0e" />
4355
- <use ns4:href="#m9b8c54d372" x="423.896605" y="322.484796" style="fill: #ff7f0e; stroke: #ff7f0e" />
4356
- <use ns4:href="#m9b8c54d372" x="454.857643" y="390.539219" style="fill: #ff7f0e; stroke: #ff7f0e" />
4357
- <use ns4:href="#m9b8c54d372" x="485.81868" y="392.305638" style="fill: #ff7f0e; stroke: #ff7f0e" />
4358
- <use ns4:href="#m9b8c54d372" x="516.779718" y="392.105776" style="fill: #ff7f0e; stroke: #ff7f0e" />
4359
- <use ns4:href="#m9b8c54d372" x="547.740755" y="390.821182" style="fill: #ff7f0e; stroke: #ff7f0e" />
4360
- <use ns4:href="#m9b8c54d372" x="578.701793" y="391.343644" style="fill: #ff7f0e; stroke: #ff7f0e" />
4361
- <use ns4:href="#m9b8c54d372" x="609.66283" y="390.687664" style="fill: #ff7f0e; stroke: #ff7f0e" />
4362
- <use ns4:href="#m9b8c54d372" x="640.623868" y="392.031968" style="fill: #ff7f0e; stroke: #ff7f0e" />
4363
- <use ns4:href="#m9b8c54d372" x="671.584905" y="390.522632" style="fill: #ff7f0e; stroke: #ff7f0e" />
4364
- <use ns4:href="#m9b8c54d372" x="702.545943" y="380.546094" style="fill: #ff7f0e; stroke: #ff7f0e" />
4365
- <use ns4:href="#m9b8c54d372" x="733.50698" y="375.794177" style="fill: #ff7f0e; stroke: #ff7f0e" />
4366
- <use ns4:href="#m9b8c54d372" x="764.468018" y="57.235754" style="fill: #ff7f0e; stroke: #ff7f0e" />
4367
  <use ns4:href="#m9b8c54d372" x="795.429055" y="45.608899" style="fill: #ff7f0e; stroke: #ff7f0e" />
4368
  </g>
4369
  </g>
@@ -4422,7 +4422,7 @@ body[data-tool="eraser"] .main-content {
4422
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
4423
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
4424
  </span> |
4425
- Cell: combine | 4.61s
4426
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
4427
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
4428
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
@@ -4547,7 +4547,7 @@ torch_eager cuda_B2_D64_S512_W2 0.08 True
4547
  torch_eager cuda_B2_D64_S512_W4 0.08 True
4548
  torch_eager cuda_B4_D2048_S128_W2 0.08 True
4549
  torch_eager cuda_B4_D2048_S128_W4 0.08 True
4550
- torch_eager cuda_B4_D2048_S2048_W2 0.48 True
4551
  torch_eager cuda_B4_D2048_S2048_W4 0.50 True
4552
  torch_eager cuda_B4_D2048_S512_W2 0.09 True
4553
  torch_eager cuda_B4_D2048_S512_W4 0.10 True
@@ -4576,7 +4576,7 @@ Implementations included:
4576
  <div class="uv-install-logs" id="uv-logs-combine">
4577
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4578
  <div class="uv-logs-content" style="display: none;">
4579
- Installed 37 packages in 314ms
4580
  </div>
4581
  </div>
4582
  <div class="cell-artifacts">
@@ -4589,7 +4589,7 @@ Installed 37 packages in 314ms
4589
  <rdf:RDF>
4590
  <ns2:Work>
4591
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4592
- <dc:date>2025-12-19T19:09:46.065014</dc:date>
4593
  <dc:format>image/svg+xml</dc:format>
4594
  <dc:creator>
4595
  <ns2:Agent>
@@ -4933,70 +4933,70 @@ Installed 37 packages in 314ms
4933
  <g id="matplotlib.axis_2">
4934
  <g id="ytick_1">
4935
  <g id="grid-y--2" class="grid grid-y">
4936
- <path d="M 47.72 375.695489 L 831.034248 375.695489 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4937
  </g>
4938
  <g id="line2d_25">
4939
  <defs>
4940
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4941
  </defs>
4942
  <g>
4943
- <use ns4:href="#m0fca2865ba" x="47.72" y="375.695489" style="stroke: #000000; stroke-width: 0.8" />
4944
  </g>
4945
  </g>
4946
  <g id="text_25">
4947
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="379.494708" transform="rotate(-0 40.72 379.494708)">0.1</text>
4948
  </g>
4949
  </g>
4950
  <g id="ytick_2">
4951
  <g id="grid-y--3" class="grid grid-y">
4952
- <path d="M 47.72 292.764994 L 831.034248 292.764994 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4953
  </g>
4954
  <g id="line2d_26">
4955
  <g>
4956
- <use ns4:href="#m0fca2865ba" x="47.72" y="292.764994" style="stroke: #000000; stroke-width: 0.8" />
4957
  </g>
4958
  </g>
4959
  <g id="text_26">
4960
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="296.564213" transform="rotate(-0 40.72 296.564213)">0.2</text>
4961
  </g>
4962
  </g>
4963
  <g id="ytick_3">
4964
  <g id="grid-y--4" class="grid grid-y">
4965
- <path d="M 47.72 209.834499 L 831.034248 209.834499 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4966
  </g>
4967
  <g id="line2d_27">
4968
  <g>
4969
- <use ns4:href="#m0fca2865ba" x="47.72" y="209.834499" style="stroke: #000000; stroke-width: 0.8" />
4970
  </g>
4971
  </g>
4972
  <g id="text_27">
4973
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="213.633718" transform="rotate(-0 40.72 213.633718)">0.3</text>
4974
  </g>
4975
  </g>
4976
  <g id="ytick_4">
4977
  <g id="grid-y--5" class="grid grid-y">
4978
- <path d="M 47.72 126.904004 L 831.034248 126.904004 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4979
  </g>
4980
  <g id="line2d_28">
4981
  <g>
4982
- <use ns4:href="#m0fca2865ba" x="47.72" y="126.904004" style="stroke: #000000; stroke-width: 0.8" />
4983
  </g>
4984
  </g>
4985
  <g id="text_28">
4986
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="130.703223" transform="rotate(-0 40.72 130.703223)">0.4</text>
4987
  </g>
4988
  </g>
4989
  <g id="ytick_5">
4990
  <g id="grid-y--6" class="grid grid-y">
4991
- <path d="M 47.72 43.973509 L 831.034248 43.973509 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4992
  </g>
4993
  <g id="line2d_29">
4994
  <g>
4995
- <use ns4:href="#m0fca2865ba" x="47.72" y="43.973509" style="stroke: #000000; stroke-width: 0.8" />
4996
  </g>
4997
  </g>
4998
  <g id="text_29">
4999
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="47.772728" transform="rotate(-0 40.72 47.772728)">0.5</text>
5000
  </g>
5001
  </g>
5002
  <g id="label--y" class="ylabel">
@@ -5004,66 +5004,66 @@ Installed 37 packages in 314ms
5004
  </g>
5005
  </g>
5006
  <g id="series--hf-kernels-causal-conv1d" class="series">
5007
- <path d="M 83.325193 420.186871 L 114.286231 415.418367 L 145.247268 416.273381 L 176.208306 416.636616 L 207.169343 417.035512 L 238.130381 417.110979 L 269.091418 417.475043 L 300.052455 417.781886 L 331.013493 417.616025 L 361.97453 417.516509 L 392.935568 417.674077 L 423.896605 417.018926 L 454.857643 417.358941 L 485.81868 417.823352 L 516.779718 416.969997 L 547.740755 416.968338 L 578.701793 417.640904 L 609.66283 417.300889 L 640.623868 417.38382 L 671.584905 417.980919 L 702.545943 416.836479 L 733.50698 417.110149 L 764.468018 417.450164 L 795.429055 417.582853 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
5008
  <defs>
5009
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
5010
  </defs>
5011
  <g clip-path="url(#pb49fc4c8d2)">
5012
  <use ns4:href="#md7efaf3aec" x="83.325193" y="420.186871" style="fill: #1f77b4; stroke: #1f77b4" />
5013
- <use ns4:href="#md7efaf3aec" x="114.286231" y="415.418367" style="fill: #1f77b4; stroke: #1f77b4" />
5014
- <use ns4:href="#md7efaf3aec" x="145.247268" y="416.273381" style="fill: #1f77b4; stroke: #1f77b4" />
5015
- <use ns4:href="#md7efaf3aec" x="176.208306" y="416.636616" style="fill: #1f77b4; stroke: #1f77b4" />
5016
- <use ns4:href="#md7efaf3aec" x="207.169343" y="417.035512" style="fill: #1f77b4; stroke: #1f77b4" />
5017
- <use ns4:href="#md7efaf3aec" x="238.130381" y="417.110979" style="fill: #1f77b4; stroke: #1f77b4" />
5018
- <use ns4:href="#md7efaf3aec" x="269.091418" y="417.475043" style="fill: #1f77b4; stroke: #1f77b4" />
5019
- <use ns4:href="#md7efaf3aec" x="300.052455" y="417.781886" style="fill: #1f77b4; stroke: #1f77b4" />
5020
- <use ns4:href="#md7efaf3aec" x="331.013493" y="417.616025" style="fill: #1f77b4; stroke: #1f77b4" />
5021
- <use ns4:href="#md7efaf3aec" x="361.97453" y="417.516509" style="fill: #1f77b4; stroke: #1f77b4" />
5022
- <use ns4:href="#md7efaf3aec" x="392.935568" y="417.674077" style="fill: #1f77b4; stroke: #1f77b4" />
5023
- <use ns4:href="#md7efaf3aec" x="423.896605" y="417.018926" style="fill: #1f77b4; stroke: #1f77b4" />
5024
- <use ns4:href="#md7efaf3aec" x="454.857643" y="417.358941" style="fill: #1f77b4; stroke: #1f77b4" />
5025
- <use ns4:href="#md7efaf3aec" x="485.81868" y="417.823352" style="fill: #1f77b4; stroke: #1f77b4" />
5026
- <use ns4:href="#md7efaf3aec" x="516.779718" y="416.969997" style="fill: #1f77b4; stroke: #1f77b4" />
5027
- <use ns4:href="#md7efaf3aec" x="547.740755" y="416.968338" style="fill: #1f77b4; stroke: #1f77b4" />
5028
- <use ns4:href="#md7efaf3aec" x="578.701793" y="417.640904" style="fill: #1f77b4; stroke: #1f77b4" />
5029
- <use ns4:href="#md7efaf3aec" x="609.66283" y="417.300889" style="fill: #1f77b4; stroke: #1f77b4" />
5030
- <use ns4:href="#md7efaf3aec" x="640.623868" y="417.38382" style="fill: #1f77b4; stroke: #1f77b4" />
5031
- <use ns4:href="#md7efaf3aec" x="671.584905" y="417.980919" style="fill: #1f77b4; stroke: #1f77b4" />
5032
- <use ns4:href="#md7efaf3aec" x="702.545943" y="416.836479" style="fill: #1f77b4; stroke: #1f77b4" />
5033
- <use ns4:href="#md7efaf3aec" x="733.50698" y="417.110149" style="fill: #1f77b4; stroke: #1f77b4" />
5034
- <use ns4:href="#md7efaf3aec" x="764.468018" y="417.450164" style="fill: #1f77b4; stroke: #1f77b4" />
5035
- <use ns4:href="#md7efaf3aec" x="795.429055" y="417.582853" style="fill: #1f77b4; stroke: #1f77b4" />
5036
  </g>
5037
  </g>
5038
  <g id="series--torch-eager" class="series">
5039
- <path d="M 83.325193 398.251755 L 114.286231 389.253796 L 145.247268 390.058222 L 176.208306 391.19354 L 207.169343 391.293886 L 238.130381 390.514339 L 269.091418 392.14807 L 300.052455 391.725125 L 331.013493 390.70425 L 361.97453 390.564098 L 392.935568 326.689372 L 423.896605 322.484796 L 454.857643 390.539219 L 485.81868 392.305638 L 516.779718 392.105776 L 547.740755 390.821182 L 578.701793 391.343644 L 609.66283 390.687664 L 640.623868 392.031968 L 671.584905 390.522632 L 702.545943 380.546094 L 733.50698 375.794177 L 764.468018 57.235754 L 795.429055 45.608899 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
5040
  <defs>
5041
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
5042
  </defs>
5043
  <g clip-path="url(#pb49fc4c8d2)">
5044
- <use ns4:href="#m9b8c54d372" x="83.325193" y="398.251755" style="fill: #ff7f0e; stroke: #ff7f0e" />
5045
- <use ns4:href="#m9b8c54d372" x="114.286231" y="389.253796" style="fill: #ff7f0e; stroke: #ff7f0e" />
5046
- <use ns4:href="#m9b8c54d372" x="145.247268" y="390.058222" style="fill: #ff7f0e; stroke: #ff7f0e" />
5047
- <use ns4:href="#m9b8c54d372" x="176.208306" y="391.19354" style="fill: #ff7f0e; stroke: #ff7f0e" />
5048
- <use ns4:href="#m9b8c54d372" x="207.169343" y="391.293886" style="fill: #ff7f0e; stroke: #ff7f0e" />
5049
- <use ns4:href="#m9b8c54d372" x="238.130381" y="390.514339" style="fill: #ff7f0e; stroke: #ff7f0e" />
5050
- <use ns4:href="#m9b8c54d372" x="269.091418" y="392.14807" style="fill: #ff7f0e; stroke: #ff7f0e" />
5051
- <use ns4:href="#m9b8c54d372" x="300.052455" y="391.725125" style="fill: #ff7f0e; stroke: #ff7f0e" />
5052
- <use ns4:href="#m9b8c54d372" x="331.013493" y="390.70425" style="fill: #ff7f0e; stroke: #ff7f0e" />
5053
- <use ns4:href="#m9b8c54d372" x="361.97453" y="390.564098" style="fill: #ff7f0e; stroke: #ff7f0e" />
5054
- <use ns4:href="#m9b8c54d372" x="392.935568" y="326.689372" style="fill: #ff7f0e; stroke: #ff7f0e" />
5055
- <use ns4:href="#m9b8c54d372" x="423.896605" y="322.484796" style="fill: #ff7f0e; stroke: #ff7f0e" />
5056
- <use ns4:href="#m9b8c54d372" x="454.857643" y="390.539219" style="fill: #ff7f0e; stroke: #ff7f0e" />
5057
- <use ns4:href="#m9b8c54d372" x="485.81868" y="392.305638" style="fill: #ff7f0e; stroke: #ff7f0e" />
5058
- <use ns4:href="#m9b8c54d372" x="516.779718" y="392.105776" style="fill: #ff7f0e; stroke: #ff7f0e" />
5059
- <use ns4:href="#m9b8c54d372" x="547.740755" y="390.821182" style="fill: #ff7f0e; stroke: #ff7f0e" />
5060
- <use ns4:href="#m9b8c54d372" x="578.701793" y="391.343644" style="fill: #ff7f0e; stroke: #ff7f0e" />
5061
- <use ns4:href="#m9b8c54d372" x="609.66283" y="390.687664" style="fill: #ff7f0e; stroke: #ff7f0e" />
5062
- <use ns4:href="#m9b8c54d372" x="640.623868" y="392.031968" style="fill: #ff7f0e; stroke: #ff7f0e" />
5063
- <use ns4:href="#m9b8c54d372" x="671.584905" y="390.522632" style="fill: #ff7f0e; stroke: #ff7f0e" />
5064
- <use ns4:href="#m9b8c54d372" x="702.545943" y="380.546094" style="fill: #ff7f0e; stroke: #ff7f0e" />
5065
- <use ns4:href="#m9b8c54d372" x="733.50698" y="375.794177" style="fill: #ff7f0e; stroke: #ff7f0e" />
5066
- <use ns4:href="#m9b8c54d372" x="764.468018" y="57.235754" style="fill: #ff7f0e; stroke: #ff7f0e" />
5067
  <use ns4:href="#m9b8c54d372" x="795.429055" y="45.608899" style="fill: #ff7f0e; stroke: #ff7f0e" />
5068
  </g>
5069
  </g>
 
3889
  <rdf:RDF>
3890
  <ns2:Work>
3891
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
3892
+ <dc:date>2025-12-19T19:55:43.820965</dc:date>
3893
  <dc:format>image/svg+xml</dc:format>
3894
  <dc:creator>
3895
  <ns2:Agent>
 
4233
  <g id="matplotlib.axis_2">
4234
  <g id="ytick_1">
4235
  <g id="grid-y--2" class="grid grid-y">
4236
+ <path d="M 47.72 375.22161 L 831.034248 375.22161 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4237
  </g>
4238
  <g id="line2d_25">
4239
  <defs>
4240
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4241
  </defs>
4242
  <g>
4243
+ <use ns4:href="#m0fca2865ba" x="47.72" y="375.22161" style="stroke: #000000; stroke-width: 0.8" />
4244
  </g>
4245
  </g>
4246
  <g id="text_25">
4247
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="379.020828" transform="rotate(-0 40.72 379.020828)">0.1</text>
4248
  </g>
4249
  </g>
4250
  <g id="ytick_2">
4251
  <g id="grid-y--3" class="grid grid-y">
4252
+ <path d="M 47.72 292.730166 L 831.034248 292.730166 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4253
  </g>
4254
  <g id="line2d_26">
4255
  <g>
4256
+ <use ns4:href="#m0fca2865ba" x="47.72" y="292.730166" style="stroke: #000000; stroke-width: 0.8" />
4257
  </g>
4258
  </g>
4259
  <g id="text_26">
4260
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="296.529385" transform="rotate(-0 40.72 296.529385)">0.2</text>
4261
  </g>
4262
  </g>
4263
  <g id="ytick_3">
4264
  <g id="grid-y--4" class="grid grid-y">
4265
+ <path d="M 47.72 210.238722 L 831.034248 210.238722 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4266
  </g>
4267
  <g id="line2d_27">
4268
  <g>
4269
+ <use ns4:href="#m0fca2865ba" x="47.72" y="210.238722" style="stroke: #000000; stroke-width: 0.8" />
4270
  </g>
4271
  </g>
4272
  <g id="text_27">
4273
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="214.037941" transform="rotate(-0 40.72 214.037941)">0.3</text>
4274
  </g>
4275
  </g>
4276
  <g id="ytick_4">
4277
  <g id="grid-y--5" class="grid grid-y">
4278
+ <path d="M 47.72 127.747279 L 831.034248 127.747279 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4279
  </g>
4280
  <g id="line2d_28">
4281
  <g>
4282
+ <use ns4:href="#m0fca2865ba" x="47.72" y="127.747279" style="stroke: #000000; stroke-width: 0.8" />
4283
  </g>
4284
  </g>
4285
  <g id="text_28">
4286
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="131.546498" transform="rotate(-0 40.72 131.546498)">0.4</text>
4287
  </g>
4288
  </g>
4289
  <g id="ytick_5">
4290
  <g id="grid-y--6" class="grid grid-y">
4291
+ <path d="M 47.72 45.255835 L 831.034248 45.255835 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4292
  </g>
4293
  <g id="line2d_29">
4294
  <g>
4295
+ <use ns4:href="#m0fca2865ba" x="47.72" y="45.255835" style="stroke: #000000; stroke-width: 0.8" />
4296
  </g>
4297
  </g>
4298
  <g id="text_29">
4299
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="49.055054" transform="rotate(-0 40.72 49.055054)">0.5</text>
4300
  </g>
4301
  </g>
4302
  <g id="label--y" class="ylabel">
 
4304
  </g>
4305
  </g>
4306
  <g id="series--hf-kernels-causal-conv1d" class="series">
4307
+ <path d="M 83.325193 420.186871 L 114.286231 413.982689 L 145.247268 414.824927 L 176.208306 414.222739 L 207.169343 415.130145 L 238.130381 415.088899 L 269.091418 415.65809 L 300.052455 416.614991 L 331.013493 415.971558 L 361.97453 415.633343 L 392.935568 415.913814 L 423.896605 415.798326 L 454.857643 415.649841 L 485.81868 415.575599 L 516.779718 415.732333 L 547.740755 415.740582 L 578.701793 416.102719 L 609.66283 416.037551 L 640.623868 415.3273 L 671.584905 415.427114 L 702.545943 415.715009 L 733.50698 414.033009 L 764.468018 415.715834 L 795.429055 416.374941 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4308
  <defs>
4309
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4310
  </defs>
4311
  <g clip-path="url(#pb49fc4c8d2)">
4312
  <use ns4:href="#md7efaf3aec" x="83.325193" y="420.186871" style="fill: #1f77b4; stroke: #1f77b4" />
4313
+ <use ns4:href="#md7efaf3aec" x="114.286231" y="413.982689" style="fill: #1f77b4; stroke: #1f77b4" />
4314
+ <use ns4:href="#md7efaf3aec" x="145.247268" y="414.824927" style="fill: #1f77b4; stroke: #1f77b4" />
4315
+ <use ns4:href="#md7efaf3aec" x="176.208306" y="414.222739" style="fill: #1f77b4; stroke: #1f77b4" />
4316
+ <use ns4:href="#md7efaf3aec" x="207.169343" y="415.130145" style="fill: #1f77b4; stroke: #1f77b4" />
4317
+ <use ns4:href="#md7efaf3aec" x="238.130381" y="415.088899" style="fill: #1f77b4; stroke: #1f77b4" />
4318
+ <use ns4:href="#md7efaf3aec" x="269.091418" y="415.65809" style="fill: #1f77b4; stroke: #1f77b4" />
4319
+ <use ns4:href="#md7efaf3aec" x="300.052455" y="416.614991" style="fill: #1f77b4; stroke: #1f77b4" />
4320
+ <use ns4:href="#md7efaf3aec" x="331.013493" y="415.971558" style="fill: #1f77b4; stroke: #1f77b4" />
4321
+ <use ns4:href="#md7efaf3aec" x="361.97453" y="415.633343" style="fill: #1f77b4; stroke: #1f77b4" />
4322
+ <use ns4:href="#md7efaf3aec" x="392.935568" y="415.913814" style="fill: #1f77b4; stroke: #1f77b4" />
4323
+ <use ns4:href="#md7efaf3aec" x="423.896605" y="415.798326" style="fill: #1f77b4; stroke: #1f77b4" />
4324
+ <use ns4:href="#md7efaf3aec" x="454.857643" y="415.649841" style="fill: #1f77b4; stroke: #1f77b4" />
4325
+ <use ns4:href="#md7efaf3aec" x="485.81868" y="415.575599" style="fill: #1f77b4; stroke: #1f77b4" />
4326
+ <use ns4:href="#md7efaf3aec" x="516.779718" y="415.732333" style="fill: #1f77b4; stroke: #1f77b4" />
4327
+ <use ns4:href="#md7efaf3aec" x="547.740755" y="415.740582" style="fill: #1f77b4; stroke: #1f77b4" />
4328
+ <use ns4:href="#md7efaf3aec" x="578.701793" y="416.102719" style="fill: #1f77b4; stroke: #1f77b4" />
4329
+ <use ns4:href="#md7efaf3aec" x="609.66283" y="416.037551" style="fill: #1f77b4; stroke: #1f77b4" />
4330
+ <use ns4:href="#md7efaf3aec" x="640.623868" y="415.3273" style="fill: #1f77b4; stroke: #1f77b4" />
4331
+ <use ns4:href="#md7efaf3aec" x="671.584905" y="415.427114" style="fill: #1f77b4; stroke: #1f77b4" />
4332
+ <use ns4:href="#md7efaf3aec" x="702.545943" y="415.715009" style="fill: #1f77b4; stroke: #1f77b4" />
4333
+ <use ns4:href="#md7efaf3aec" x="733.50698" y="414.033009" style="fill: #1f77b4; stroke: #1f77b4" />
4334
+ <use ns4:href="#md7efaf3aec" x="764.468018" y="415.715834" style="fill: #1f77b4; stroke: #1f77b4" />
4335
+ <use ns4:href="#md7efaf3aec" x="795.429055" y="416.374941" style="fill: #1f77b4; stroke: #1f77b4" />
4336
  </g>
4337
  </g>
4338
  <g id="series--torch-eager" class="series">
4339
+ <path d="M 83.325193 398.136083 L 114.286231 388.072127 L 145.247268 387.931066 L 176.208306 388.641318 L 207.169343 388.731233 L 238.130381 388.599247 L 269.091418 390.224329 L 300.052455 389.556973 L 331.013493 389.416737 L 361.97453 388.913539 L 392.935568 326.565682 L 423.896605 322.943482 L 454.857643 389.985928 L 485.81868 389.721956 L 516.779718 389.763201 L 547.740755 389.433236 L 578.701793 390.282897 L 609.66283 388.765055 L 640.623868 388.979533 L 671.584905 389.457983 L 702.545943 380.730388 L 733.50698 375.598596 L 764.468018 57.586656 L 795.429055 45.608899 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4340
  <defs>
4341
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4342
  </defs>
4343
  <g clip-path="url(#pb49fc4c8d2)">
4344
+ <use ns4:href="#m9b8c54d372" x="83.325193" y="398.136083" style="fill: #ff7f0e; stroke: #ff7f0e" />
4345
+ <use ns4:href="#m9b8c54d372" x="114.286231" y="388.072127" style="fill: #ff7f0e; stroke: #ff7f0e" />
4346
+ <use ns4:href="#m9b8c54d372" x="145.247268" y="387.931066" style="fill: #ff7f0e; stroke: #ff7f0e" />
4347
+ <use ns4:href="#m9b8c54d372" x="176.208306" y="388.641318" style="fill: #ff7f0e; stroke: #ff7f0e" />
4348
+ <use ns4:href="#m9b8c54d372" x="207.169343" y="388.731233" style="fill: #ff7f0e; stroke: #ff7f0e" />
4349
+ <use ns4:href="#m9b8c54d372" x="238.130381" y="388.599247" style="fill: #ff7f0e; stroke: #ff7f0e" />
4350
+ <use ns4:href="#m9b8c54d372" x="269.091418" y="390.224329" style="fill: #ff7f0e; stroke: #ff7f0e" />
4351
+ <use ns4:href="#m9b8c54d372" x="300.052455" y="389.556973" style="fill: #ff7f0e; stroke: #ff7f0e" />
4352
+ <use ns4:href="#m9b8c54d372" x="331.013493" y="389.416737" style="fill: #ff7f0e; stroke: #ff7f0e" />
4353
+ <use ns4:href="#m9b8c54d372" x="361.97453" y="388.913539" style="fill: #ff7f0e; stroke: #ff7f0e" />
4354
+ <use ns4:href="#m9b8c54d372" x="392.935568" y="326.565682" style="fill: #ff7f0e; stroke: #ff7f0e" />
4355
+ <use ns4:href="#m9b8c54d372" x="423.896605" y="322.943482" style="fill: #ff7f0e; stroke: #ff7f0e" />
4356
+ <use ns4:href="#m9b8c54d372" x="454.857643" y="389.985928" style="fill: #ff7f0e; stroke: #ff7f0e" />
4357
+ <use ns4:href="#m9b8c54d372" x="485.81868" y="389.721956" style="fill: #ff7f0e; stroke: #ff7f0e" />
4358
+ <use ns4:href="#m9b8c54d372" x="516.779718" y="389.763201" style="fill: #ff7f0e; stroke: #ff7f0e" />
4359
+ <use ns4:href="#m9b8c54d372" x="547.740755" y="389.433236" style="fill: #ff7f0e; stroke: #ff7f0e" />
4360
+ <use ns4:href="#m9b8c54d372" x="578.701793" y="390.282897" style="fill: #ff7f0e; stroke: #ff7f0e" />
4361
+ <use ns4:href="#m9b8c54d372" x="609.66283" y="388.765055" style="fill: #ff7f0e; stroke: #ff7f0e" />
4362
+ <use ns4:href="#m9b8c54d372" x="640.623868" y="388.979533" style="fill: #ff7f0e; stroke: #ff7f0e" />
4363
+ <use ns4:href="#m9b8c54d372" x="671.584905" y="389.457983" style="fill: #ff7f0e; stroke: #ff7f0e" />
4364
+ <use ns4:href="#m9b8c54d372" x="702.545943" y="380.730388" style="fill: #ff7f0e; stroke: #ff7f0e" />
4365
+ <use ns4:href="#m9b8c54d372" x="733.50698" y="375.598596" style="fill: #ff7f0e; stroke: #ff7f0e" />
4366
+ <use ns4:href="#m9b8c54d372" x="764.468018" y="57.586656" style="fill: #ff7f0e; stroke: #ff7f0e" />
4367
  <use ns4:href="#m9b8c54d372" x="795.429055" y="45.608899" style="fill: #ff7f0e; stroke: #ff7f0e" />
4368
  </g>
4369
  </g>
 
4422
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
4423
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
4424
  </span> |
4425
+ Cell: combine | 4.64s
4426
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
4427
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
4428
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
 
4547
  torch_eager cuda_B2_D64_S512_W4 0.08 True
4548
  torch_eager cuda_B4_D2048_S128_W2 0.08 True
4549
  torch_eager cuda_B4_D2048_S128_W4 0.08 True
4550
+ torch_eager cuda_B4_D2048_S2048_W2 0.49 True
4551
  torch_eager cuda_B4_D2048_S2048_W4 0.50 True
4552
  torch_eager cuda_B4_D2048_S512_W2 0.09 True
4553
  torch_eager cuda_B4_D2048_S512_W4 0.10 True
 
4576
  <div class="uv-install-logs" id="uv-logs-combine">
4577
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4578
  <div class="uv-logs-content" style="display: none;">
4579
+ Installed 37 packages in 204ms
4580
  </div>
4581
  </div>
4582
  <div class="cell-artifacts">
 
4589
  <rdf:RDF>
4590
  <ns2:Work>
4591
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4592
+ <dc:date>2025-12-19T19:55:43.820965</dc:date>
4593
  <dc:format>image/svg+xml</dc:format>
4594
  <dc:creator>
4595
  <ns2:Agent>
 
4933
  <g id="matplotlib.axis_2">
4934
  <g id="ytick_1">
4935
  <g id="grid-y--2" class="grid grid-y">
4936
+ <path d="M 47.72 375.22161 L 831.034248 375.22161 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4937
  </g>
4938
  <g id="line2d_25">
4939
  <defs>
4940
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4941
  </defs>
4942
  <g>
4943
+ <use ns4:href="#m0fca2865ba" x="47.72" y="375.22161" style="stroke: #000000; stroke-width: 0.8" />
4944
  </g>
4945
  </g>
4946
  <g id="text_25">
4947
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="379.020828" transform="rotate(-0 40.72 379.020828)">0.1</text>
4948
  </g>
4949
  </g>
4950
  <g id="ytick_2">
4951
  <g id="grid-y--3" class="grid grid-y">
4952
+ <path d="M 47.72 292.730166 L 831.034248 292.730166 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4953
  </g>
4954
  <g id="line2d_26">
4955
  <g>
4956
+ <use ns4:href="#m0fca2865ba" x="47.72" y="292.730166" style="stroke: #000000; stroke-width: 0.8" />
4957
  </g>
4958
  </g>
4959
  <g id="text_26">
4960
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="296.529385" transform="rotate(-0 40.72 296.529385)">0.2</text>
4961
  </g>
4962
  </g>
4963
  <g id="ytick_3">
4964
  <g id="grid-y--4" class="grid grid-y">
4965
+ <path d="M 47.72 210.238722 L 831.034248 210.238722 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4966
  </g>
4967
  <g id="line2d_27">
4968
  <g>
4969
+ <use ns4:href="#m0fca2865ba" x="47.72" y="210.238722" style="stroke: #000000; stroke-width: 0.8" />
4970
  </g>
4971
  </g>
4972
  <g id="text_27">
4973
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="214.037941" transform="rotate(-0 40.72 214.037941)">0.3</text>
4974
  </g>
4975
  </g>
4976
  <g id="ytick_4">
4977
  <g id="grid-y--5" class="grid grid-y">
4978
+ <path d="M 47.72 127.747279 L 831.034248 127.747279 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4979
  </g>
4980
  <g id="line2d_28">
4981
  <g>
4982
+ <use ns4:href="#m0fca2865ba" x="47.72" y="127.747279" style="stroke: #000000; stroke-width: 0.8" />
4983
  </g>
4984
  </g>
4985
  <g id="text_28">
4986
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="131.546498" transform="rotate(-0 40.72 131.546498)">0.4</text>
4987
  </g>
4988
  </g>
4989
  <g id="ytick_5">
4990
  <g id="grid-y--6" class="grid grid-y">
4991
+ <path d="M 47.72 45.255835 L 831.034248 45.255835 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4992
  </g>
4993
  <g id="line2d_29">
4994
  <g>
4995
+ <use ns4:href="#m0fca2865ba" x="47.72" y="45.255835" style="stroke: #000000; stroke-width: 0.8" />
4996
  </g>
4997
  </g>
4998
  <g id="text_29">
4999
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="49.055054" transform="rotate(-0 40.72 49.055054)">0.5</text>
5000
  </g>
5001
  </g>
5002
  <g id="label--y" class="ylabel">
 
5004
  </g>
5005
  </g>
5006
  <g id="series--hf-kernels-causal-conv1d" class="series">
5007
+ <path d="M 83.325193 420.186871 L 114.286231 413.982689 L 145.247268 414.824927 L 176.208306 414.222739 L 207.169343 415.130145 L 238.130381 415.088899 L 269.091418 415.65809 L 300.052455 416.614991 L 331.013493 415.971558 L 361.97453 415.633343 L 392.935568 415.913814 L 423.896605 415.798326 L 454.857643 415.649841 L 485.81868 415.575599 L 516.779718 415.732333 L 547.740755 415.740582 L 578.701793 416.102719 L 609.66283 416.037551 L 640.623868 415.3273 L 671.584905 415.427114 L 702.545943 415.715009 L 733.50698 414.033009 L 764.468018 415.715834 L 795.429055 416.374941 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
5008
  <defs>
5009
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
5010
  </defs>
5011
  <g clip-path="url(#pb49fc4c8d2)">
5012
  <use ns4:href="#md7efaf3aec" x="83.325193" y="420.186871" style="fill: #1f77b4; stroke: #1f77b4" />
5013
+ <use ns4:href="#md7efaf3aec" x="114.286231" y="413.982689" style="fill: #1f77b4; stroke: #1f77b4" />
5014
+ <use ns4:href="#md7efaf3aec" x="145.247268" y="414.824927" style="fill: #1f77b4; stroke: #1f77b4" />
5015
+ <use ns4:href="#md7efaf3aec" x="176.208306" y="414.222739" style="fill: #1f77b4; stroke: #1f77b4" />
5016
+ <use ns4:href="#md7efaf3aec" x="207.169343" y="415.130145" style="fill: #1f77b4; stroke: #1f77b4" />
5017
+ <use ns4:href="#md7efaf3aec" x="238.130381" y="415.088899" style="fill: #1f77b4; stroke: #1f77b4" />
5018
+ <use ns4:href="#md7efaf3aec" x="269.091418" y="415.65809" style="fill: #1f77b4; stroke: #1f77b4" />
5019
+ <use ns4:href="#md7efaf3aec" x="300.052455" y="416.614991" style="fill: #1f77b4; stroke: #1f77b4" />
5020
+ <use ns4:href="#md7efaf3aec" x="331.013493" y="415.971558" style="fill: #1f77b4; stroke: #1f77b4" />
5021
+ <use ns4:href="#md7efaf3aec" x="361.97453" y="415.633343" style="fill: #1f77b4; stroke: #1f77b4" />
5022
+ <use ns4:href="#md7efaf3aec" x="392.935568" y="415.913814" style="fill: #1f77b4; stroke: #1f77b4" />
5023
+ <use ns4:href="#md7efaf3aec" x="423.896605" y="415.798326" style="fill: #1f77b4; stroke: #1f77b4" />
5024
+ <use ns4:href="#md7efaf3aec" x="454.857643" y="415.649841" style="fill: #1f77b4; stroke: #1f77b4" />
5025
+ <use ns4:href="#md7efaf3aec" x="485.81868" y="415.575599" style="fill: #1f77b4; stroke: #1f77b4" />
5026
+ <use ns4:href="#md7efaf3aec" x="516.779718" y="415.732333" style="fill: #1f77b4; stroke: #1f77b4" />
5027
+ <use ns4:href="#md7efaf3aec" x="547.740755" y="415.740582" style="fill: #1f77b4; stroke: #1f77b4" />
5028
+ <use ns4:href="#md7efaf3aec" x="578.701793" y="416.102719" style="fill: #1f77b4; stroke: #1f77b4" />
5029
+ <use ns4:href="#md7efaf3aec" x="609.66283" y="416.037551" style="fill: #1f77b4; stroke: #1f77b4" />
5030
+ <use ns4:href="#md7efaf3aec" x="640.623868" y="415.3273" style="fill: #1f77b4; stroke: #1f77b4" />
5031
+ <use ns4:href="#md7efaf3aec" x="671.584905" y="415.427114" style="fill: #1f77b4; stroke: #1f77b4" />
5032
+ <use ns4:href="#md7efaf3aec" x="702.545943" y="415.715009" style="fill: #1f77b4; stroke: #1f77b4" />
5033
+ <use ns4:href="#md7efaf3aec" x="733.50698" y="414.033009" style="fill: #1f77b4; stroke: #1f77b4" />
5034
+ <use ns4:href="#md7efaf3aec" x="764.468018" y="415.715834" style="fill: #1f77b4; stroke: #1f77b4" />
5035
+ <use ns4:href="#md7efaf3aec" x="795.429055" y="416.374941" style="fill: #1f77b4; stroke: #1f77b4" />
5036
  </g>
5037
  </g>
5038
  <g id="series--torch-eager" class="series">
5039
+ <path d="M 83.325193 398.136083 L 114.286231 388.072127 L 145.247268 387.931066 L 176.208306 388.641318 L 207.169343 388.731233 L 238.130381 388.599247 L 269.091418 390.224329 L 300.052455 389.556973 L 331.013493 389.416737 L 361.97453 388.913539 L 392.935568 326.565682 L 423.896605 322.943482 L 454.857643 389.985928 L 485.81868 389.721956 L 516.779718 389.763201 L 547.740755 389.433236 L 578.701793 390.282897 L 609.66283 388.765055 L 640.623868 388.979533 L 671.584905 389.457983 L 702.545943 380.730388 L 733.50698 375.598596 L 764.468018 57.586656 L 795.429055 45.608899 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
5040
  <defs>
5041
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
5042
  </defs>
5043
  <g clip-path="url(#pb49fc4c8d2)">
5044
+ <use ns4:href="#m9b8c54d372" x="83.325193" y="398.136083" style="fill: #ff7f0e; stroke: #ff7f0e" />
5045
+ <use ns4:href="#m9b8c54d372" x="114.286231" y="388.072127" style="fill: #ff7f0e; stroke: #ff7f0e" />
5046
+ <use ns4:href="#m9b8c54d372" x="145.247268" y="387.931066" style="fill: #ff7f0e; stroke: #ff7f0e" />
5047
+ <use ns4:href="#m9b8c54d372" x="176.208306" y="388.641318" style="fill: #ff7f0e; stroke: #ff7f0e" />
5048
+ <use ns4:href="#m9b8c54d372" x="207.169343" y="388.731233" style="fill: #ff7f0e; stroke: #ff7f0e" />
5049
+ <use ns4:href="#m9b8c54d372" x="238.130381" y="388.599247" style="fill: #ff7f0e; stroke: #ff7f0e" />
5050
+ <use ns4:href="#m9b8c54d372" x="269.091418" y="390.224329" style="fill: #ff7f0e; stroke: #ff7f0e" />
5051
+ <use ns4:href="#m9b8c54d372" x="300.052455" y="389.556973" style="fill: #ff7f0e; stroke: #ff7f0e" />
5052
+ <use ns4:href="#m9b8c54d372" x="331.013493" y="389.416737" style="fill: #ff7f0e; stroke: #ff7f0e" />
5053
+ <use ns4:href="#m9b8c54d372" x="361.97453" y="388.913539" style="fill: #ff7f0e; stroke: #ff7f0e" />
5054
+ <use ns4:href="#m9b8c54d372" x="392.935568" y="326.565682" style="fill: #ff7f0e; stroke: #ff7f0e" />
5055
+ <use ns4:href="#m9b8c54d372" x="423.896605" y="322.943482" style="fill: #ff7f0e; stroke: #ff7f0e" />
5056
+ <use ns4:href="#m9b8c54d372" x="454.857643" y="389.985928" style="fill: #ff7f0e; stroke: #ff7f0e" />
5057
+ <use ns4:href="#m9b8c54d372" x="485.81868" y="389.721956" style="fill: #ff7f0e; stroke: #ff7f0e" />
5058
+ <use ns4:href="#m9b8c54d372" x="516.779718" y="389.763201" style="fill: #ff7f0e; stroke: #ff7f0e" />
5059
+ <use ns4:href="#m9b8c54d372" x="547.740755" y="389.433236" style="fill: #ff7f0e; stroke: #ff7f0e" />
5060
+ <use ns4:href="#m9b8c54d372" x="578.701793" y="390.282897" style="fill: #ff7f0e; stroke: #ff7f0e" />
5061
+ <use ns4:href="#m9b8c54d372" x="609.66283" y="388.765055" style="fill: #ff7f0e; stroke: #ff7f0e" />
5062
+ <use ns4:href="#m9b8c54d372" x="640.623868" y="388.979533" style="fill: #ff7f0e; stroke: #ff7f0e" />
5063
+ <use ns4:href="#m9b8c54d372" x="671.584905" y="389.457983" style="fill: #ff7f0e; stroke: #ff7f0e" />
5064
+ <use ns4:href="#m9b8c54d372" x="702.545943" y="380.730388" style="fill: #ff7f0e; stroke: #ff7f0e" />
5065
+ <use ns4:href="#m9b8c54d372" x="733.50698" y="375.598596" style="fill: #ff7f0e; stroke: #ff7f0e" />
5066
+ <use ns4:href="#m9b8c54d372" x="764.468018" y="57.586656" style="fill: #ff7f0e; stroke: #ff7f0e" />
5067
  <use ns4:href="#m9b8c54d372" x="795.429055" y="45.608899" style="fill: #ff7f0e; stroke: #ff7f0e" />
5068
  </g>
5069
  </g>
deformable_detr/impls/artifacts/benchmark/deformable_detr.jsonl CHANGED
@@ -1,4 +1,4 @@
1
- {"ts": "2025-12-19T19:09:31Z", "run": "04da14902c784090beeb85878bc3f422", "impl": "hf_kernels_deformable_detr", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_Q100_H8_E256_L4_P4", "batch_size": 1, "num_queries": 100, "num_heads": 8, "embed_dim": 256, "num_levels": 4, "num_points": 4, "spatial_shapes": [[32, 32], [16, 16], [8, 8], [4, 4]], "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.03506000007291732, "p50": 0.03723999998328509, "p90": 0.03738000009434472, "mean": 0.036888000067847315, "iqr": 0.0004900000476482091, "raw_times": [0.03506000007291732, 0.03787000014199293, 0.03738000009434472, 0.036890000046696514, 0.03723999998328509], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04449099992598349, "peak_bytes": 2264064, "ok": true, "absmax": 7.152557373046875e-07, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax": 7.152557373046875e-07, "mae": 5.539113701047427e-08, "mse": 6.418638644407112e-15, "ref": "deformable_detr_torch"}, "err": null}
2
- {"ts": "2025-12-19T19:09:31Z", "run": "04da14902c784090beeb85878bc3f422", "impl": "hf_kernels_deformable_detr", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_Q300_H8_E256_L4_P4", "batch_size": 1, "num_queries": 300, "num_heads": 8, "embed_dim": 256, "num_levels": 4, "num_points": 4, "spatial_shapes": [[32, 32], [16, 16], [8, 8], [4, 4]], "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04158100000495324, "p50": 0.04245099989930168, "p90": 0.04307099993638985, "mean": 0.04284299998289498, "iqr": 0.0008199999683711212, "raw_times": [0.04158100000495324, 0.04245099989930168, 0.0448610001058114, 0.04307099993638985, 0.04225099996801873], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.046680999957970926, "peak_bytes": 4004864, "ok": true, "absmax": 7.152557373046875e-07, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax": 7.152557373046875e-07, "mae": 5.559346050176828e-08, "mse": 6.4289483059246175e-15, "ref": "deformable_detr_torch"}, "err": null}
3
- {"ts": "2025-12-19T19:09:31Z", "run": "04da14902c784090beeb85878bc3f422", "impl": "hf_kernels_deformable_detr", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_Q100_H8_E256_L4_P4", "batch_size": 2, "num_queries": 100, "num_heads": 8, "embed_dim": 256, "num_levels": 4, "num_points": 4, "spatial_shapes": [[32, 32], [16, 16], [8, 8], [4, 4]], "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04193000017949089, "p50": 0.04463999994186452, "p90": 0.0456009997833462, "mean": 0.044314399929135107, "iqr": 0.001990999862755416, "raw_times": [0.04193000017949089, 0.04463999994186452, 0.04579099982038315, 0.0456009997833462, 0.04360999992059078], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04799999987881165, "peak_bytes": 5459968, "ok": true, "absmax": 7.152557373046875e-07, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax": 7.152557373046875e-07, "mae": 5.555110149657594e-08, "mse": 6.418781369458724e-15, "ref": "deformable_detr_torch"}, "err": null}
4
- {"ts": "2025-12-19T19:09:31Z", "run": "04da14902c784090beeb85878bc3f422", "impl": "hf_kernels_deformable_detr", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_Q300_H8_E256_L4_P4", "batch_size": 2, "num_queries": 300, "num_heads": 8, "embed_dim": 256, "num_levels": 4, "num_points": 4, "spatial_shapes": [[32, 32], [16, 16], [8, 8], [4, 4]], "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.044340000158626935, "p50": 0.0453610000477056, "p90": 0.045860999989599804, "mean": 0.04539080005088181, "iqr": 0.000529999852005858, "raw_times": [0.045860999989599804, 0.045331000137593946, 0.044340000158626935, 0.04606099992088275, 0.0453610000477056], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04745000001094013, "peak_bytes": 8008704, "ok": true, "absmax": 7.152557373046875e-07, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax": 7.152557373046875e-07, "mae": 5.5905669427147586e-08, "mse": 6.485184940875199e-15, "ref": "deformable_detr_torch"}, "err": null}
 
1
+ {"ts": "2025-12-19T19:41:40Z", "run": "e6f28dfc458847cc825acfc40a1937dc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_Q100_H8_E256_L4_P4", "batch_size": 1, "num_queries": 100, "num_heads": 8, "embed_dim": 256, "num_levels": 4, "num_points": 4, "spatial_shapes": [[32, 32], [16, 16], [8, 8], [4, 4]], "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 3.4261789999732173, "p50": 3.457469000011315, "p90": 3.459429999963959, "mean": 3.4539671999937127, "iqr": 0.0022309999962999427, "raw_times": [3.459429999963959, 3.4695590000524135, 3.457469000011315, 3.457198999967659, 3.4261789999732173], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.4921710000048733, "peak_bytes": 5929472, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "deformable_detr_torch"}, "err": null}
2
+ {"ts": "2025-12-19T19:41:40Z", "run": "e6f28dfc458847cc825acfc40a1937dc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_Q300_H8_E256_L4_P4", "batch_size": 1, "num_queries": 300, "num_heads": 8, "embed_dim": 256, "num_levels": 4, "num_points": 4, "spatial_shapes": [[32, 32], [16, 16], [8, 8], [4, 4]], "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 4.229746999953932, "p50": 4.235096999991583, "p90": 4.236528000035378, "mean": 4.242877599995154, "iqr": 0.0027010000280824897, "raw_times": [4.236528000035378, 4.235096999991583, 4.279188999987582, 4.233827000007295, 4.229746999953932], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 4.256637999958457, "peak_bytes": 15161856, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "deformable_detr_torch"}, "err": null}
3
+ {"ts": "2025-12-19T19:41:41Z", "run": "e6f28dfc458847cc825acfc40a1937dc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_Q100_H8_E256_L4_P4", "batch_size": 2, "num_queries": 100, "num_heads": 8, "embed_dim": 256, "num_levels": 4, "num_points": 4, "spatial_shapes": [[32, 32], [16, 16], [8, 8], [4, 4]], "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 4.2112570000085725, "p50": 4.251798000041163, "p90": 4.262317999973675, "mean": 4.247635800004446, "iqr": 0.04195999997591571, "raw_times": [4.2112570000085725, 4.262317999973675, 4.220357999997759, 4.251798000041163, 4.292448000001059], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 4.28278900000123, "peak_bytes": 11958784, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "deformable_detr_torch"}, "err": null}
4
+ {"ts": "2025-12-19T19:41:41Z", "run": "e6f28dfc458847cc825acfc40a1937dc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_Q300_H8_E256_L4_P4", "batch_size": 2, "num_queries": 300, "num_heads": 8, "embed_dim": 256, "num_levels": 4, "num_points": 4, "spatial_shapes": [[32, 32], [16, 16], [8, 8], [4, 4]], "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 4.324839999981123, "p50": 4.342328999996425, "p90": 4.349751000006563, "mean": 4.3508561999942685, "iqr": 0.022982000018600957, "raw_times": [4.326768999987962, 4.410591999999269, 4.342328999996425, 4.324839999981123, 4.349751000006563], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 4.316158999984054, "peak_bytes": 30977024, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "deformable_detr_torch"}, "err": null}
deformable_detr/impls/cells/benchmark.py CHANGED
@@ -4,7 +4,6 @@
4
  # "numpy",
5
  # "torch==2.8.0",
6
  # "kernels-benchmark-tools",
7
- # "kernels",
8
  # ]
9
  #
10
  # [tool.uv.sources]
@@ -13,30 +12,107 @@
13
  import torch
14
  import sys
15
  from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
16
- from kernels import get_kernel
17
 
18
- # Load the deformable DETR kernel
19
- deformable_detr = get_kernel("kernels-community/deformable-detr")
20
 
21
-
22
- def hf_kernels_deformable_detr(
23
  value, spatial_shapes, level_start_index, sampling_locations, attention_weights, im2col_step=64
24
  ):
25
- """HuggingFace Kernels Deformable DETR Multi-Scale Deformable Attention"""
26
- return deformable_detr.ms_deform_attn_forward(
27
- value=value,
28
- spatial_shapes=spatial_shapes,
29
- level_start_index=level_start_index,
30
- sampling_loc=sampling_locations,
31
- attn_weight=attention_weights,
32
- im2col_step=im2col_step
33
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
 
36
  run_benchmark(
37
  kernel_type=KernelTypeEnum.DEFORMABLE_DETR,
38
- impl_name="hf_kernels_deformable_detr",
39
- impl_tags={"family": "hf-kernels", "backend": "cuda"},
40
- impl_func=hf_kernels_deformable_detr,
41
  dtype="float32",
42
  )
 
4
  # "numpy",
5
  # "torch==2.8.0",
6
  # "kernels-benchmark-tools",
 
7
  # ]
8
  #
9
  # [tool.uv.sources]
 
12
  import torch
13
  import sys
14
  from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
 
15
 
 
 
16
 
17
def torch_deformable_detr(
    value, spatial_shapes, level_start_index, sampling_locations, attention_weights, im2col_step=64
):
    """
    PyTorch native reference implementation of multi-scale deformable attention.

    Each sampling location is bilinearly interpolated from the four surrounding
    grid cells of its feature level (corners outside the grid contribute zero),
    scaled by its attention weight and summed over points and levels.

    Args:
        value: 4-D tensor (bs, sum(h * w over levels), num_heads, channels)
            holding the flattened feature maps of all levels back to back.
        spatial_shapes: 2-D tensor with one (h, w) row per level; must support
            ``.tolist()``.
        level_start_index: Unused here; accepted only so the signature matches
            the kernel implementation being benchmarked against.
        sampling_locations: 6-D tensor
            (bs, num_queries, num_heads, num_levels, num_points, 2) of
            normalized (x, y) coordinates; values outside [0, 1] are allowed
            and sample zeros once they fall fully outside the grid.
        attention_weights: tensor
            (bs, num_queries, num_heads, num_levels, num_points).
        im2col_step: Unused here; accepted for signature compatibility.

    Returns:
        Tensor of shape (bs, num_queries, num_heads * channels) with the
        dtype and device of ``value``.
    """
    bs, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape
    _, _, _, channels = value.shape

    output = torch.zeros(bs, num_queries, num_heads, channels, device=value.device, dtype=value.dtype)

    # Read the level shapes exactly once: every .tolist() on a device tensor
    # is a host round-trip, so doing it per level stalls the loop needlessly.
    level_shapes = spatial_shapes.tolist()

    # Split value tensor by levels
    value_list = value.split([int(h * w) for h, w in level_shapes], dim=1)

    # Batch/head index tensors are identical for every level, so build them a
    # single time. They broadcast against the (bs, num_queries, num_heads,
    # num_points) coordinate indices during advanced indexing below.
    batch_idx = torch.arange(bs, device=value.device).view(bs, 1, 1, 1)
    head_idx = torch.arange(num_heads, device=value.device).view(1, 1, num_heads, 1)

    # Iterate through each level (levels have different spatial sizes)
    for level_idx in range(num_levels):
        h, w = level_shapes[level_idx]
        value_level = value_list[level_idx]  # (bs, h*w, num_heads, channels)

        # Reshape to spatial grid: (bs, num_heads, channels, h, w)
        value_spatial = value_level.reshape(bs, h, w, num_heads, channels).permute(0, 3, 4, 1, 2)

        # Sampling locations and weights for this level
        # loc: (bs, num_queries, num_heads, num_points, 2)
        loc = sampling_locations[:, :, :, level_idx, :, :]
        # weight: (bs, num_queries, num_heads, num_points)
        weight = attention_weights[:, :, :, level_idx, :]

        # Convert normalized coordinates to pixel coordinates
        # loc[..., 0] is x (width), loc[..., 1] is y (height)
        x = loc[..., 0] * w - 0.5  # (bs, num_queries, num_heads, num_points)
        y = loc[..., 1] * h - 0.5

        # Integer corner coordinates for bilinear interpolation
        x0 = torch.floor(x).long()
        y0 = torch.floor(y).long()
        x1 = x0 + 1
        y1 = y0 + 1

        # Interpolation weights must come from the UNclamped corners;
        # clamping first would distort the fractional offsets at the border.
        lw = x - x0.float()  # fractional offset along x
        lh = y - y0.float()  # fractional offset along y
        hw = 1 - lw
        hh = 1 - lh

        # A sample counts only if it lies within one cell of the grid.
        # This is needed in addition to the corner masks: those check just
        # one bound per corner (e.g. top-left never checks the upper bound).
        valid = (y > -1) & (x > -1) & (y < h) & (x < w)

        # Per-corner in-bounds masks
        mask_tl = ((y0 >= 0) & (x0 >= 0)).unsqueeze(-1).float()
        mask_tr = ((y0 >= 0) & (x1 <= w - 1)).unsqueeze(-1).float()
        mask_bl = ((y1 <= h - 1) & (x0 >= 0)).unsqueeze(-1).float()
        mask_br = ((y1 <= h - 1) & (x1 <= w - 1)).unsqueeze(-1).float()

        # Clamp coordinates so the gather never indexes out of range; the
        # masks above zero out whatever the clamped lookups return.
        x0_clamped = torch.clamp(x0, 0, w - 1)
        x1_clamped = torch.clamp(x1, 0, w - 1)
        y0_clamped = torch.clamp(y0, 0, h - 1)
        y1_clamped = torch.clamp(y1, 0, h - 1)

        # Bilinear interpolation weights for all 4 corners
        w_tl = (hh * hw).unsqueeze(-1)  # top-left: (bs, num_queries, num_heads, num_points, 1)
        w_tr = (hh * lw).unsqueeze(-1)  # top-right
        w_bl = (lh * hw).unsqueeze(-1)  # bottom-left
        w_br = (lh * lw).unsqueeze(-1)  # bottom-right

        # Gather corner values with clamped indices, then apply corner masks.
        # Each gather yields (bs, num_queries, num_heads, num_points, channels).
        v_tl = value_spatial[batch_idx, head_idx, :, y0_clamped, x0_clamped] * mask_tl
        v_tr = value_spatial[batch_idx, head_idx, :, y0_clamped, x1_clamped] * mask_tr
        v_bl = value_spatial[batch_idx, head_idx, :, y1_clamped, x0_clamped] * mask_bl
        v_br = value_spatial[batch_idx, head_idx, :, y1_clamped, x1_clamped] * mask_br

        # Bilinear interpolation
        sampled = w_tl * v_tl + w_tr * v_tr + w_bl * v_bl + w_br * v_br

        # Drop samples whose location is entirely outside the grid
        sampled = sampled * valid.unsqueeze(-1).float()

        # Apply attention weights: (bs, num_queries, num_heads, num_points, 1)
        weighted_sampled = sampled * weight.unsqueeze(-1)

        # Sum over points: (bs, num_queries, num_heads, channels)
        output += weighted_sampled.sum(dim=3)

    # Flatten last two dimensions to match kernel output
    return output.reshape(bs, num_queries, num_heads * channels)
110
 
111
 
112
  run_benchmark(
113
  kernel_type=KernelTypeEnum.DEFORMABLE_DETR,
114
+ impl_name="torch_eager",
115
+ impl_tags={"family": "pytorch", "backend": "eager"},
116
+ impl_func=torch_deformable_detr,
117
  dtype="float32",
118
  )
deformable_detr/impls/hf_kernels_deformable_detr.html CHANGED
@@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content {
3888
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3890
  </span> |
3891
- Cell: nv | 0.25s
3892
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3894
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3905,7 +3905,7 @@ Cell: nv | 0.25s
3905
  </div>
3906
  </div>
3907
  <div id="output-nv" class="cell-output">
3908
- <div class="cell-stdout"><pre class="stdout-text">Fri Dec 19 18:55:49 2025
3909
  +-----------------------------------------------------------------------------------------+
3910
  | NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 |
3911
  +-----------------------------------------+------------------------+----------------------+
@@ -3914,7 +3914,7 @@ Cell: nv | 0.25s
3914
  | | | MIG M. |
3915
  |=========================================+========================+======================|
3916
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3917
- | N/A 30C P0 77W / 350W | 0MiB / 46068MiB | 11% Default |
3918
  | | | N/A |
3919
  +-----------------------------------------+------------------------+----------------------+
3920
 
@@ -3938,7 +3938,7 @@ Cell: nv | 0.25s
3938
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3939
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3940
  </span> |
3941
- Cell: benchmark | 4.73s
3942
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3943
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3944
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -4003,24 +4003,24 @@ PROFILE TRACE: hf_kernels_deformable_detr | cuda_B1_Q100_H8_E256_L4_P4
4003
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4004
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4005
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4006
- hf_kernels_deformable_detr 0.00% 0.000us 0.00% 0.000us 0.000us 199.744us 791.13% 199.744us 199.744us 1
4007
- hf_kernels_deformable_detr 5.99% 129.162us 99.60% 2.148ms 2.148ms 0.000us 0.00% 26.304us 26.304us 1
4008
- _deformable_detr_57c3d32::ms_deform_attn_forward 3.04% 65.452us 93.61% 2.019ms 672.874us 22.336us 88.47% 26.304us 8.768us 3
4009
- void ms_deformable_im2col_gpu_kernel&lt;float&gt;(int, flo... 0.00% 0.000us 0.00% 0.000us 0.000us 22.336us 88.47% 22.336us 7.445us 3
4010
- aten::zeros 0.91% 19.609us 87.96% 1.897ms 632.230us 0.000us 0.00% 3.968us 1.323us 3
4011
- aten::zero_ 0.66% 14.208us 85.42% 1.842ms 614.026us 0.000us 0.00% 3.968us 1.323us 3
4012
- aten::fill_ 1.51% 32.653us 84.76% 1.828ms 609.290us 2.912us 11.53% 3.968us 1.323us 3
4013
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.912us 11.53% 2.912us 0.971us 3
4014
- Activity Buffer Request 81.38% 1.755ms 81.38% 1.755ms 1.755ms 1.056us 4.18% 1.056us 1.056us 1
4015
- aten::empty 1.62% 35.003us 1.62% 35.003us 11.668us 0.000us 0.00% 0.000us 0.000us 3
4016
- cudaLaunchKernel 2.65% 57.140us 2.65% 57.140us 9.523us 0.000us 0.00% 0.000us 0.000us 6
4017
- aten::view 0.79% 17.140us 0.79% 17.140us 2.857us 0.000us 0.00% 0.000us 0.000us 6
4018
- aten::select 0.89% 19.100us 1.05% 22.620us 7.540us 0.000us 0.00% 0.000us 0.000us 3
4019
- aten::as_strided 0.16% 3.520us 0.16% 3.520us 1.173us 0.000us 0.00% 0.000us 0.000us 3
4020
- cudaDeviceSynchronize 0.40% 8.641us 0.40% 8.641us 8.641us 0.000us 0.00% 0.000us 0.000us 1
4021
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4022
- Self CPU time total: 2.156ms
4023
- Self CUDA time total: 25.248us
4024
 
4025
 
4026
 
@@ -4030,24 +4030,24 @@ PROFILE TRACE: hf_kernels_deformable_detr | cuda_B1_Q300_H8_E256_L4_P4
4030
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4031
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4032
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4033
- hf_kernels_deformable_detr 0.00% 0.000us 0.00% 0.000us 0.000us 136.418us 517.99% 136.418us 136.418us 1
4034
- hf_kernels_deformable_detr 5.06% 104.032us 99.72% 2.049ms 2.049ms 0.000us 0.00% 27.296us 27.296us 1
4035
- _deformable_detr_57c3d32::ms_deform_attn_forward 1.59% 32.619us 94.66% 1.945ms 648.480us 23.488us 89.19% 27.296us 9.099us 3
4036
- void ms_deformable_im2col_gpu_kernel&lt;float&gt;(int, flo... 0.00% 0.000us 0.00% 0.000us 0.000us 23.488us 89.19% 23.488us 7.829us 3
4037
- aten::zeros 0.44% 8.979us 91.25% 1.875ms 625.117us 0.000us 0.00% 3.808us 1.269us 3
4038
- aten::zero_ 0.41% 8.351us 89.97% 1.849ms 616.327us 0.000us 0.00% 3.808us 1.269us 3
4039
- aten::fill_ 1.21% 24.960us 89.56% 1.841ms 613.543us 2.848us 10.81% 3.808us 1.269us 3
4040
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.848us 10.81% 2.848us 0.949us 3
4041
- Activity Buffer Request 87.10% 1.790ms 87.10% 1.790ms 1.790ms 0.960us 3.65% 0.960us 0.960us 1
4042
- aten::empty 0.85% 17.391us 0.85% 17.391us 5.797us 0.000us 0.00% 0.000us 0.000us 3
4043
- cudaLaunchKernel 1.95% 40.151us 1.95% 40.151us 6.692us 0.000us 0.00% 0.000us 0.000us 6
4044
- aten::view 0.44% 9.121us 0.44% 9.121us 1.520us 0.000us 0.00% 0.000us 0.000us 6
4045
- aten::select 0.58% 11.950us 0.68% 13.920us 4.640us 0.000us 0.00% 0.000us 0.000us 3
4046
- aten::as_strided 0.10% 1.970us 0.10% 1.970us 0.657us 0.000us 0.00% 0.000us 0.000us 3
4047
- cudaDeviceSynchronize 0.28% 5.670us 0.28% 5.670us 5.670us 0.000us 0.00% 0.000us 0.000us 1
4048
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4049
- Self CPU time total: 2.055ms
4050
- Self CUDA time total: 26.336us
4051
 
4052
 
4053
 
@@ -4057,24 +4057,24 @@ PROFILE TRACE: hf_kernels_deformable_detr | cuda_B2_Q100_H8_E256_L4_P4
4057
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4058
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4059
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4060
- hf_kernels_deformable_detr 0.00% 0.000us 0.00% 0.000us 0.000us 138.431us 541.44% 138.431us 138.431us 1
4061
- hf_kernels_deformable_detr 4.88% 96.691us 99.73% 1.977ms 1.977ms 0.000us 0.00% 26.495us 26.495us 1
4062
- _deformable_detr_57c3d32::ms_deform_attn_forward 1.70% 33.709us 94.86% 1.881ms 626.893us 22.783us 89.11% 26.495us 8.832us 3
4063
- void ms_deformable_im2col_gpu_kernel&lt;float&gt;(int, flo... 0.00% 0.000us 0.00% 0.000us 0.000us 22.783us 89.11% 22.783us 7.594us 3
4064
- aten::zeros 0.43% 8.511us 91.28% 1.810ms 603.293us 0.000us 0.00% 3.712us 1.237us 3
4065
- aten::zero_ 0.42% 8.319us 90.02% 1.785ms 594.946us 0.000us 0.00% 3.712us 1.237us 3
4066
- aten::fill_ 1.36% 26.920us 89.60% 1.777ms 592.173us 2.784us 10.89% 3.712us 1.237us 3
4067
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.784us 10.89% 2.784us 0.928us 3
4068
- Activity Buffer Request 86.99% 1.725ms 86.99% 1.725ms 1.725ms 0.928us 3.63% 0.928us 0.928us 1
4069
- aten::empty 0.83% 16.530us 0.83% 16.530us 5.510us 0.000us 0.00% 0.000us 0.000us 3
4070
- cudaLaunchKernel 1.99% 39.553us 1.99% 39.553us 6.592us 0.000us 0.00% 0.000us 0.000us 6
4071
- aten::view 0.47% 9.270us 0.47% 9.270us 1.545us 0.000us 0.00% 0.000us 0.000us 6
4072
- aten::select 0.56% 11.070us 0.66% 13.141us 4.380us 0.000us 0.00% 0.000us 0.000us 3
4073
- aten::as_strided 0.10% 2.071us 0.10% 2.071us 0.690us 0.000us 0.00% 0.000us 0.000us 3
4074
- cudaDeviceSynchronize 0.27% 5.300us 0.27% 5.300us 5.300us 0.000us 0.00% 0.000us 0.000us 1
4075
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4076
- Self CPU time total: 1.983ms
4077
- Self CUDA time total: 25.567us
4078
 
4079
 
4080
 
@@ -4084,43 +4084,42 @@ PROFILE TRACE: hf_kernels_deformable_detr | cuda_B2_Q300_H8_E256_L4_P4
4084
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4085
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4086
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4087
- hf_kernels_deformable_detr 0.00% 0.000us 0.00% 0.000us 0.000us 142.528us 304.46% 142.528us 142.528us 1
4088
- hf_kernels_deformable_detr 4.36% 98.391us 99.78% 2.253ms 2.253ms 0.000us 0.00% 47.838us 47.838us 1
4089
- _deformable_detr_57c3d32::ms_deform_attn_forward 1.43% 32.311us 95.42% 2.155ms 718.335us 43.743us 93.44% 47.838us 15.946us 3
4090
- void ms_deformable_im2col_gpu_kernel&lt;float&gt;(int, flo... 0.00% 0.000us 0.00% 0.000us 0.000us 43.743us 93.44% 43.743us 14.581us 3
4091
- aten::zeros 0.35% 7.869us 92.42% 2.087ms 695.715us 0.000us 0.00% 4.095us 1.365us 3
4092
- aten::zero_ 0.37% 8.381us 91.32% 2.062ms 687.455us 0.000us 0.00% 4.095us 1.365us 3
4093
- aten::fill_ 1.13% 25.460us 90.95% 2.054ms 684.661us 3.071us 6.56% 4.095us 1.365us 3
4094
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 3.071us 6.56% 3.071us 1.024us 3
4095
- Activity Buffer Request 79.30% 1.791ms 79.30% 1.791ms 1.791ms 1.024us 2.19% 1.024us 1.024us 1
4096
- aten::empty 0.75% 16.910us 0.75% 16.910us 5.637us 0.000us 0.00% 0.000us 0.000us 3
4097
- cudaLaunchKernel 11.13% 251.265us 11.13% 251.265us 41.878us 0.000us 0.00% 0.000us 0.000us 6
4098
- aten::view 0.41% 9.300us 0.41% 9.300us 1.550us 0.000us 0.00% 0.000us 0.000us 6
4099
- aten::select 0.48% 10.740us 0.56% 12.720us 4.240us 0.000us 0.00% 0.000us 0.000us 3
4100
- aten::as_strided 0.09% 1.980us 0.09% 1.980us 0.660us 0.000us 0.00% 0.000us 0.000us 3
4101
- cudaDeviceSynchronize 0.22% 4.929us 0.22% 4.929us 4.929us 0.000us 0.00% 0.000us 0.000us 1
4102
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4103
- Self CPU time total: 2.258ms
4104
- Self CUDA time total: 46.814us
4105
 
4106
 
4107
  impl wl p50(ms) ok
4108
  hf_kernels_deformable_detr cuda_B1_Q100_H8_E256_L4_P4 0.04 True
4109
- hf_kernels_deformable_detr cuda_B1_Q300_H8_E256_L4_P4 0.04 True
4110
- hf_kernels_deformable_detr cuda_B2_Q100_H8_E256_L4_P4 0.04 True
4111
  hf_kernels_deformable_detr cuda_B2_Q300_H8_E256_L4_P4 0.05 True
4112
  </pre></div>
4113
  <div class="uv-install-logs" id="uv-logs-benchmark">
4114
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4115
  <div class="uv-logs-content" style="display: none;">
4116
- Installed 14 packages in 11ms
4117
  </div>
4118
  </div>
4119
- <div class="cell-stderr">Fetching 7 files: 0%| | 0/7 [00:00&lt;?, ?it/s]Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.
4120
-
4121
- Fetching 7 files: 14%|█▍ | 1/7 [00:00&lt;00:02, 2.99it/s]
4122
- Fetching 7 files: 71%|███████▏ | 5/7 [00:00&lt;00:00, 9.51it/s]
4123
- Fetching 7 files: 100%|██████████| 7/7 [00:00&lt;00:00, 11.76it/s]</div>
4124
  <div class="cell-artifacts">
4125
  <h4>Artifacts:</h4>
4126
  <a href="artifacts/benchmark/deformable_detr.jsonl" class="artifact" target="_blank">deformable_detr.jsonl</a>
 
3888
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3890
  </span> |
3891
+ Cell: nv | 0.28s
3892
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3894
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
 
3905
  </div>
3906
  </div>
3907
  <div id="output-nv" class="cell-output">
3908
+ <div class="cell-stdout"><pre class="stdout-text">Fri Dec 19 19:41:27 2025
3909
  +-----------------------------------------------------------------------------------------+
3910
  | NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 |
3911
  +-----------------------------------------+------------------------+----------------------+
 
3914
  | | | MIG M. |
3915
  |=========================================+========================+======================|
3916
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3917
+ | N/A 32C P0 120W / 350W | 0MiB / 46068MiB | 92% Default |
3918
  | | | N/A |
3919
  +-----------------------------------------+------------------------+----------------------+
3920
 
 
3938
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3939
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3940
  </span> |
3941
+ Cell: benchmark | 8.66s
3942
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3943
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3944
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
4003
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4004
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4005
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4006
+ hf_kernels_deformable_detr 0.00% 0.000us 0.00% 0.000us 0.000us 191.775us 760.50% 191.775us 191.775us 1
4007
+ hf_kernels_deformable_detr 6.53% 139.932us 99.65% 2.134ms 2.134ms 0.000us 0.00% 26.274us 26.274us 1
4008
+ _deformable_detr_57c3d32::ms_deform_attn_forward 3.14% 67.151us 93.12% 1.994ms 664.639us 22.336us 88.58% 26.274us 8.758us 3
4009
+ void ms_deformable_im2col_gpu_kernel&lt;float&gt;(int, flo... 0.00% 0.000us 0.00% 0.000us 0.000us 22.336us 88.58% 22.336us 7.445us 3
4010
+ aten::zeros 0.92% 19.641us 87.16% 1.866ms 622.148us 0.000us 0.00% 3.938us 1.313us 3
4011
+ aten::zero_ 0.66% 14.050us 84.59% 1.811ms 603.774us 0.000us 0.00% 3.938us 1.313us 3
4012
+ aten::fill_ 1.47% 31.401us 83.93% 1.797ms 599.090us 2.881us 11.42% 3.938us 1.313us 3
4013
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.881us 11.42% 2.881us 0.960us 3
4014
+ Activity Buffer Request 80.47% 1.723ms 80.47% 1.723ms 1.723ms 1.057us 4.19% 1.057us 1.057us 1
4015
+ aten::empty 1.66% 35.481us 1.66% 35.481us 11.827us 0.000us 0.00% 0.000us 0.000us 3
4016
+ cudaLaunchKernel 2.78% 59.511us 2.78% 59.511us 9.919us 0.000us 0.00% 0.000us 0.000us 6
4017
+ aten::view 0.84% 17.931us 0.84% 17.931us 2.989us 0.000us 0.00% 0.000us 0.000us 6
4018
+ aten::select 1.00% 21.440us 1.20% 25.621us 8.540us 0.000us 0.00% 0.000us 0.000us 3
4019
+ aten::as_strided 0.20% 4.181us 0.20% 4.181us 1.394us 0.000us 0.00% 0.000us 0.000us 3
4020
+ cudaDeviceSynchronize 0.35% 7.461us 0.35% 7.461us 7.461us 0.000us 0.00% 0.000us 0.000us 1
4021
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4022
+ Self CPU time total: 2.141ms
4023
+ Self CUDA time total: 25.217us
4024
 
4025
 
4026
 
 
4030
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4031
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4032
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4033
+ hf_kernels_deformable_detr 0.00% 0.000us 0.00% 0.000us 0.000us 140.638us 537.96% 140.638us 140.638us 1
4034
+ hf_kernels_deformable_detr 3.75% 74.302us 99.73% 1.975ms 1.975ms 0.000us 0.00% 27.071us 27.071us 1
4035
+ _deformable_detr_57c3d32::ms_deform_attn_forward 1.66% 32.812us 95.98% 1.901ms 633.661us 23.327us 89.23% 27.071us 9.024us 3
4036
+ void ms_deformable_im2col_gpu_kernel&lt;float&gt;(int, flo... 0.00% 0.000us 0.00% 0.000us 0.000us 23.327us 89.23% 23.327us 7.776us 3
4037
+ aten::zeros 0.45% 8.890us 92.43% 1.831ms 610.224us 0.000us 0.00% 3.744us 1.248us 3
4038
+ aten::zero_ 0.40% 7.970us 91.07% 1.804ms 601.294us 0.000us 0.00% 3.744us 1.248us 3
4039
+ aten::fill_ 1.26% 24.969us 90.67% 1.796ms 598.637us 2.816us 10.77% 3.744us 1.248us 3
4040
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.816us 10.77% 2.816us 0.939us 3
4041
+ Activity Buffer Request 88.11% 1.745ms 88.11% 1.745ms 1.745ms 0.928us 3.55% 0.928us 0.928us 1
4042
+ aten::empty 0.90% 17.900us 0.90% 17.900us 5.967us 0.000us 0.00% 0.000us 0.000us 3
4043
+ cudaLaunchKernel 2.05% 40.542us 2.05% 40.542us 6.757us 0.000us 0.00% 0.000us 0.000us 6
4044
+ aten::view 0.46% 9.070us 0.46% 9.070us 1.512us 0.000us 0.00% 0.000us 0.000us 6
4045
+ aten::select 0.58% 11.410us 0.69% 13.720us 4.573us 0.000us 0.00% 0.000us 0.000us 3
4046
+ aten::as_strided 0.12% 2.310us 0.12% 2.310us 0.770us 0.000us 0.00% 0.000us 0.000us 3
4047
+ cudaDeviceSynchronize 0.27% 5.400us 0.27% 5.400us 5.400us 0.000us 0.00% 0.000us 0.000us 1
4048
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4049
+ Self CPU time total: 1.981ms
4050
+ Self CUDA time total: 26.143us
4051
 
4052
 
4053
 
 
4057
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4058
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4059
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4060
+ hf_kernels_deformable_detr 0.00% 0.000us 0.00% 0.000us 0.000us 139.424us 546.70% 139.424us 139.424us 1
4061
+ hf_kernels_deformable_detr 3.45% 67.322us 99.73% 1.947ms 1.947ms 0.000us 0.00% 26.463us 26.463us 1
4062
+ _deformable_detr_57c3d32::ms_deform_attn_forward 1.76% 34.371us 96.28% 1.880ms 626.621us 22.688us 88.96% 26.463us 8.821us 3
4063
+ void ms_deformable_im2col_gpu_kernel&lt;float&gt;(int, flo... 0.00% 0.000us 0.00% 0.000us 0.000us 22.688us 88.96% 22.688us 7.563us 3
4064
+ aten::zeros 0.42% 8.159us 92.58% 1.808ms 602.514us 0.000us 0.00% 3.775us 1.258us 3
4065
+ aten::zero_ 0.40% 7.880us 91.30% 1.783ms 594.177us 0.000us 0.00% 3.775us 1.258us 3
4066
+ aten::fill_ 1.36% 26.500us 90.89% 1.775ms 591.551us 2.815us 11.04% 3.775us 1.258us 3
4067
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.815us 11.04% 2.815us 0.938us 3
4068
+ Activity Buffer Request 88.22% 1.722ms 88.22% 1.722ms 1.722ms 0.960us 3.76% 0.960us 0.960us 1
4069
+ aten::empty 0.86% 16.851us 0.86% 16.851us 5.617us 0.000us 0.00% 0.000us 0.000us 3
4070
+ cudaLaunchKernel 2.08% 40.632us 2.08% 40.632us 6.772us 0.000us 0.00% 0.000us 0.000us 6
4071
+ aten::view 0.52% 10.080us 0.52% 10.080us 1.680us 0.000us 0.00% 0.000us 0.000us 6
4072
+ aten::select 0.55% 10.661us 0.66% 12.960us 4.320us 0.000us 0.00% 0.000us 0.000us 3
4073
+ aten::as_strided 0.12% 2.299us 0.12% 2.299us 0.766us 0.000us 0.00% 0.000us 0.000us 3
4074
+ cudaDeviceSynchronize 0.27% 5.270us 0.27% 5.270us 5.270us 0.000us 0.00% 0.000us 0.000us 1
4075
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4076
+ Self CPU time total: 1.952ms
4077
+ Self CUDA time total: 25.503us
4078
 
4079
 
4080
 
 
4084
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4085
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4086
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4087
+ hf_kernels_deformable_detr 0.00% 0.000us 0.00% 0.000us 0.000us 144.383us 310.53% 144.383us 144.383us 1
4088
+ hf_kernels_deformable_detr 3.20% 70.383us 99.77% 2.197ms 2.197ms 0.000us 0.00% 47.520us 47.520us 1
4089
+ _deformable_detr_57c3d32::ms_deform_attn_forward 1.51% 33.251us 96.57% 2.127ms 709.009us 43.392us 93.32% 47.520us 15.840us 3
4090
+ void ms_deformable_im2col_gpu_kernel&lt;float&gt;(int, flo... 0.00% 0.000us 0.00% 0.000us 0.000us 43.392us 93.32% 43.392us 14.464us 3
4091
+ aten::zeros 0.36% 7.853us 93.39% 2.057ms 685.609us 0.000us 0.00% 4.128us 1.376us 3
4092
+ aten::zero_ 0.36% 8.030us 92.24% 2.032ms 677.202us 0.000us 0.00% 4.128us 1.376us 3
4093
+ aten::fill_ 1.13% 24.791us 91.88% 2.024ms 674.525us 3.104us 6.68% 4.128us 1.376us 3
4094
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 3.104us 6.68% 3.104us 1.035us 3
4095
+ Activity Buffer Request 79.51% 1.751ms 79.51% 1.751ms 1.751ms 1.024us 2.20% 1.024us 1.024us 1
4096
+ aten::empty 0.79% 17.369us 0.79% 17.369us 5.790us 0.000us 0.00% 0.000us 0.000us 3
4097
+ cudaLaunchKernel 11.88% 261.685us 11.88% 261.685us 43.614us 0.000us 0.00% 0.000us 0.000us 6
4098
+ aten::view 0.43% 9.529us 0.43% 9.529us 1.588us 0.000us 0.00% 0.000us 0.000us 6
4099
+ aten::select 0.50% 10.960us 0.60% 13.220us 4.407us 0.000us 0.00% 0.000us 0.000us 3
4100
+ aten::as_strided 0.10% 2.260us 0.10% 2.260us 0.753us 0.000us 0.00% 0.000us 0.000us 3
4101
+ cudaDeviceSynchronize 0.23% 5.101us 0.23% 5.101us 5.101us 0.000us 0.00% 0.000us 0.000us 1
4102
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4103
+ Self CPU time total: 2.203ms
4104
+ Self CUDA time total: 46.496us
4105
 
4106
 
4107
  impl wl p50(ms) ok
4108
  hf_kernels_deformable_detr cuda_B1_Q100_H8_E256_L4_P4 0.04 True
4109
+ hf_kernels_deformable_detr cuda_B1_Q300_H8_E256_L4_P4 0.05 True
4110
+ hf_kernels_deformable_detr cuda_B2_Q100_H8_E256_L4_P4 0.05 True
4111
  hf_kernels_deformable_detr cuda_B2_Q300_H8_E256_L4_P4 0.05 True
4112
  </pre></div>
4113
  <div class="uv-install-logs" id="uv-logs-benchmark">
4114
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4115
  <div class="uv-logs-content" style="display: none;">
4116
+ Installed 51 packages in 320ms
4117
  </div>
4118
  </div>
4119
+ <div class="cell-stderr">Fetching 7 files: 0%| | 0/7 [00:00&lt;?, ?it/s]
4120
+ Fetching 7 files: 14%|█▍ | 1/7 [00:00&lt;00:00, 8.32it/s]
4121
+ Fetching 7 files: 71%|███████▏ | 5/7 [00:00&lt;00:00, 7.49it/s]
4122
+ Fetching 7 files: 100%|██████████| 7/7 [00:00&lt;00:00, 10.54it/s]</div>
 
4123
  <div class="cell-artifacts">
4124
  <h4>Artifacts:</h4>
4125
  <a href="artifacts/benchmark/deformable_detr.jsonl" class="artifact" target="_blank">deformable_detr.jsonl</a>
deformable_detr/impls/torch_deformable_detr.html CHANGED
@@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content {
3888
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3890
  </span> |
3891
- Cell: nv | 0.25s
3892
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3894
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3904,7 +3904,7 @@ Cell: nv | 0.25s
3904
  </div>
3905
  </div>
3906
  <div id="output-nv" class="cell-output">
3907
- <div class="cell-stdout"><pre class="stdout-text">Fri Dec 19 18:55:49 2025
3908
  +-----------------------------------------------------------------------------------------+
3909
  | NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 |
3910
  +-----------------------------------------+------------------------+----------------------+
@@ -3913,7 +3913,7 @@ Cell: nv | 0.25s
3913
  | | | MIG M. |
3914
  |=========================================+========================+======================|
3915
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3916
- | N/A 30C P0 77W / 350W | 0MiB / 46068MiB | 11% Default |
3917
  | | | N/A |
3918
  +-----------------------------------------+------------------------+----------------------+
3919
 
@@ -3935,9 +3935,9 @@ Cell: nv | 0.25s
3935
  <span class="collapse-indicators">
3936
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3937
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3938
- <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3939
  </span> |
3940
- Cell: benchmark | 9.12s
3941
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3942
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3943
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -4077,29 +4077,29 @@ PROFILE TRACE: torch_eager | cuda_B1_Q100_H8_E256_L4_P4
4077
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4078
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4079
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4080
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 19.928ms 1345.17% 19.928ms 19.928ms 1
4081
- torch_eager 20.03% 4.383ms 99.97% 21.877ms 21.877ms 0.000us 0.00% 1.483ms 1.483ms 1
4082
- aten::index 4.57% 999.946us 16.87% 3.693ms 76.930us 235.999us 15.93% 369.535us 7.699us 48
4083
- aten::copy_ 4.70% 1.029ms 11.50% 2.517ms 11.491us 366.142us 24.72% 366.142us 1.672us 219
4084
- aten::mul 5.86% 1.283ms 10.10% 2.209ms 11.507us 293.927us 19.84% 293.927us 1.531us 192
4085
- void at::native::index_elementwise_kernel&lt;128, 4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 235.999us 15.93% 235.999us 4.917us 48
4086
- aten::to 0.58% 126.416us 10.88% 2.380ms 13.921us 0.000us 0.00% 232.606us 1.360us 171
4087
- aten::_to_copy 1.91% 417.236us 10.30% 2.254ms 18.325us 0.000us 0.00% 232.606us 1.891us 123
4088
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 202.308us 13.66% 202.308us 1.686us 120
4089
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 167.963us 11.34% 167.963us 2.000us 84
4090
- aten::contiguous 0.37% 80.417us 8.79% 1.925ms 20.049us 0.000us 0.00% 133.536us 1.391us 96
4091
- aten::clone 0.80% 175.766us 8.43% 1.844ms 19.211us 0.000us 0.00% 133.536us 1.391us 96
4092
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 133.536us 9.01% 133.536us 1.391us 96
4093
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 115.352us 7.79% 115.352us 1.202us 96
4094
- aten::__and__ 0.45% 97.450us 4.50% 984.021us 11.715us 0.000us 0.00% 98.725us 1.175us 84
4095
- aten::bitwise_and 2.51% 548.975us 4.05% 886.571us 10.554us 98.725us 6.66% 98.725us 1.175us 84
4096
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 98.725us 6.66% 98.725us 1.175us 84
4097
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 86.111us 5.81% 86.111us 1.196us 72
4098
- aten::sub 2.21% 483.704us 3.73% 817.012us 11.347us 79.134us 5.34% 79.134us 1.099us 72
4099
- aten::add 1.64% 359.872us 2.73% 597.608us 9.960us 74.367us 5.02% 74.367us 1.239us 60
4100
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4101
- Self CPU time total: 21.884ms
4102
- Self CUDA time total: 1.481ms
4103
 
4104
 
4105
 
@@ -4109,29 +4109,29 @@ PROFILE TRACE: torch_eager | cuda_B1_Q300_H8_E256_L4_P4
4109
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4110
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4111
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4112
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 18.701ms 1173.82% 18.701ms 18.701ms 1
4113
- torch_eager 19.86% 4.084ms 99.94% 20.549ms 20.549ms 0.000us 0.00% 1.594ms 1.594ms 1
4114
- aten::index 4.47% 919.982us 16.50% 3.393ms 70.681us 250.075us 15.70% 381.947us 7.957us 48
4115
- aten::copy_ 4.90% 1.007ms 11.73% 2.411ms 11.009us 365.571us 22.95% 365.571us 1.669us 219
4116
- aten::mul 5.89% 1.211ms 10.29% 2.116ms 11.019us 357.953us 22.47% 357.953us 1.864us 192
4117
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 266.175us 16.71% 266.175us 2.218us 120
4118
- void at::native::index_elementwise_kernel&lt;128, 4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 250.075us 15.70% 250.075us 5.210us 48
4119
- aten::to 0.56% 115.808us 10.96% 2.254ms 13.183us 0.000us 0.00% 233.699us 1.367us 171
4120
- aten::_to_copy 1.83% 375.992us 10.40% 2.138ms 17.386us 0.000us 0.00% 233.699us 1.900us 123
4121
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 169.346us 10.63% 169.346us 2.016us 84
4122
- aten::contiguous 0.37% 76.815us 8.72% 1.793ms 18.680us 0.000us 0.00% 131.872us 1.374us 96
4123
- aten::clone 0.79% 162.290us 8.35% 1.716ms 17.880us 0.000us 0.00% 131.872us 1.374us 96
4124
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 131.872us 8.28% 131.872us 1.374us 96
4125
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 117.600us 7.38% 117.600us 1.225us 96
4126
- aten::__and__ 0.42% 86.722us 4.57% 939.170us 11.181us 0.000us 0.00% 105.348us 1.254us 84
4127
- aten::bitwise_and 2.53% 520.363us 4.15% 852.448us 10.148us 105.348us 6.61% 105.348us 1.254us 84
4128
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 105.348us 6.61% 105.348us 1.254us 84
4129
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 104.196us 6.54% 104.196us 1.447us 72
4130
- aten::add 1.65% 339.069us 2.79% 573.170us 9.553us 91.619us 5.75% 91.619us 1.527us 60
4131
- aten::sub 2.16% 443.591us 3.72% 765.420us 10.631us 80.447us 5.05% 80.447us 1.117us 72
4132
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4133
- Self CPU time total: 20.561ms
4134
- Self CUDA time total: 1.593ms
4135
 
4136
 
4137
 
@@ -4141,29 +4141,29 @@ PROFILE TRACE: torch_eager | cuda_B2_Q100_H8_E256_L4_P4
4141
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4142
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4143
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4144
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 19.345ms 1257.82% 19.345ms 19.345ms 1
4145
- torch_eager 19.37% 4.137ms 99.97% 21.351ms 21.351ms 0.000us 0.00% 1.539ms 1.539ms 1
4146
- aten::index 4.47% 955.266us 16.53% 3.530ms 73.551us 242.625us 15.78% 377.060us 7.855us 48
4147
- aten::copy_ 4.74% 1.012ms 11.59% 2.476ms 11.307us 367.943us 23.92% 367.943us 1.680us 219
4148
- aten::mul 5.81% 1.241ms 10.15% 2.167ms 11.287us 324.158us 21.08% 324.158us 1.688us 192
4149
- void at::native::index_elementwise_kernel&lt;128, 4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 242.625us 15.78% 242.625us 5.055us 48
4150
- aten::to 0.53% 113.722us 11.14% 2.380ms 13.916us 0.000us 0.00% 233.508us 1.366us 171
4151
- aten::_to_copy 2.07% 441.682us 10.61% 2.266ms 18.422us 0.000us 0.00% 233.508us 1.898us 123
4152
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 233.472us 15.18% 233.472us 1.946us 120
4153
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 168.769us 10.97% 168.769us 2.009us 84
4154
- aten::contiguous 0.38% 81.343us 8.57% 1.831ms 19.072us 0.000us 0.00% 134.435us 1.400us 96
4155
- aten::clone 0.71% 151.394us 8.19% 1.750ms 18.225us 0.000us 0.00% 134.435us 1.400us 96
4156
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 134.435us 8.74% 134.435us 1.400us 96
4157
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 116.161us 7.55% 116.161us 1.210us 96
4158
- aten::__and__ 0.37% 78.366us 4.26% 910.569us 10.840us 0.000us 0.00% 104.128us 1.240us 84
4159
- aten::bitwise_and 2.32% 495.587us 3.90% 832.203us 9.907us 104.128us 6.77% 104.128us 1.240us 84
4160
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 104.128us 6.77% 104.128us 1.240us 84
4161
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 95.391us 6.20% 95.391us 1.325us 72
4162
- aten::add 1.65% 352.101us 2.82% 602.659us 10.044us 83.522us 5.43% 83.522us 1.392us 60
4163
- aten::sub 2.19% 467.179us 3.78% 806.853us 11.206us 79.169us 5.15% 79.169us 1.100us 72
4164
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4165
- Self CPU time total: 21.357ms
4166
- Self CUDA time total: 1.538ms
4167
 
4168
 
4169
 
@@ -4173,43 +4173,37 @@ PROFILE TRACE: torch_eager | cuda_B2_Q300_H8_E256_L4_P4
4173
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4174
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4175
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4176
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 18.928ms 1070.52% 18.928ms 18.928ms 1
4177
- torch_eager 19.00% 4.018ms 99.97% 21.144ms 21.144ms 0.000us 0.00% 1.769ms 1.769ms 1
4178
- aten::mul 5.84% 1.234ms 10.44% 2.209ms 11.503us 449.959us 25.45% 449.959us 2.344us 192
4179
- aten::index 4.43% 937.219us 16.19% 3.424ms 71.339us 281.246us 15.91% 418.466us 8.718us 48
4180
- aten::copy_ 4.75% 1.005ms 11.71% 2.477ms 11.312us 370.923us 20.98% 370.923us 1.694us 219
4181
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 355.583us 20.11% 355.583us 2.963us 120
4182
- void at::native::index_elementwise_kernel&lt;128, 4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 281.246us 15.91% 281.246us 5.859us 48
4183
- aten::to 0.52% 110.789us 10.89% 2.302ms 13.465us 0.000us 0.00% 233.703us 1.367us 171
4184
- aten::_to_copy 1.88% 398.545us 10.36% 2.192ms 17.819us 0.000us 0.00% 233.703us 1.900us 123
4185
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 168.070us 9.51% 168.070us 2.001us 84
4186
- aten::contiguous 0.38% 80.073us 8.57% 1.813ms 18.880us 0.000us 0.00% 137.220us 1.429us 96
4187
- aten::clone 0.71% 149.477us 8.19% 1.732ms 18.046us 0.000us 0.00% 137.220us 1.429us 96
4188
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 137.220us 7.76% 137.220us 1.429us 96
4189
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 128.960us 7.29% 128.960us 1.791us 72
4190
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 120.326us 6.81% 120.326us 1.253us 96
4191
- aten::add 1.60% 338.443us 2.84% 599.957us 9.999us 113.407us 6.41% 113.407us 1.890us 60
4192
- aten::__and__ 0.34% 72.039us 4.35% 919.096us 10.942us 0.000us 0.00% 109.028us 1.298us 84
4193
- aten::bitwise_and 2.36% 498.512us 4.00% 847.057us 10.084us 109.028us 6.17% 109.028us 1.298us 84
4194
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 109.028us 6.17% 109.028us 1.298us 84
4195
- aten::sub 2.14% 452.695us 3.86% 815.589us 11.328us 84.674us 4.79% 84.674us 1.176us 72
4196
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4197
- Self CPU time total: 21.151ms
4198
- Self CUDA time total: 1.768ms
4199
 
4200
 
4201
  impl wl p50(ms) ok
4202
- torch_eager cuda_B1_Q100_H8_E256_L4_P4 3.29 True
4203
- torch_eager cuda_B1_Q300_H8_E256_L4_P4 4.07 True
4204
- torch_eager cuda_B2_Q100_H8_E256_L4_P4 4.07 True
4205
- torch_eager cuda_B2_Q300_H8_E256_L4_P4 4.11 True
4206
  </pre></div>
4207
- <div class="uv-install-logs" id="uv-logs-benchmark">
4208
- <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4209
- <div class="uv-logs-content" style="display: none;">
4210
- Installed 37 packages in 286ms
4211
- </div>
4212
- </div>
4213
  <div class="cell-artifacts">
4214
  <h4>Artifacts:</h4>
4215
  <a href="artifacts/benchmark/deformable_detr.jsonl" class="artifact" target="_blank">deformable_detr.jsonl</a>
 
3888
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3890
  </span> |
3891
+ Cell: nv | 0.28s
3892
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3894
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
 
3904
  </div>
3905
  </div>
3906
  <div id="output-nv" class="cell-output">
3907
+ <div class="cell-stdout"><pre class="stdout-text">Fri Dec 19 19:41:27 2025
3908
  +-----------------------------------------------------------------------------------------+
3909
  | NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 |
3910
  +-----------------------------------------+------------------------+----------------------+
 
3913
  | | | MIG M. |
3914
  |=========================================+========================+======================|
3915
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3916
+ | N/A 32C P0 120W / 350W | 0MiB / 46068MiB | 92% Default |
3917
  | | | N/A |
3918
  +-----------------------------------------+------------------------+----------------------+
3919
 
 
3935
  <span class="collapse-indicators">
3936
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3937
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3938
+ <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3939
  </span> |
3940
+ Cell: benchmark | 5.49s
3941
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3942
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3943
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
4077
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4078
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4079
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4080
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 20.954ms 1412.31% 20.954ms 20.954ms 1
4081
+ torch_eager 20.81% 4.774ms 99.96% 22.930ms 22.930ms 0.000us 0.00% 1.485ms 1.485ms 1
4082
+ aten::index 4.58% 1.051ms 16.46% 3.775ms 78.637us 236.928us 15.97% 370.530us 7.719us 48
4083
+ aten::copy_ 4.73% 1.085ms 11.22% 2.575ms 11.756us 365.953us 24.67% 365.953us 1.671us 219
4084
+ aten::mul 5.80% 1.330ms 10.04% 2.304ms 12.001us 294.214us 19.83% 294.214us 1.532us 192
4085
+ void at::native::index_elementwise_kernel&lt;128, 4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 236.928us 15.97% 236.928us 4.936us 48
4086
+ aten::to 0.58% 133.877us 11.21% 2.571ms 15.036us 0.000us 0.00% 232.351us 1.359us 171
4087
+ aten::_to_copy 2.31% 530.135us 10.63% 2.437ms 19.815us 0.000us 0.00% 232.351us 1.889us 123
4088
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 202.211us 13.63% 202.211us 1.685us 120
4089
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 167.998us 11.32% 167.998us 2.000us 84
4090
+ aten::contiguous 0.35% 80.702us 8.37% 1.919ms 19.991us 0.000us 0.00% 133.602us 1.392us 96
4091
+ aten::clone 0.72% 165.584us 8.01% 1.838ms 19.151us 0.000us 0.00% 133.602us 1.392us 96
4092
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 133.602us 9.00% 133.602us 1.392us 96
4093
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 115.712us 7.80% 115.712us 1.205us 96
4094
+ aten::__and__ 0.62% 142.312us 4.46% 1.024ms 12.189us 0.000us 0.00% 99.106us 1.180us 84
4095
+ aten::bitwise_and 2.26% 518.769us 3.84% 881.597us 10.495us 99.106us 6.68% 99.106us 1.180us 84
4096
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 99.106us 6.68% 99.106us 1.180us 84
4097
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 86.240us 5.81% 86.240us 1.198us 72
4098
+ aten::sub 2.18% 500.017us 3.71% 850.631us 11.814us 79.203us 5.34% 79.203us 1.100us 72
4099
+ aten::add 1.61% 368.526us 2.74% 627.393us 10.457us 74.431us 5.02% 74.431us 1.241us 60
4100
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4101
+ Self CPU time total: 22.938ms
4102
+ Self CUDA time total: 1.484ms
4103
 
4104
 
4105
 
 
4109
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4110
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4111
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4112
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 19.509ms 1221.49% 19.509ms 19.509ms 1
4113
+ torch_eager 19.85% 4.302ms 99.97% 21.668ms 21.668ms 0.000us 0.00% 1.598ms 1.598ms 1
4114
+ aten::index 4.46% 966.583us 16.34% 3.542ms 73.793us 250.148us 15.66% 382.462us 7.968us 48
4115
+ aten::copy_ 4.88% 1.058ms 11.66% 2.528ms 11.545us 367.423us 23.01% 367.423us 1.678us 219
4116
+ aten::mul 5.89% 1.276ms 10.32% 2.236ms 11.647us 359.260us 22.49% 359.260us 1.871us 192
4117
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 267.420us 16.74% 267.420us 2.228us 120
4118
+ void at::native::index_elementwise_kernel&lt;128, 4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 250.148us 15.66% 250.148us 5.211us 48
4119
+ aten::to 0.54% 118.126us 10.89% 2.361ms 13.808us 0.000us 0.00% 235.109us 1.375us 171
4120
+ aten::_to_copy 1.87% 405.252us 10.35% 2.243ms 18.236us 0.000us 0.00% 235.109us 1.911us 123
4121
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 169.767us 10.63% 169.767us 2.021us 84
4122
+ aten::contiguous 0.36% 77.869us 8.56% 1.855ms 19.322us 0.000us 0.00% 132.314us 1.378us 96
4123
+ aten::clone 0.77% 166.617us 8.20% 1.777ms 18.511us 0.000us 0.00% 132.314us 1.378us 96
4124
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 132.314us 8.28% 132.314us 1.378us 96
4125
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 117.886us 7.38% 117.886us 1.228us 96
4126
+ aten::__and__ 0.36% 78.606us 4.33% 937.927us 11.166us 0.000us 0.00% 105.249us 1.253us 84
4127
+ aten::bitwise_and 2.36% 512.411us 3.96% 859.321us 10.230us 105.249us 6.59% 105.249us 1.253us 84
4128
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 105.249us 6.59% 105.249us 1.253us 84
4129
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 104.480us 6.54% 104.480us 1.451us 72
4130
+ aten::add 1.62% 350.142us 2.81% 608.190us 10.136us 91.837us 5.75% 91.837us 1.531us 60
4131
+ aten::sub 2.30% 498.767us 3.88% 840.992us 11.680us 80.480us 5.04% 80.480us 1.118us 72
4132
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4133
+ Self CPU time total: 21.675ms
4134
+ Self CUDA time total: 1.597ms
4135
 
4136
 
4137
 
 
4141
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4142
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4143
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4144
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 19.494ms 1266.42% 19.494ms 19.494ms 1
4145
+ torch_eager 20.07% 4.284ms 99.97% 21.345ms 21.345ms 0.000us 0.00% 1.540ms 1.540ms 1
4146
+ aten::index 4.57% 976.579us 16.61% 3.546ms 73.876us 243.229us 15.80% 377.664us 7.868us 48
4147
+ aten::copy_ 4.96% 1.060ms 11.92% 2.545ms 11.623us 367.712us 23.89% 367.712us 1.679us 219
4148
+ aten::mul 6.15% 1.313ms 10.67% 2.278ms 11.865us 325.252us 21.13% 325.252us 1.694us 192
4149
+ void at::native::index_elementwise_kernel&lt;128, 4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 243.229us 15.80% 243.229us 5.067us 48
4150
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 234.241us 15.22% 234.241us 1.952us 120
4151
+ aten::to 0.55% 117.567us 11.05% 2.359ms 13.796us 0.000us 0.00% 233.277us 1.364us 171
4152
+ aten::_to_copy 1.93% 412.957us 10.50% 2.242ms 18.225us 0.000us 0.00% 233.277us 1.897us 123
4153
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 168.702us 10.96% 168.702us 2.008us 84
4154
+ aten::contiguous 0.37% 78.560us 8.76% 1.871ms 19.493us 0.000us 0.00% 134.435us 1.400us 96
4155
+ aten::clone 0.72% 153.204us 8.40% 1.793ms 18.675us 0.000us 0.00% 134.435us 1.400us 96
4156
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 134.435us 8.73% 134.435us 1.400us 96
4157
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 115.962us 7.53% 115.962us 1.208us 96
4158
+ aten::__and__ 0.35% 74.950us 4.35% 927.999us 11.048us 0.000us 0.00% 104.006us 1.238us 84
4159
+ aten::bitwise_and 2.36% 503.597us 4.00% 853.049us 10.155us 104.006us 6.76% 104.006us 1.238us 84
4160
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 104.006us 6.76% 104.006us 1.238us 84
4161
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 95.808us 6.22% 95.808us 1.331us 72
4162
+ aten::add 1.68% 357.766us 2.90% 618.339us 10.306us 83.778us 5.44% 83.778us 1.396us 60
4163
+ aten::sub 2.21% 472.075us 3.83% 818.182us 11.364us 78.946us 5.13% 78.946us 1.096us 72
4164
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4165
+ Self CPU time total: 21.351ms
4166
+ Self CUDA time total: 1.539ms
4167
 
4168
 
4169
 
 
4173
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4174
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4175
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4176
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 21.106ms 1191.42% 21.106ms 21.106ms 1
4177
+ torch_eager 20.48% 4.473ms 99.97% 21.833ms 21.833ms 0.000us 0.00% 1.773ms 1.773ms 1
4178
+ aten::mul 6.38% 1.394ms 11.03% 2.409ms 12.546us 451.910us 25.51% 451.910us 2.354us 192
4179
+ aten::index 4.81% 1.050ms 17.73% 3.872ms 80.660us 281.474us 15.89% 419.235us 8.734us 48
4180
+ aten::copy_ 5.13% 1.119ms 12.00% 2.622ms 11.970us 371.967us 21.00% 371.967us 1.698us 219
4181
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 357.220us 20.17% 357.220us 2.977us 120
4182
+ void at::native::index_elementwise_kernel&lt;128, 4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 281.474us 15.89% 281.474us 5.864us 48
4183
+ aten::to 0.62% 134.727us 11.66% 2.546ms 14.889us 0.000us 0.00% 234.206us 1.370us 171
4184
+ aten::_to_copy 2.10% 458.958us 11.04% 2.411ms 19.605us 0.000us 0.00% 234.206us 1.904us 123
4185
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 168.509us 9.51% 168.509us 2.006us 84
4186
+ aten::contiguous 0.48% 104.345us 9.14% 1.996ms 20.797us 0.000us 0.00% 137.761us 1.435us 96
4187
+ aten::clone 0.85% 185.548us 8.66% 1.892ms 19.710us 0.000us 0.00% 137.761us 1.435us 96
4188
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 137.761us 7.78% 137.761us 1.435us 96
4189
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 129.798us 7.33% 129.798us 1.803us 72
4190
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 120.063us 6.78% 120.063us 1.251us 96
4191
+ aten::add 1.83% 400.048us 3.06% 668.907us 11.148us 114.148us 6.44% 114.148us 1.902us 60
4192
+ aten::__and__ 0.43% 94.485us 4.77% 1.041ms 12.390us 0.000us 0.00% 108.862us 1.296us 84
4193
+ aten::bitwise_and 2.65% 579.339us 4.33% 946.258us 11.265us 108.862us 6.15% 108.862us 1.296us 84
4194
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 108.862us 6.15% 108.862us 1.296us 84
4195
+ aten::sub 2.45% 535.892us 4.10% 895.598us 12.439us 84.572us 4.77% 84.572us 1.175us 72
4196
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4197
+ Self CPU time total: 21.838ms
4198
+ Self CUDA time total: 1.771ms
4199
 
4200
 
4201
  impl wl p50(ms) ok
4202
+ torch_eager cuda_B1_Q100_H8_E256_L4_P4 3.46 True
4203
+ torch_eager cuda_B1_Q300_H8_E256_L4_P4 4.24 True
4204
+ torch_eager cuda_B2_Q100_H8_E256_L4_P4 4.25 True
4205
+ torch_eager cuda_B2_Q300_H8_E256_L4_P4 4.34 True
4206
  </pre></div>
 
 
 
 
 
 
4207
  <div class="cell-artifacts">
4208
  <h4>Artifacts:</h4>
4209
  <a href="artifacts/benchmark/deformable_detr.jsonl" class="artifact" target="_blank">deformable_detr.jsonl</a>
deformable_detr/results/artifacts/combine/latency.svg CHANGED

Git LFS Details

  • SHA256: 8ba4297c6f7aa344148bb08f308ffa1b2639ebb4a639b03f7139f745563a6d78
  • Pointer size: 130 Bytes
  • Size of remote file: 17.8 kB

Git LFS Details

  • SHA256: 88116c3810103702d4e4bca4659d09621c275dbe5bc24360506bd5c5adb84f9c
  • Pointer size: 130 Bytes
  • Size of remote file: 14.9 kB
deformable_detr/results/combined_results.html CHANGED
@@ -3889,7 +3889,7 @@ body[data-tool="eraser"] .main-content {
3889
  <rdf:RDF>
3890
  <ns2:Work>
3891
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
3892
- <dc:date>2025-12-19T19:10:04.668129</dc:date>
3893
  <dc:format>image/svg+xml</dc:format>
3894
  <dc:creator>
3895
  <ns2:Agent>
@@ -3908,260 +3908,208 @@ body[data-tool="eraser"] .main-content {
3908
  </g>
3909
  <g id="axes--1" class="axes">
3910
  <g id="patch_2">
3911
- <path d="M 47.72 425.105974 L 824.19299 425.105974 L 824.19299 26.88 L 47.72 26.88 L 47.72 425.105974 z " style="fill: none" />
3912
  </g>
3913
  <g id="matplotlib.axis_1">
3914
  <g id="xtick_1">
3915
  <g id="grid-x--1" class="grid grid-x">
3916
- <path d="M 83.014227 425.105974 L 83.014227 26.88 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3917
  </g>
3918
  <g id="line2d_1">
3919
  <defs>
3920
  <path id="mafb3703e5b" d="M 0 0 L 0 3.5 " style="stroke: #000000; stroke-width: 0.8" />
3921
  </defs>
3922
  <g>
3923
- <use ns4:href="#mafb3703e5b" x="83.014227" y="425.105974" style="stroke: #000000; stroke-width: 0.8" />
3924
  </g>
3925
  </g>
3926
  <g id="text_1">
3927
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(28.96641 549.280197) rotate(-45)">cuda_B1_Q100_H8_E256_L4_P4</text>
3928
  </g>
3929
  </g>
3930
  <g id="xtick_2">
3931
  <g id="grid-x--2" class="grid grid-x">
3932
- <path d="M 318.309072 425.105974 L 318.309072 26.88 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3933
  </g>
3934
  <g id="line2d_2">
3935
  <g>
3936
- <use ns4:href="#mafb3703e5b" x="318.309072" y="425.105974" style="stroke: #000000; stroke-width: 0.8" />
3937
  </g>
3938
  </g>
3939
  <g id="text_2">
3940
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(264.261255 549.280197) rotate(-45)">cuda_B1_Q300_H8_E256_L4_P4</text>
3941
  </g>
3942
  </g>
3943
  <g id="xtick_3">
3944
  <g id="grid-x--3" class="grid grid-x">
3945
- <path d="M 553.603918 425.105974 L 553.603918 26.88 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3946
  </g>
3947
  <g id="line2d_3">
3948
  <g>
3949
- <use ns4:href="#mafb3703e5b" x="553.603918" y="425.105974" style="stroke: #000000; stroke-width: 0.8" />
3950
  </g>
3951
  </g>
3952
  <g id="text_3">
3953
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(499.556101 549.280197) rotate(-45)">cuda_B2_Q100_H8_E256_L4_P4</text>
3954
  </g>
3955
  </g>
3956
  <g id="xtick_4">
3957
  <g id="grid-x--4" class="grid grid-x">
3958
- <path d="M 788.898763 425.105974 L 788.898763 26.88 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3959
  </g>
3960
  <g id="line2d_4">
3961
  <g>
3962
- <use ns4:href="#mafb3703e5b" x="788.898763" y="425.105974" style="stroke: #000000; stroke-width: 0.8" />
3963
  </g>
3964
  </g>
3965
  <g id="text_4">
3966
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(734.850946 549.280197) rotate(-45)">cuda_B2_Q300_H8_E256_L4_P4</text>
3967
  </g>
3968
  </g>
3969
  <g id="label--x" class="xlabel">
3970
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="435.956495" y="562.545859" transform="rotate(-0 435.956495 562.545859)">Workload</text>
3971
  </g>
3972
  </g>
3973
  <g id="matplotlib.axis_2">
3974
  <g id="ytick_1">
3975
  <g id="grid-y--2" class="grid grid-y">
3976
- <path d="M 47.72 410.313695 L 824.19299 410.313695 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3977
  </g>
3978
  <g id="line2d_5">
3979
  <defs>
3980
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
3981
  </defs>
3982
  <g>
3983
- <use ns4:href="#m0fca2865ba" x="47.72" y="410.313695" style="stroke: #000000; stroke-width: 0.8" />
3984
  </g>
3985
  </g>
3986
  <g id="text_5">
3987
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="414.112914" transform="rotate(-0 40.72 414.112914)">0.0</text>
3988
  </g>
3989
  </g>
3990
  <g id="ytick_2">
3991
  <g id="grid-y--3" class="grid grid-y">
3992
- <path d="M 47.72 365.88698 L 824.19299 365.88698 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3993
  </g>
3994
  <g id="line2d_6">
3995
  <g>
3996
- <use ns4:href="#m0fca2865ba" x="47.72" y="365.88698" style="stroke: #000000; stroke-width: 0.8" />
3997
  </g>
3998
  </g>
3999
  <g id="text_6">
4000
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="369.686199" transform="rotate(-0 40.72 369.686199)">0.5</text>
4001
  </g>
4002
  </g>
4003
  <g id="ytick_3">
4004
  <g id="grid-y--4" class="grid grid-y">
4005
- <path d="M 47.72 321.460266 L 824.19299 321.460266 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4006
  </g>
4007
  <g id="line2d_7">
4008
  <g>
4009
- <use ns4:href="#m0fca2865ba" x="47.72" y="321.460266" style="stroke: #000000; stroke-width: 0.8" />
4010
  </g>
4011
  </g>
4012
  <g id="text_7">
4013
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="325.259484" transform="rotate(-0 40.72 325.259484)">1.0</text>
4014
  </g>
4015
  </g>
4016
  <g id="ytick_4">
4017
  <g id="grid-y--5" class="grid grid-y">
4018
- <path d="M 47.72 277.033551 L 824.19299 277.033551 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4019
  </g>
4020
  <g id="line2d_8">
4021
  <g>
4022
- <use ns4:href="#m0fca2865ba" x="47.72" y="277.033551" style="stroke: #000000; stroke-width: 0.8" />
4023
  </g>
4024
  </g>
4025
  <g id="text_8">
4026
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="280.83277" transform="rotate(-0 40.72 280.83277)">1.5</text>
4027
  </g>
4028
  </g>
4029
  <g id="ytick_5">
4030
  <g id="grid-y--6" class="grid grid-y">
4031
- <path d="M 47.72 232.606836 L 824.19299 232.606836 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4032
  </g>
4033
  <g id="line2d_9">
4034
  <g>
4035
- <use ns4:href="#m0fca2865ba" x="47.72" y="232.606836" style="stroke: #000000; stroke-width: 0.8" />
4036
  </g>
4037
  </g>
4038
  <g id="text_9">
4039
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="236.406055" transform="rotate(-0 40.72 236.406055)">2.0</text>
4040
- </g>
4041
- </g>
4042
- <g id="ytick_6">
4043
- <g id="grid-y--7" class="grid grid-y">
4044
- <path d="M 47.72 188.180122 L 824.19299 188.180122 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4045
- </g>
4046
- <g id="line2d_10">
4047
- <g>
4048
- <use ns4:href="#m0fca2865ba" x="47.72" y="188.180122" style="stroke: #000000; stroke-width: 0.8" />
4049
- </g>
4050
- </g>
4051
- <g id="text_10">
4052
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="191.97934" transform="rotate(-0 40.72 191.97934)">2.5</text>
4053
- </g>
4054
- </g>
4055
- <g id="ytick_7">
4056
- <g id="grid-y--8" class="grid grid-y">
4057
- <path d="M 47.72 143.753407 L 824.19299 143.753407 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4058
- </g>
4059
- <g id="line2d_11">
4060
- <g>
4061
- <use ns4:href="#m0fca2865ba" x="47.72" y="143.753407" style="stroke: #000000; stroke-width: 0.8" />
4062
- </g>
4063
- </g>
4064
- <g id="text_11">
4065
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="147.552626" transform="rotate(-0 40.72 147.552626)">3.0</text>
4066
- </g>
4067
- </g>
4068
- <g id="ytick_8">
4069
- <g id="grid-y--9" class="grid grid-y">
4070
- <path d="M 47.72 99.326692 L 824.19299 99.326692 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4071
- </g>
4072
- <g id="line2d_12">
4073
- <g>
4074
- <use ns4:href="#m0fca2865ba" x="47.72" y="99.326692" style="stroke: #000000; stroke-width: 0.8" />
4075
- </g>
4076
- </g>
4077
- <g id="text_12">
4078
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="103.125911" transform="rotate(-0 40.72 103.125911)">3.5</text>
4079
- </g>
4080
- </g>
4081
- <g id="ytick_9">
4082
- <g id="grid-y--10" class="grid grid-y">
4083
- <path d="M 47.72 54.899978 L 824.19299 54.899978 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4084
- </g>
4085
- <g id="line2d_13">
4086
- <g>
4087
- <use ns4:href="#m0fca2865ba" x="47.72" y="54.899978" style="stroke: #000000; stroke-width: 0.8" />
4088
- </g>
4089
- </g>
4090
- <g id="text_13">
4091
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="58.699197" transform="rotate(-0 40.72 58.699197)">4.0</text>
4092
  </g>
4093
  </g>
4094
  <g id="label--y" class="ylabel">
4095
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="18.737188" y="225.992987" transform="rotate(-90 18.737188 225.992987)">Latency P50 (ms)</text>
4096
  </g>
4097
  </g>
4098
  <g id="series--hf-kernels-deformable-detr" class="series">
4099
- <path d="M 83.014227 407.004793 L 318.309072 406.541778 L 553.603918 406.347278 L 788.898763 406.283214 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4100
  <defs>
4101
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4102
  </defs>
4103
- <g clip-path="url(#pb5c8282ea4)">
4104
- <use ns4:href="#md7efaf3aec" x="83.014227" y="407.004793" style="fill: #1f77b4; stroke: #1f77b4" />
4105
- <use ns4:href="#md7efaf3aec" x="318.309072" y="406.541778" style="fill: #1f77b4; stroke: #1f77b4" />
4106
- <use ns4:href="#md7efaf3aec" x="553.603918" y="406.347278" style="fill: #1f77b4; stroke: #1f77b4" />
4107
- <use ns4:href="#md7efaf3aec" x="788.898763" y="406.283214" style="fill: #1f77b4; stroke: #1f77b4" />
4108
  </g>
4109
  </g>
4110
  <g id="series--torch-eager" class="series">
4111
- <path d="M 83.014227 118.130211 L 318.309072 48.708671 L 553.603918 48.49098 L 788.898763 44.981181 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4112
  <defs>
4113
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4114
  </defs>
4115
- <g clip-path="url(#pb5c8282ea4)">
4116
- <use ns4:href="#m9b8c54d372" x="83.014227" y="118.130211" style="fill: #ff7f0e; stroke: #ff7f0e" />
4117
- <use ns4:href="#m9b8c54d372" x="318.309072" y="48.708671" style="fill: #ff7f0e; stroke: #ff7f0e" />
4118
- <use ns4:href="#m9b8c54d372" x="553.603918" y="48.49098" style="fill: #ff7f0e; stroke: #ff7f0e" />
4119
- <use ns4:href="#m9b8c54d372" x="788.898763" y="44.981181" style="fill: #ff7f0e; stroke: #ff7f0e" />
4120
  </g>
4121
  </g>
4122
  <g id="patch_3">
4123
- <path d="M 47.72 425.105974 L 47.72 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4124
  </g>
4125
  <g id="patch_4">
4126
  <path d="M 824.19299 425.105974 L 824.19299 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4127
  </g>
4128
  <g id="patch_5">
4129
- <path d="M 47.72 425.105974 L 824.19299 425.105974 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4130
  </g>
4131
  <g id="patch_6">
4132
- <path d="M 47.72 26.88 L 824.19299 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4133
  </g>
4134
- <g id="text_14">
4135
- <text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="435.956495" y="20.88" transform="rotate(-0 435.956495 20.88)">Attention Implementation Latency</text>
4136
  </g>
4137
  <g id="legend" class="legend">
4138
  <g id="patch_7">
4139
- <path d="M 54.72 64.7925 L 225.330938 64.7925 Q 227.330938 64.7925 227.330938 62.7925 L 227.330938 33.88 Q 227.330938 31.88 225.330938 31.88 L 54.72 31.88 Q 52.72 31.88 52.72 33.88 L 52.72 62.7925 Q 52.72 64.7925 54.72 64.7925 L 54.72 64.7925 z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
4140
  </g>
4141
- <g id="line2d_14">
4142
- <path d="M 56.72 39.978438 L 66.72 39.978438 L 76.72 39.978438 " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4143
  <g>
4144
- <use ns4:href="#md7efaf3aec" x="66.72" y="39.978438" style="fill: #1f77b4; stroke: #1f77b4" />
4145
  </g>
4146
  </g>
4147
  <g id="legend-label--hf-kernels-deformable-detr" class="legend">
4148
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="84.72" y="43.478438" transform="rotate(-0 84.72 43.478438)">hf_kernels_deformable_detr</text>
4149
  </g>
4150
- <g id="line2d_15">
4151
- <path d="M 56.72 54.934687 L 66.72 54.934687 L 76.72 54.934687 " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4152
  <g>
4153
- <use ns4:href="#m9b8c54d372" x="66.72" y="54.934687" style="fill: #ff7f0e; stroke: #ff7f0e" />
4154
  </g>
4155
  </g>
4156
  <g id="legend-label--torch-eager" class="legend">
4157
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="84.72" y="58.434687" transform="rotate(-0 84.72 58.434687)">torch_eager</text>
4158
  </g>
4159
  </g>
4160
  </g>
4161
  </g>
4162
  <defs>
4163
- <clipPath id="pb5c8282ea4">
4164
- <rect x="47.72" y="26.88" width="776.47299" height="398.225974" />
4165
  </clipPath>
4166
  </defs>
4167
  </svg>
@@ -4174,7 +4122,7 @@ body[data-tool="eraser"] .main-content {
4174
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
4175
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
4176
  </span> |
4177
- Cell: combine | 4.43s
4178
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
4179
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
4180
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
@@ -4262,13 +4210,13 @@ COMBINED BENCHMARK SUMMARY
4262
 
4263
  impl wl p50(ms) ok
4264
  hf_kernels_deformable_detr cuda_B1_Q100_H8_E256_L4_P4 0.04 True
4265
- hf_kernels_deformable_detr cuda_B1_Q300_H8_E256_L4_P4 0.04 True
4266
- hf_kernels_deformable_detr cuda_B2_Q100_H8_E256_L4_P4 0.04 True
4267
  hf_kernels_deformable_detr cuda_B2_Q300_H8_E256_L4_P4 0.05 True
4268
- torch_eager cuda_B1_Q100_H8_E256_L4_P4 3.29 True
4269
- torch_eager cuda_B1_Q300_H8_E256_L4_P4 4.07 True
4270
- torch_eager cuda_B2_Q100_H8_E256_L4_P4 4.07 True
4271
- torch_eager cuda_B2_Q300_H8_E256_L4_P4 4.11 True
4272
 
4273
  GENERATING COMBINED VISUALIZATION
4274
 
@@ -4288,7 +4236,7 @@ Implementations included:
4288
  <div class="uv-install-logs" id="uv-logs-combine">
4289
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4290
  <div class="uv-logs-content" style="display: none;">
4291
- Installed 37 packages in 282ms
4292
  </div>
4293
  </div>
4294
  <div class="cell-artifacts">
@@ -4301,7 +4249,7 @@ Installed 37 packages in 282ms
4301
  <rdf:RDF>
4302
  <ns2:Work>
4303
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4304
- <dc:date>2025-12-19T19:10:04.668129</dc:date>
4305
  <dc:format>image/svg+xml</dc:format>
4306
  <dc:creator>
4307
  <ns2:Agent>
@@ -4320,260 +4268,208 @@ Installed 37 packages in 282ms
4320
  </g>
4321
  <g id="axes--1" class="axes">
4322
  <g id="patch_2">
4323
- <path d="M 47.72 425.105974 L 824.19299 425.105974 L 824.19299 26.88 L 47.72 26.88 L 47.72 425.105974 z " style="fill: none" />
4324
  </g>
4325
  <g id="matplotlib.axis_1">
4326
  <g id="xtick_1">
4327
  <g id="grid-x--1" class="grid grid-x">
4328
- <path d="M 83.014227 425.105974 L 83.014227 26.88 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4329
  </g>
4330
  <g id="line2d_1">
4331
  <defs>
4332
  <path id="mafb3703e5b" d="M 0 0 L 0 3.5 " style="stroke: #000000; stroke-width: 0.8" />
4333
  </defs>
4334
  <g>
4335
- <use ns4:href="#mafb3703e5b" x="83.014227" y="425.105974" style="stroke: #000000; stroke-width: 0.8" />
4336
  </g>
4337
  </g>
4338
  <g id="text_1">
4339
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(28.96641 549.280197) rotate(-45)">cuda_B1_Q100_H8_E256_L4_P4</text>
4340
  </g>
4341
  </g>
4342
  <g id="xtick_2">
4343
  <g id="grid-x--2" class="grid grid-x">
4344
- <path d="M 318.309072 425.105974 L 318.309072 26.88 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4345
  </g>
4346
  <g id="line2d_2">
4347
  <g>
4348
- <use ns4:href="#mafb3703e5b" x="318.309072" y="425.105974" style="stroke: #000000; stroke-width: 0.8" />
4349
  </g>
4350
  </g>
4351
  <g id="text_2">
4352
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(264.261255 549.280197) rotate(-45)">cuda_B1_Q300_H8_E256_L4_P4</text>
4353
  </g>
4354
  </g>
4355
  <g id="xtick_3">
4356
  <g id="grid-x--3" class="grid grid-x">
4357
- <path d="M 553.603918 425.105974 L 553.603918 26.88 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4358
  </g>
4359
  <g id="line2d_3">
4360
  <g>
4361
- <use ns4:href="#mafb3703e5b" x="553.603918" y="425.105974" style="stroke: #000000; stroke-width: 0.8" />
4362
  </g>
4363
  </g>
4364
  <g id="text_3">
4365
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(499.556101 549.280197) rotate(-45)">cuda_B2_Q100_H8_E256_L4_P4</text>
4366
  </g>
4367
  </g>
4368
  <g id="xtick_4">
4369
  <g id="grid-x--4" class="grid grid-x">
4370
- <path d="M 788.898763 425.105974 L 788.898763 26.88 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4371
  </g>
4372
  <g id="line2d_4">
4373
  <g>
4374
- <use ns4:href="#mafb3703e5b" x="788.898763" y="425.105974" style="stroke: #000000; stroke-width: 0.8" />
4375
  </g>
4376
  </g>
4377
  <g id="text_4">
4378
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(734.850946 549.280197) rotate(-45)">cuda_B2_Q300_H8_E256_L4_P4</text>
4379
  </g>
4380
  </g>
4381
  <g id="label--x" class="xlabel">
4382
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="435.956495" y="562.545859" transform="rotate(-0 435.956495 562.545859)">Workload</text>
4383
  </g>
4384
  </g>
4385
  <g id="matplotlib.axis_2">
4386
  <g id="ytick_1">
4387
  <g id="grid-y--2" class="grid grid-y">
4388
- <path d="M 47.72 410.313695 L 824.19299 410.313695 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4389
  </g>
4390
  <g id="line2d_5">
4391
  <defs>
4392
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4393
  </defs>
4394
  <g>
4395
- <use ns4:href="#m0fca2865ba" x="47.72" y="410.313695" style="stroke: #000000; stroke-width: 0.8" />
4396
  </g>
4397
  </g>
4398
  <g id="text_5">
4399
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="414.112914" transform="rotate(-0 40.72 414.112914)">0.0</text>
4400
  </g>
4401
  </g>
4402
  <g id="ytick_2">
4403
  <g id="grid-y--3" class="grid grid-y">
4404
- <path d="M 47.72 365.88698 L 824.19299 365.88698 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4405
  </g>
4406
  <g id="line2d_6">
4407
  <g>
4408
- <use ns4:href="#m0fca2865ba" x="47.72" y="365.88698" style="stroke: #000000; stroke-width: 0.8" />
4409
  </g>
4410
  </g>
4411
  <g id="text_6">
4412
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="369.686199" transform="rotate(-0 40.72 369.686199)">0.5</text>
4413
  </g>
4414
  </g>
4415
  <g id="ytick_3">
4416
  <g id="grid-y--4" class="grid grid-y">
4417
- <path d="M 47.72 321.460266 L 824.19299 321.460266 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4418
  </g>
4419
  <g id="line2d_7">
4420
  <g>
4421
- <use ns4:href="#m0fca2865ba" x="47.72" y="321.460266" style="stroke: #000000; stroke-width: 0.8" />
4422
  </g>
4423
  </g>
4424
  <g id="text_7">
4425
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="325.259484" transform="rotate(-0 40.72 325.259484)">1.0</text>
4426
  </g>
4427
  </g>
4428
  <g id="ytick_4">
4429
  <g id="grid-y--5" class="grid grid-y">
4430
- <path d="M 47.72 277.033551 L 824.19299 277.033551 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4431
  </g>
4432
  <g id="line2d_8">
4433
  <g>
4434
- <use ns4:href="#m0fca2865ba" x="47.72" y="277.033551" style="stroke: #000000; stroke-width: 0.8" />
4435
  </g>
4436
  </g>
4437
  <g id="text_8">
4438
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="280.83277" transform="rotate(-0 40.72 280.83277)">1.5</text>
4439
  </g>
4440
  </g>
4441
  <g id="ytick_5">
4442
  <g id="grid-y--6" class="grid grid-y">
4443
- <path d="M 47.72 232.606836 L 824.19299 232.606836 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4444
  </g>
4445
  <g id="line2d_9">
4446
  <g>
4447
- <use ns4:href="#m0fca2865ba" x="47.72" y="232.606836" style="stroke: #000000; stroke-width: 0.8" />
4448
  </g>
4449
  </g>
4450
  <g id="text_9">
4451
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="236.406055" transform="rotate(-0 40.72 236.406055)">2.0</text>
4452
- </g>
4453
- </g>
4454
- <g id="ytick_6">
4455
- <g id="grid-y--7" class="grid grid-y">
4456
- <path d="M 47.72 188.180122 L 824.19299 188.180122 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4457
- </g>
4458
- <g id="line2d_10">
4459
- <g>
4460
- <use ns4:href="#m0fca2865ba" x="47.72" y="188.180122" style="stroke: #000000; stroke-width: 0.8" />
4461
- </g>
4462
- </g>
4463
- <g id="text_10">
4464
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="191.97934" transform="rotate(-0 40.72 191.97934)">2.5</text>
4465
- </g>
4466
- </g>
4467
- <g id="ytick_7">
4468
- <g id="grid-y--8" class="grid grid-y">
4469
- <path d="M 47.72 143.753407 L 824.19299 143.753407 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4470
- </g>
4471
- <g id="line2d_11">
4472
- <g>
4473
- <use ns4:href="#m0fca2865ba" x="47.72" y="143.753407" style="stroke: #000000; stroke-width: 0.8" />
4474
- </g>
4475
- </g>
4476
- <g id="text_11">
4477
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="147.552626" transform="rotate(-0 40.72 147.552626)">3.0</text>
4478
- </g>
4479
- </g>
4480
- <g id="ytick_8">
4481
- <g id="grid-y--9" class="grid grid-y">
4482
- <path d="M 47.72 99.326692 L 824.19299 99.326692 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4483
- </g>
4484
- <g id="line2d_12">
4485
- <g>
4486
- <use ns4:href="#m0fca2865ba" x="47.72" y="99.326692" style="stroke: #000000; stroke-width: 0.8" />
4487
- </g>
4488
- </g>
4489
- <g id="text_12">
4490
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="103.125911" transform="rotate(-0 40.72 103.125911)">3.5</text>
4491
- </g>
4492
- </g>
4493
- <g id="ytick_9">
4494
- <g id="grid-y--10" class="grid grid-y">
4495
- <path d="M 47.72 54.899978 L 824.19299 54.899978 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4496
- </g>
4497
- <g id="line2d_13">
4498
- <g>
4499
- <use ns4:href="#m0fca2865ba" x="47.72" y="54.899978" style="stroke: #000000; stroke-width: 0.8" />
4500
- </g>
4501
- </g>
4502
- <g id="text_13">
4503
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="58.699197" transform="rotate(-0 40.72 58.699197)">4.0</text>
4504
  </g>
4505
  </g>
4506
  <g id="label--y" class="ylabel">
4507
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="18.737188" y="225.992987" transform="rotate(-90 18.737188 225.992987)">Latency P50 (ms)</text>
4508
  </g>
4509
  </g>
4510
  <g id="series--hf-kernels-deformable-detr" class="series">
4511
- <path d="M 83.014227 407.004793 L 318.309072 406.541778 L 553.603918 406.347278 L 788.898763 406.283214 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4512
  <defs>
4513
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4514
  </defs>
4515
- <g clip-path="url(#pb5c8282ea4)">
4516
- <use ns4:href="#md7efaf3aec" x="83.014227" y="407.004793" style="fill: #1f77b4; stroke: #1f77b4" />
4517
- <use ns4:href="#md7efaf3aec" x="318.309072" y="406.541778" style="fill: #1f77b4; stroke: #1f77b4" />
4518
- <use ns4:href="#md7efaf3aec" x="553.603918" y="406.347278" style="fill: #1f77b4; stroke: #1f77b4" />
4519
- <use ns4:href="#md7efaf3aec" x="788.898763" y="406.283214" style="fill: #1f77b4; stroke: #1f77b4" />
4520
  </g>
4521
  </g>
4522
  <g id="series--torch-eager" class="series">
4523
- <path d="M 83.014227 118.130211 L 318.309072 48.708671 L 553.603918 48.49098 L 788.898763 44.981181 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4524
  <defs>
4525
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4526
  </defs>
4527
- <g clip-path="url(#pb5c8282ea4)">
4528
- <use ns4:href="#m9b8c54d372" x="83.014227" y="118.130211" style="fill: #ff7f0e; stroke: #ff7f0e" />
4529
- <use ns4:href="#m9b8c54d372" x="318.309072" y="48.708671" style="fill: #ff7f0e; stroke: #ff7f0e" />
4530
- <use ns4:href="#m9b8c54d372" x="553.603918" y="48.49098" style="fill: #ff7f0e; stroke: #ff7f0e" />
4531
- <use ns4:href="#m9b8c54d372" x="788.898763" y="44.981181" style="fill: #ff7f0e; stroke: #ff7f0e" />
4532
  </g>
4533
  </g>
4534
  <g id="patch_3">
4535
- <path d="M 47.72 425.105974 L 47.72 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4536
  </g>
4537
  <g id="patch_4">
4538
  <path d="M 824.19299 425.105974 L 824.19299 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4539
  </g>
4540
  <g id="patch_5">
4541
- <path d="M 47.72 425.105974 L 824.19299 425.105974 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4542
  </g>
4543
  <g id="patch_6">
4544
- <path d="M 47.72 26.88 L 824.19299 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4545
  </g>
4546
- <g id="text_14">
4547
- <text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="435.956495" y="20.88" transform="rotate(-0 435.956495 20.88)">Attention Implementation Latency</text>
4548
  </g>
4549
  <g id="legend" class="legend">
4550
  <g id="patch_7">
4551
- <path d="M 54.72 64.7925 L 225.330938 64.7925 Q 227.330938 64.7925 227.330938 62.7925 L 227.330938 33.88 Q 227.330938 31.88 225.330938 31.88 L 54.72 31.88 Q 52.72 31.88 52.72 33.88 L 52.72 62.7925 Q 52.72 64.7925 54.72 64.7925 L 54.72 64.7925 z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
4552
  </g>
4553
- <g id="line2d_14">
4554
- <path d="M 56.72 39.978438 L 66.72 39.978438 L 76.72 39.978438 " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4555
  <g>
4556
- <use ns4:href="#md7efaf3aec" x="66.72" y="39.978438" style="fill: #1f77b4; stroke: #1f77b4" />
4557
  </g>
4558
  </g>
4559
  <g id="legend-label--hf-kernels-deformable-detr" class="legend">
4560
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="84.72" y="43.478438" transform="rotate(-0 84.72 43.478438)">hf_kernels_deformable_detr</text>
4561
  </g>
4562
- <g id="line2d_15">
4563
- <path d="M 56.72 54.934687 L 66.72 54.934687 L 76.72 54.934687 " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4564
  <g>
4565
- <use ns4:href="#m9b8c54d372" x="66.72" y="54.934687" style="fill: #ff7f0e; stroke: #ff7f0e" />
4566
  </g>
4567
  </g>
4568
  <g id="legend-label--torch-eager" class="legend">
4569
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="84.72" y="58.434687" transform="rotate(-0 84.72 58.434687)">torch_eager</text>
4570
  </g>
4571
  </g>
4572
  </g>
4573
  </g>
4574
  <defs>
4575
- <clipPath id="pb5c8282ea4">
4576
- <rect x="47.72" y="26.88" width="776.47299" height="398.225974" />
4577
  </clipPath>
4578
  </defs>
4579
  </svg>
 
3889
  <rdf:RDF>
3890
  <ns2:Work>
3891
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
3892
+ <dc:date>2025-12-19T19:55:30.123615</dc:date>
3893
  <dc:format>image/svg+xml</dc:format>
3894
  <dc:creator>
3895
  <ns2:Agent>
 
3908
  </g>
3909
  <g id="axes--1" class="axes">
3910
  <g id="patch_2">
3911
+ <path d="M 39.870649 425.105974 L 824.19299 425.105974 L 824.19299 26.88 L 39.870649 26.88 L 39.870649 425.105974 z " style="fill: none" />
3912
  </g>
3913
  <g id="matplotlib.axis_1">
3914
  <g id="xtick_1">
3915
  <g id="grid-x--1" class="grid grid-x">
3916
+ <path d="M 75.521665 425.105974 L 75.521665 26.88 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3917
  </g>
3918
  <g id="line2d_1">
3919
  <defs>
3920
  <path id="mafb3703e5b" d="M 0 0 L 0 3.5 " style="stroke: #000000; stroke-width: 0.8" />
3921
  </defs>
3922
  <g>
3923
+ <use ns4:href="#mafb3703e5b" x="75.521665" y="425.105974" style="stroke: #000000; stroke-width: 0.8" />
3924
  </g>
3925
  </g>
3926
  <g id="text_1">
3927
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(21.473848 549.280197) rotate(-45)">cuda_B1_Q100_H8_E256_L4_P4</text>
3928
  </g>
3929
  </g>
3930
  <g id="xtick_2">
3931
  <g id="grid-x--2" class="grid grid-x">
3932
+ <path d="M 313.195102 425.105974 L 313.195102 26.88 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3933
  </g>
3934
  <g id="line2d_2">
3935
  <g>
3936
+ <use ns4:href="#mafb3703e5b" x="313.195102" y="425.105974" style="stroke: #000000; stroke-width: 0.8" />
3937
  </g>
3938
  </g>
3939
  <g id="text_2">
3940
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(259.147284 549.280197) rotate(-45)">cuda_B1_Q300_H8_E256_L4_P4</text>
3941
  </g>
3942
  </g>
3943
  <g id="xtick_3">
3944
  <g id="grid-x--3" class="grid grid-x">
3945
+ <path d="M 550.868538 425.105974 L 550.868538 26.88 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3946
  </g>
3947
  <g id="line2d_3">
3948
  <g>
3949
+ <use ns4:href="#mafb3703e5b" x="550.868538" y="425.105974" style="stroke: #000000; stroke-width: 0.8" />
3950
  </g>
3951
  </g>
3952
  <g id="text_3">
3953
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(496.820721 549.280197) rotate(-45)">cuda_B2_Q100_H8_E256_L4_P4</text>
3954
  </g>
3955
  </g>
3956
  <g id="xtick_4">
3957
  <g id="grid-x--4" class="grid grid-x">
3958
+ <path d="M 788.541975 425.105974 L 788.541975 26.88 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3959
  </g>
3960
  <g id="line2d_4">
3961
  <g>
3962
+ <use ns4:href="#mafb3703e5b" x="788.541975" y="425.105974" style="stroke: #000000; stroke-width: 0.8" />
3963
  </g>
3964
  </g>
3965
  <g id="text_4">
3966
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(734.494157 549.280197) rotate(-45)">cuda_B2_Q300_H8_E256_L4_P4</text>
3967
  </g>
3968
  </g>
3969
  <g id="label--x" class="xlabel">
3970
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="432.03182" y="562.545859" transform="rotate(-0 432.03182 562.545859)">Workload</text>
3971
  </g>
3972
  </g>
3973
  <g id="matplotlib.axis_2">
3974
  <g id="ytick_1">
3975
  <g id="grid-y--2" class="grid grid-y">
3976
+ <path d="M 39.870649 410.192454 L 824.19299 410.192454 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3977
  </g>
3978
  <g id="line2d_5">
3979
  <defs>
3980
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
3981
  </defs>
3982
  <g>
3983
+ <use ns4:href="#m0fca2865ba" x="39.870649" y="410.192454" style="stroke: #000000; stroke-width: 0.8" />
3984
  </g>
3985
  </g>
3986
  <g id="text_5">
3987
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="32.870649" y="413.991673" transform="rotate(-0 32.870649 413.991673)">0</text>
3988
  </g>
3989
  </g>
3990
  <g id="ytick_2">
3991
  <g id="grid-y--3" class="grid grid-y">
3992
+ <path d="M 39.870649 326.087525 L 824.19299 326.087525 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3993
  </g>
3994
  <g id="line2d_6">
3995
  <g>
3996
+ <use ns4:href="#m0fca2865ba" x="39.870649" y="326.087525" style="stroke: #000000; stroke-width: 0.8" />
3997
  </g>
3998
  </g>
3999
  <g id="text_6">
4000
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="32.870649" y="329.886744" transform="rotate(-0 32.870649 329.886744)">1</text>
4001
  </g>
4002
  </g>
4003
  <g id="ytick_3">
4004
  <g id="grid-y--4" class="grid grid-y">
4005
+ <path d="M 39.870649 241.982596 L 824.19299 241.982596 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4006
  </g>
4007
  <g id="line2d_7">
4008
  <g>
4009
+ <use ns4:href="#m0fca2865ba" x="39.870649" y="241.982596" style="stroke: #000000; stroke-width: 0.8" />
4010
  </g>
4011
  </g>
4012
  <g id="text_7">
4013
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="32.870649" y="245.781814" transform="rotate(-0 32.870649 245.781814)">2</text>
4014
  </g>
4015
  </g>
4016
  <g id="ytick_4">
4017
  <g id="grid-y--5" class="grid grid-y">
4018
+ <path d="M 39.870649 157.877666 L 824.19299 157.877666 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4019
  </g>
4020
  <g id="line2d_8">
4021
  <g>
4022
+ <use ns4:href="#m0fca2865ba" x="39.870649" y="157.877666" style="stroke: #000000; stroke-width: 0.8" />
4023
  </g>
4024
  </g>
4025
  <g id="text_8">
4026
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="32.870649" y="161.676885" transform="rotate(-0 32.870649 161.676885)">3</text>
4027
  </g>
4028
  </g>
4029
  <g id="ytick_5">
4030
  <g id="grid-y--6" class="grid grid-y">
4031
+ <path d="M 39.870649 73.772737 L 824.19299 73.772737 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4032
  </g>
4033
  <g id="line2d_9">
4034
  <g>
4035
+ <use ns4:href="#m0fca2865ba" x="39.870649" y="73.772737" style="stroke: #000000; stroke-width: 0.8" />
4036
  </g>
4037
  </g>
4038
  <g id="text_9">
4039
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="32.870649" y="77.571956" transform="rotate(-0 32.870649 77.571956)">4</text>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4040
  </g>
4041
  </g>
4042
  <g id="label--y" class="ylabel">
4043
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="20.428462" y="225.992987" transform="rotate(-90 20.428462 225.992987)">Latency P50 (ms)</text>
4044
  </g>
4045
  </g>
4046
  <g id="series--hf-kernels-deformable-detr" class="series">
4047
+ <path d="M 75.521665 407.004793 L 313.195102 406.379052 L 550.868538 406.331113 L 788.541975 406.269716 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4048
  <defs>
4049
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4050
  </defs>
4051
+ <g clip-path="url(#pbac879f81a)">
4052
+ <use ns4:href="#md7efaf3aec" x="75.521665" y="407.004793" style="fill: #1f77b4; stroke: #1f77b4" />
4053
+ <use ns4:href="#md7efaf3aec" x="313.195102" y="406.379052" style="fill: #1f77b4; stroke: #1f77b4" />
4054
+ <use ns4:href="#md7efaf3aec" x="550.868538" y="406.331113" style="fill: #1f77b4; stroke: #1f77b4" />
4055
+ <use ns4:href="#md7efaf3aec" x="788.541975" y="406.269716" style="fill: #1f77b4; stroke: #1f77b4" />
4056
  </g>
4057
  </g>
4058
  <g id="series--torch-eager" class="series">
4059
+ <path d="M 75.521665 119.402268 L 313.195102 53.99992 L 550.868538 52.595284 L 788.541975 44.981181 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4060
  <defs>
4061
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4062
  </defs>
4063
+ <g clip-path="url(#pbac879f81a)">
4064
+ <use ns4:href="#m9b8c54d372" x="75.521665" y="119.402268" style="fill: #ff7f0e; stroke: #ff7f0e" />
4065
+ <use ns4:href="#m9b8c54d372" x="313.195102" y="53.99992" style="fill: #ff7f0e; stroke: #ff7f0e" />
4066
+ <use ns4:href="#m9b8c54d372" x="550.868538" y="52.595284" style="fill: #ff7f0e; stroke: #ff7f0e" />
4067
+ <use ns4:href="#m9b8c54d372" x="788.541975" y="44.981181" style="fill: #ff7f0e; stroke: #ff7f0e" />
4068
  </g>
4069
  </g>
4070
  <g id="patch_3">
4071
+ <path d="M 39.870649 425.105974 L 39.870649 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4072
  </g>
4073
  <g id="patch_4">
4074
  <path d="M 824.19299 425.105974 L 824.19299 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4075
  </g>
4076
  <g id="patch_5">
4077
+ <path d="M 39.870649 425.105974 L 824.19299 425.105974 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4078
  </g>
4079
  <g id="patch_6">
4080
+ <path d="M 39.870649 26.88 L 824.19299 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4081
  </g>
4082
+ <g id="text_10">
4083
+ <text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="432.03182" y="20.88" transform="rotate(-0 432.03182 20.88)">Attention Implementation Latency</text>
4084
  </g>
4085
  <g id="legend" class="legend">
4086
  <g id="patch_7">
4087
+ <path d="M 46.870649 64.7925 L 217.481587 64.7925 Q 219.481587 64.7925 219.481587 62.7925 L 219.481587 33.88 Q 219.481587 31.88 217.481587 31.88 L 46.870649 31.88 Q 44.870649 31.88 44.870649 33.88 L 44.870649 62.7925 Q 44.870649 64.7925 46.870649 64.7925 L 46.870649 64.7925 z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
4088
  </g>
4089
+ <g id="line2d_10">
4090
+ <path d="M 48.870649 39.978438 L 58.870649 39.978438 L 68.870649 39.978438 " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4091
  <g>
4092
+ <use ns4:href="#md7efaf3aec" x="58.870649" y="39.978438" style="fill: #1f77b4; stroke: #1f77b4" />
4093
  </g>
4094
  </g>
4095
  <g id="legend-label--hf-kernels-deformable-detr" class="legend">
4096
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="76.870649" y="43.478438" transform="rotate(-0 76.870649 43.478438)">hf_kernels_deformable_detr</text>
4097
  </g>
4098
+ <g id="line2d_11">
4099
+ <path d="M 48.870649 54.934687 L 58.870649 54.934687 L 68.870649 54.934687 " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4100
  <g>
4101
+ <use ns4:href="#m9b8c54d372" x="58.870649" y="54.934687" style="fill: #ff7f0e; stroke: #ff7f0e" />
4102
  </g>
4103
  </g>
4104
  <g id="legend-label--torch-eager" class="legend">
4105
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="76.870649" y="58.434687" transform="rotate(-0 76.870649 58.434687)">torch_eager</text>
4106
  </g>
4107
  </g>
4108
  </g>
4109
  </g>
4110
  <defs>
4111
+ <clipPath id="pbac879f81a">
4112
+ <rect x="39.870649" y="26.88" width="784.322341" height="398.225974" />
4113
  </clipPath>
4114
  </defs>
4115
  </svg>
 
4122
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
4123
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
4124
  </span> |
4125
+ Cell: combine | 4.63s
4126
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
4127
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
4128
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
 
4210
 
4211
  impl wl p50(ms) ok
4212
  hf_kernels_deformable_detr cuda_B1_Q100_H8_E256_L4_P4 0.04 True
4213
+ hf_kernels_deformable_detr cuda_B1_Q300_H8_E256_L4_P4 0.05 True
4214
+ hf_kernels_deformable_detr cuda_B2_Q100_H8_E256_L4_P4 0.05 True
4215
  hf_kernels_deformable_detr cuda_B2_Q300_H8_E256_L4_P4 0.05 True
4216
+ torch_eager cuda_B1_Q100_H8_E256_L4_P4 3.46 True
4217
+ torch_eager cuda_B1_Q300_H8_E256_L4_P4 4.24 True
4218
+ torch_eager cuda_B2_Q100_H8_E256_L4_P4 4.25 True
4219
+ torch_eager cuda_B2_Q300_H8_E256_L4_P4 4.34 True
4220
 
4221
  GENERATING COMBINED VISUALIZATION
4222
 
 
4236
  <div class="uv-install-logs" id="uv-logs-combine">
4237
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4238
  <div class="uv-logs-content" style="display: none;">
4239
+ Installed 37 packages in 311ms
4240
  </div>
4241
  </div>
4242
  <div class="cell-artifacts">
 
4249
  <rdf:RDF>
4250
  <ns2:Work>
4251
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4252
+ <dc:date>2025-12-19T19:55:30.123615</dc:date>
4253
  <dc:format>image/svg+xml</dc:format>
4254
  <dc:creator>
4255
  <ns2:Agent>
 
4268
  </g>
4269
  <g id="axes--1" class="axes">
4270
  <g id="patch_2">
4271
+ <path d="M 39.870649 425.105974 L 824.19299 425.105974 L 824.19299 26.88 L 39.870649 26.88 L 39.870649 425.105974 z " style="fill: none" />
4272
  </g>
4273
  <g id="matplotlib.axis_1">
4274
  <g id="xtick_1">
4275
  <g id="grid-x--1" class="grid grid-x">
4276
+ <path d="M 75.521665 425.105974 L 75.521665 26.88 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4277
  </g>
4278
  <g id="line2d_1">
4279
  <defs>
4280
  <path id="mafb3703e5b" d="M 0 0 L 0 3.5 " style="stroke: #000000; stroke-width: 0.8" />
4281
  </defs>
4282
  <g>
4283
+ <use ns4:href="#mafb3703e5b" x="75.521665" y="425.105974" style="stroke: #000000; stroke-width: 0.8" />
4284
  </g>
4285
  </g>
4286
  <g id="text_1">
4287
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(21.473848 549.280197) rotate(-45)">cuda_B1_Q100_H8_E256_L4_P4</text>
4288
  </g>
4289
  </g>
4290
  <g id="xtick_2">
4291
  <g id="grid-x--2" class="grid grid-x">
4292
+ <path d="M 313.195102 425.105974 L 313.195102 26.88 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4293
  </g>
4294
  <g id="line2d_2">
4295
  <g>
4296
+ <use ns4:href="#mafb3703e5b" x="313.195102" y="425.105974" style="stroke: #000000; stroke-width: 0.8" />
4297
  </g>
4298
  </g>
4299
  <g id="text_2">
4300
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(259.147284 549.280197) rotate(-45)">cuda_B1_Q300_H8_E256_L4_P4</text>
4301
  </g>
4302
  </g>
4303
  <g id="xtick_3">
4304
  <g id="grid-x--3" class="grid grid-x">
4305
+ <path d="M 550.868538 425.105974 L 550.868538 26.88 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4306
  </g>
4307
  <g id="line2d_3">
4308
  <g>
4309
+ <use ns4:href="#mafb3703e5b" x="550.868538" y="425.105974" style="stroke: #000000; stroke-width: 0.8" />
4310
  </g>
4311
  </g>
4312
  <g id="text_3">
4313
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(496.820721 549.280197) rotate(-45)">cuda_B2_Q100_H8_E256_L4_P4</text>
4314
  </g>
4315
  </g>
4316
  <g id="xtick_4">
4317
  <g id="grid-x--4" class="grid grid-x">
4318
+ <path d="M 788.541975 425.105974 L 788.541975 26.88 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4319
  </g>
4320
  <g id="line2d_4">
4321
  <g>
4322
+ <use ns4:href="#mafb3703e5b" x="788.541975" y="425.105974" style="stroke: #000000; stroke-width: 0.8" />
4323
  </g>
4324
  </g>
4325
  <g id="text_4">
4326
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(734.494157 549.280197) rotate(-45)">cuda_B2_Q300_H8_E256_L4_P4</text>
4327
  </g>
4328
  </g>
4329
  <g id="label--x" class="xlabel">
4330
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="432.03182" y="562.545859" transform="rotate(-0 432.03182 562.545859)">Workload</text>
4331
  </g>
4332
  </g>
4333
  <g id="matplotlib.axis_2">
4334
  <g id="ytick_1">
4335
  <g id="grid-y--2" class="grid grid-y">
4336
+ <path d="M 39.870649 410.192454 L 824.19299 410.192454 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4337
  </g>
4338
  <g id="line2d_5">
4339
  <defs>
4340
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4341
  </defs>
4342
  <g>
4343
+ <use ns4:href="#m0fca2865ba" x="39.870649" y="410.192454" style="stroke: #000000; stroke-width: 0.8" />
4344
  </g>
4345
  </g>
4346
  <g id="text_5">
4347
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="32.870649" y="413.991673" transform="rotate(-0 32.870649 413.991673)">0</text>
4348
  </g>
4349
  </g>
4350
  <g id="ytick_2">
4351
  <g id="grid-y--3" class="grid grid-y">
4352
+ <path d="M 39.870649 326.087525 L 824.19299 326.087525 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4353
  </g>
4354
  <g id="line2d_6">
4355
  <g>
4356
+ <use ns4:href="#m0fca2865ba" x="39.870649" y="326.087525" style="stroke: #000000; stroke-width: 0.8" />
4357
  </g>
4358
  </g>
4359
  <g id="text_6">
4360
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="32.870649" y="329.886744" transform="rotate(-0 32.870649 329.886744)">1</text>
4361
  </g>
4362
  </g>
4363
  <g id="ytick_3">
4364
  <g id="grid-y--4" class="grid grid-y">
4365
+ <path d="M 39.870649 241.982596 L 824.19299 241.982596 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4366
  </g>
4367
  <g id="line2d_7">
4368
  <g>
4369
+ <use ns4:href="#m0fca2865ba" x="39.870649" y="241.982596" style="stroke: #000000; stroke-width: 0.8" />
4370
  </g>
4371
  </g>
4372
  <g id="text_7">
4373
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="32.870649" y="245.781814" transform="rotate(-0 32.870649 245.781814)">2</text>
4374
  </g>
4375
  </g>
4376
  <g id="ytick_4">
4377
  <g id="grid-y--5" class="grid grid-y">
4378
+ <path d="M 39.870649 157.877666 L 824.19299 157.877666 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4379
  </g>
4380
  <g id="line2d_8">
4381
  <g>
4382
+ <use ns4:href="#m0fca2865ba" x="39.870649" y="157.877666" style="stroke: #000000; stroke-width: 0.8" />
4383
  </g>
4384
  </g>
4385
  <g id="text_8">
4386
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="32.870649" y="161.676885" transform="rotate(-0 32.870649 161.676885)">3</text>
4387
  </g>
4388
  </g>
4389
  <g id="ytick_5">
4390
  <g id="grid-y--6" class="grid grid-y">
4391
+ <path d="M 39.870649 73.772737 L 824.19299 73.772737 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4392
  </g>
4393
  <g id="line2d_9">
4394
  <g>
4395
+ <use ns4:href="#m0fca2865ba" x="39.870649" y="73.772737" style="stroke: #000000; stroke-width: 0.8" />
4396
  </g>
4397
  </g>
4398
  <g id="text_9">
4399
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="32.870649" y="77.571956" transform="rotate(-0 32.870649 77.571956)">4</text>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4400
  </g>
4401
  </g>
4402
  <g id="label--y" class="ylabel">
4403
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="20.428462" y="225.992987" transform="rotate(-90 20.428462 225.992987)">Latency P50 (ms)</text>
4404
  </g>
4405
  </g>
4406
  <g id="series--hf-kernels-deformable-detr" class="series">
4407
+ <path d="M 75.521665 407.004793 L 313.195102 406.379052 L 550.868538 406.331113 L 788.541975 406.269716 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4408
  <defs>
4409
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4410
  </defs>
4411
+ <g clip-path="url(#pbac879f81a)">
4412
+ <use ns4:href="#md7efaf3aec" x="75.521665" y="407.004793" style="fill: #1f77b4; stroke: #1f77b4" />
4413
+ <use ns4:href="#md7efaf3aec" x="313.195102" y="406.379052" style="fill: #1f77b4; stroke: #1f77b4" />
4414
+ <use ns4:href="#md7efaf3aec" x="550.868538" y="406.331113" style="fill: #1f77b4; stroke: #1f77b4" />
4415
+ <use ns4:href="#md7efaf3aec" x="788.541975" y="406.269716" style="fill: #1f77b4; stroke: #1f77b4" />
4416
  </g>
4417
  </g>
4418
  <g id="series--torch-eager" class="series">
4419
+ <path d="M 75.521665 119.402268 L 313.195102 53.99992 L 550.868538 52.595284 L 788.541975 44.981181 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4420
  <defs>
4421
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4422
  </defs>
4423
+ <g clip-path="url(#pbac879f81a)">
4424
+ <use ns4:href="#m9b8c54d372" x="75.521665" y="119.402268" style="fill: #ff7f0e; stroke: #ff7f0e" />
4425
+ <use ns4:href="#m9b8c54d372" x="313.195102" y="53.99992" style="fill: #ff7f0e; stroke: #ff7f0e" />
4426
+ <use ns4:href="#m9b8c54d372" x="550.868538" y="52.595284" style="fill: #ff7f0e; stroke: #ff7f0e" />
4427
+ <use ns4:href="#m9b8c54d372" x="788.541975" y="44.981181" style="fill: #ff7f0e; stroke: #ff7f0e" />
4428
  </g>
4429
  </g>
4430
  <g id="patch_3">
4431
+ <path d="M 39.870649 425.105974 L 39.870649 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4432
  </g>
4433
  <g id="patch_4">
4434
  <path d="M 824.19299 425.105974 L 824.19299 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4435
  </g>
4436
  <g id="patch_5">
4437
+ <path d="M 39.870649 425.105974 L 824.19299 425.105974 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4438
  </g>
4439
  <g id="patch_6">
4440
+ <path d="M 39.870649 26.88 L 824.19299 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4441
  </g>
4442
+ <g id="text_10">
4443
+ <text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="432.03182" y="20.88" transform="rotate(-0 432.03182 20.88)">Attention Implementation Latency</text>
4444
  </g>
4445
  <g id="legend" class="legend">
4446
  <g id="patch_7">
4447
+ <path d="M 46.870649 64.7925 L 217.481587 64.7925 Q 219.481587 64.7925 219.481587 62.7925 L 219.481587 33.88 Q 219.481587 31.88 217.481587 31.88 L 46.870649 31.88 Q 44.870649 31.88 44.870649 33.88 L 44.870649 62.7925 Q 44.870649 64.7925 46.870649 64.7925 L 46.870649 64.7925 z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
4448
  </g>
4449
+ <g id="line2d_10">
4450
+ <path d="M 48.870649 39.978438 L 58.870649 39.978438 L 68.870649 39.978438 " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4451
  <g>
4452
+ <use ns4:href="#md7efaf3aec" x="58.870649" y="39.978438" style="fill: #1f77b4; stroke: #1f77b4" />
4453
  </g>
4454
  </g>
4455
  <g id="legend-label--hf-kernels-deformable-detr" class="legend">
4456
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="76.870649" y="43.478438" transform="rotate(-0 76.870649 43.478438)">hf_kernels_deformable_detr</text>
4457
  </g>
4458
+ <g id="line2d_11">
4459
+ <path d="M 48.870649 54.934687 L 58.870649 54.934687 L 68.870649 54.934687 " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4460
  <g>
4461
+ <use ns4:href="#m9b8c54d372" x="58.870649" y="54.934687" style="fill: #ff7f0e; stroke: #ff7f0e" />
4462
  </g>
4463
  </g>
4464
  <g id="legend-label--torch-eager" class="legend">
4465
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="76.870649" y="58.434687" transform="rotate(-0 76.870649 58.434687)">torch_eager</text>
4466
  </g>
4467
  </g>
4468
  </g>
4469
  </g>
4470
  <defs>
4471
+ <clipPath id="pbac879f81a">
4472
+ <rect x="39.870649" y="26.88" width="784.322341" height="398.225974" />
4473
  </clipPath>
4474
  </defs>
4475
  </svg>
flash_attn/impls/artifacts/benchmark/attention.jsonl CHANGED
@@ -1,6 +1,6 @@
1
- {"ts": "2025-12-19T18:57:16Z", "run": "acbd7f3686fd441a96acd6946b221ed9", "impl": "sage_int8_fp16", "tags": {"family": "sageattention", "backend": "int8_fp16_cuda", "compile": "none"}, "wl": {"name": "cuda_attn_L128_bfloat16", "batch": 1, "seq_len": 4224, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": null, "compile_ms": null, "peak_bytes": null, "ok": false, "absmax": null, "corr": {}, "err": {"type": "AttributeError", "msg": "module 'sage_attention_e8dcde4226fe38e6' has no attribute 'fwd'"}}
2
- {"ts": "2025-12-19T18:57:16Z", "run": "acbd7f3686fd441a96acd6946b221ed9", "impl": "sage_int8_fp16", "tags": {"family": "sageattention", "backend": "int8_fp16_cuda", "compile": "none"}, "wl": {"name": "cuda_attn_L256_bfloat16", "batch": 1, "seq_len": 4352, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": null, "compile_ms": null, "peak_bytes": null, "ok": false, "absmax": null, "corr": {}, "err": {"type": "AttributeError", "msg": "module 'sage_attention_e8dcde4226fe38e6' has no attribute 'fwd'"}}
3
- {"ts": "2025-12-19T18:57:16Z", "run": "acbd7f3686fd441a96acd6946b221ed9", "impl": "sage_int8_fp16", "tags": {"family": "sageattention", "backend": "int8_fp16_cuda", "compile": "none"}, "wl": {"name": "cuda_attn_L320_bfloat16", "batch": 1, "seq_len": 4416, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": null, "compile_ms": null, "peak_bytes": null, "ok": false, "absmax": null, "corr": {}, "err": {"type": "AttributeError", "msg": "module 'sage_attention_e8dcde4226fe38e6' has no attribute 'fwd'"}}
4
- {"ts": "2025-12-19T18:57:16Z", "run": "acbd7f3686fd441a96acd6946b221ed9", "impl": "sage_int8_fp16", "tags": {"family": "sageattention", "backend": "int8_fp16_cuda", "compile": "none"}, "wl": {"name": "cuda_attn_L384_bfloat16", "batch": 1, "seq_len": 4480, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": null, "compile_ms": null, "peak_bytes": null, "ok": false, "absmax": null, "corr": {}, "err": {"type": "AttributeError", "msg": "module 'sage_attention_e8dcde4226fe38e6' has no attribute 'fwd'"}}
5
- {"ts": "2025-12-19T18:57:16Z", "run": "acbd7f3686fd441a96acd6946b221ed9", "impl": "sage_int8_fp16", "tags": {"family": "sageattention", "backend": "int8_fp16_cuda", "compile": "none"}, "wl": {"name": "cuda_attn_L448_bfloat16", "batch": 1, "seq_len": 4544, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": null, "compile_ms": null, "peak_bytes": null, "ok": false, "absmax": null, "corr": {}, "err": {"type": "AttributeError", "msg": "module 'sage_attention_e8dcde4226fe38e6' has no attribute 'fwd'"}}
6
- {"ts": "2025-12-19T18:57:16Z", "run": "acbd7f3686fd441a96acd6946b221ed9", "impl": "sage_int8_fp16", "tags": {"family": "sageattention", "backend": "int8_fp16_cuda", "compile": "none"}, "wl": {"name": "cuda_attn_L512_bfloat16", "batch": 1, "seq_len": 4608, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": null, "compile_ms": null, "peak_bytes": null, "ok": false, "absmax": null, "corr": {}, "err": {"type": "AttributeError", "msg": "module 'sage_attention_e8dcde4226fe38e6' has no attribute 'fwd'"}}
 
1
+ {"ts": "2025-12-19T19:55:13Z", "run": "e453bd1c3c404adca7ebbffbcb1899bf", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L128_bfloat16", "batch": 1, "seq_len": 4224, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.971173999914754, "p50": 0.9783339999103191, "p90": 0.9836439999162394, "mean": 0.9789179998733744, "iqr": 0.007710000090810354, "raw_times": [0.9783339999103191, 0.975933999825429, 0.9836439999162394, 0.9855039998001303, 0.971173999914754], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.0032949999185803, "peak_bytes": 295567360, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003604888916015625, "mse": 2.8014183044433594e-06, "ref": "sdpa_math_fp32"}, "err": null}
2
+ {"ts": "2025-12-19T19:55:13Z", "run": "e453bd1c3c404adca7ebbffbcb1899bf", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L256_bfloat16", "batch": 1, "seq_len": 4352, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.0312540000541048, "p50": 1.039535000018077, "p90": 1.0408949999600736, "mean": 1.0369627999807562, "iqr": 0.00922000003811263, "raw_times": [1.031674999921961, 1.0408949999600736, 1.0414549999495648, 1.0312540000541048, 1.039535000018077], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.0439259999657224, "peak_bytes": 304742400, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
3
+ {"ts": "2025-12-19T19:55:14Z", "run": "e453bd1c3c404adca7ebbffbcb1899bf", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L320_bfloat16", "batch": 1, "seq_len": 4416, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.05486600000404, "p50": 1.0608159998355404, "p90": 1.0660549999101931, "mean": 1.062165799930881, "iqr": 0.010128999974767794, "raw_times": [1.0608159998355404, 1.0731659999692056, 1.0559259999354254, 1.05486600000404, 1.0660549999101931], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.0692559999370133, "peak_bytes": 307494912, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
4
+ {"ts": "2025-12-19T19:55:14Z", "run": "e453bd1c3c404adca7ebbffbcb1899bf", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L384_bfloat16", "batch": 1, "seq_len": 4480, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.0765559998162644, "p50": 1.0860869999760325, "p90": 1.0925159999715106, "mean": 1.0862464000183536, "iqr": 0.013049999779468635, "raw_times": [1.0860869999760325, 1.0925159999715106, 1.0765559998162644, 1.079466000192042, 1.0966070001359185], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.10497600007875, "peak_bytes": 311296000, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
5
+ {"ts": "2025-12-19T19:55:14Z", "run": "e453bd1c3c404adca7ebbffbcb1899bf", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L448_bfloat16", "batch": 1, "seq_len": 4544, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.247940999974162, "p50": 1.2629510001715971, "p90": 1.2655800001084572, "mean": 1.2603426000623585, "iqr": 0.014840000176263857, "raw_times": [1.2629510001715971, 1.247940999974162, 1.2655800001084572, 1.2507399999321933, 1.2745010001253831], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.2752009999985603, "peak_bytes": 315621376, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
6
+ {"ts": "2025-12-19T19:55:14Z", "run": "e453bd1c3c404adca7ebbffbcb1899bf", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L512_bfloat16", "batch": 1, "seq_len": 4608, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.228739999987738, "p50": 1.2448499999209162, "p90": 1.2651710001136962, "mean": 1.2494922000314546, "iqr": 0.028152000140835298, "raw_times": [1.228739999987738, 1.2448499999209162, 1.237018999972861, 1.2716810001620615, 1.2651710001136962], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.262461000123949, "peak_bytes": 319946752, "ok": true, "absmax": 0.125, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.125, "mae": 0.000362396240234375, "mse": 2.8014183044433594e-06, "ref": "sdpa_math_fp32"}, "err": null}
flash_attn/impls/cells/benchmark.py CHANGED
@@ -3,8 +3,8 @@
3
  # dependencies = [
4
  # "numpy",
5
  # "torch==2.8.0",
6
- # "kernels",
7
  # "kernels-benchmark-tools",
 
8
  # ]
9
  #
10
  # [tool.uv.sources]
@@ -13,20 +13,18 @@
13
  import torch
14
  import sys
15
  from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
16
- from kernels import get_kernel
17
 
18
- # Load the sage attention kernel
19
- hf_kernels_sage_attn = get_kernel("kernels-community/sage_attention")
20
 
21
-
22
- def sage_attention(query, key, value):
23
- """SageAttention with INT8 Q/K quantization and FP16 P/V"""
24
- return hf_kernels_sage_attn.fwd(query, key, value, is_causal=False)[0]
25
 
26
 
27
  run_benchmark(
28
  kernel_type=KernelTypeEnum.ATTENTION,
29
- impl_name="sage_int8_fp16",
30
- impl_tags={"family": "sageattention", "backend": "int8_fp16_cuda", "compile": "none"},
31
- impl_func=sage_attention,
32
  )
 
3
  # dependencies = [
4
  # "numpy",
5
  # "torch==2.8.0",
 
6
  # "kernels-benchmark-tools",
7
+ # "xformers",
8
  # ]
9
  #
10
  # [tool.uv.sources]
 
13
  import torch
14
  import sys
15
  from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
16
+ import xformers.ops as xops
17
 
 
 
18
 
19
+ def xformers_attention(q, k, v):
20
+ """xFormers memory efficient attention"""
21
+ # xFormers expects [batch, seq_len, heads, head_dim]
22
+ return xops.memory_efficient_attention(q, k, v)
23
 
24
 
25
  run_benchmark(
26
  kernel_type=KernelTypeEnum.ATTENTION,
27
+ impl_name="xformers_meff",
28
+ impl_tags={"family": "xformers", "backend": "memory_efficient", "compile": "none"},
29
+ impl_func=xformers_attention,
30
  )
flash_attn/impls/flash_attention.html CHANGED
@@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content {
3888
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3890
  </span> |
3891
- Cell: nv | 0.28s
3892
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3894
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3905,7 +3905,7 @@ Cell: nv | 0.28s
3905
  </div>
3906
  </div>
3907
  <div id="output-nv" class="cell-output">
3908
- <div class="cell-stdout"><pre class="stdout-text">Fri Dec 19 18:57:02 2025
3909
  +-----------------------------------------------------------------------------------------+
3910
  | NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 |
3911
  +-----------------------------------------+------------------------+----------------------+
@@ -3914,7 +3914,7 @@ Cell: nv | 0.28s
3914
  | | | MIG M. |
3915
  |=========================================+========================+======================|
3916
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3917
- | N/A 34C P0 103W / 350W | 0MiB / 46068MiB | 31% Default |
3918
  | | | N/A |
3919
  +-----------------------------------------+------------------------+----------------------+
3920
 
@@ -3938,7 +3938,7 @@ Cell: nv | 0.28s
3938
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3939
  <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3940
  </span> |
3941
- Cell: benchmark | 4.28s
3942
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3943
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3944
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3989,29 +3989,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L128_bfloat16
3989
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3990
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3991
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3992
- torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.564ms 102.03% 3.564ms 3.564ms 1
3993
- torch_flash_ma 5.92% 322.864us 49.31% 2.690ms 2.690ms 0.000us 0.00% 3.533ms 3.533ms 1
3994
- aten::scaled_dot_product_attention 0.71% 38.601us 3.97% 216.634us 72.211us 0.000us 0.00% 2.778ms 926.157us 3
3995
- aten::_scaled_dot_product_flash_attention 0.48% 26.049us 3.26% 178.033us 59.344us 0.000us 0.00% 2.778ms 926.157us 3
3996
- aten::_flash_attention_forward 0.70% 38.244us 2.37% 129.043us 43.014us 2.778ms 79.53% 2.778ms 926.157us 3
3997
- void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.778ms 79.53% 2.778ms 926.157us 3
3998
- aten::contiguous 0.25% 13.590us 38.20% 2.084ms 173.652us 0.000us 0.00% 754.825us 62.902us 12
3999
- aten::clone 0.64% 35.000us 37.95% 2.070ms 172.519us 0.000us 0.00% 754.825us 62.902us 12
4000
- aten::copy_ 1.68% 91.923us 35.78% 1.952ms 162.645us 715.017us 20.47% 754.825us 62.902us 12
4001
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 715.017us 20.47% 715.017us 59.585us 12
4002
- Activity Buffer Request 32.25% 1.760ms 32.25% 1.760ms 1.760ms 39.808us 1.14% 39.808us 39.808us 1
4003
- aten::transpose 1.21% 66.005us 1.65% 89.944us 3.748us 0.000us 0.00% 0.000us 0.000us 24
4004
- aten::as_strided 0.44% 23.939us 0.44% 23.939us 0.997us 0.000us 0.00% 0.000us 0.000us 24
4005
- aten::empty_like 0.46% 24.998us 1.93% 105.512us 7.034us 0.000us 0.00% 0.000us 0.000us 15
4006
- aten::empty 1.74% 94.901us 1.74% 94.901us 3.954us 0.000us 0.00% 0.000us 0.000us 24
4007
- cudaLaunchKernel 2.30% 125.662us 2.30% 125.662us 8.377us 0.000us 0.00% 0.000us 0.000us 15
4008
- aten::empty_strided 0.30% 16.192us 0.30% 16.192us 5.397us 0.000us 0.00% 0.000us 0.000us 3
4009
- cudaDeviceGetAttribute 0.04% 2.360us 0.04% 2.360us 0.393us 0.000us 0.00% 0.000us 0.000us 6
4010
- cudaFuncSetAttribute 0.19% 10.450us 0.19% 10.450us 3.483us 0.000us 0.00% 0.000us 0.000us 3
4011
- cudaDeviceSynchronize 50.69% 2.765ms 50.69% 2.765ms 2.765ms 0.000us 0.00% 0.000us 0.000us 1
4012
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4013
- Self CPU time total: 5.456ms
4014
- Self CUDA time total: 3.493ms
4015
 
4016
 
4017
 
@@ -4021,29 +4021,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L256_bfloat16
4021
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4022
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4023
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4024
- torch_flash_ma 4.71% 256.956us 44.29% 2.416ms 2.416ms 0.000us 0.00% 3.774ms 3.774ms 1
4025
- torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.728ms 100.28% 3.728ms 3.728ms 1
4026
- aten::scaled_dot_product_attention 0.47% 25.660us 3.51% 191.364us 63.788us 0.000us 0.00% 2.953ms 984.270us 3
4027
- aten::_scaled_dot_product_flash_attention 0.35% 18.860us 3.04% 165.704us 55.235us 0.000us 0.00% 2.953ms 984.270us 3
4028
- aten::_flash_attention_forward 0.82% 44.462us 2.27% 123.662us 41.221us 2.953ms 79.43% 2.953ms 984.270us 3
4029
- void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.953ms 79.43% 2.953ms 984.270us 3
4030
- aten::contiguous 0.19% 10.628us 35.19% 1.920ms 159.985us 0.000us 0.00% 820.970us 68.414us 12
4031
- aten::clone 0.57% 30.960us 35.00% 1.909ms 159.100us 0.000us 0.00% 820.970us 68.414us 12
4032
- aten::copy_ 1.50% 81.693us 33.25% 1.814ms 151.145us 764.809us 20.57% 820.970us 68.414us 12
4033
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 764.809us 20.57% 764.809us 63.734us 12
4034
- Activity Buffer Request 30.19% 1.647ms 30.19% 1.647ms 1.647ms 56.161us 1.51% 56.161us 56.161us 1
4035
- aten::transpose 0.93% 50.867us 1.30% 70.984us 2.958us 0.000us 0.00% 0.000us 0.000us 24
4036
- aten::as_strided 0.37% 20.117us 0.37% 20.117us 0.838us 0.000us 0.00% 0.000us 0.000us 24
4037
- aten::empty_like 0.39% 21.390us 1.52% 82.920us 5.528us 0.000us 0.00% 0.000us 0.000us 15
4038
- aten::empty 1.43% 78.110us 1.43% 78.110us 3.255us 0.000us 0.00% 0.000us 0.000us 24
4039
- cudaLaunchKernel 2.02% 110.102us 2.02% 110.102us 7.340us 0.000us 0.00% 0.000us 0.000us 15
4040
- aten::empty_strided 0.25% 13.480us 0.25% 13.480us 4.493us 0.000us 0.00% 0.000us 0.000us 3
4041
- cudaDeviceGetAttribute 0.03% 1.800us 0.03% 1.800us 0.300us 0.000us 0.00% 0.000us 0.000us 6
4042
- cudaFuncSetAttribute 0.07% 4.010us 0.07% 4.010us 1.337us 0.000us 0.00% 0.000us 0.000us 3
4043
- cudaDeviceSynchronize 55.71% 3.039ms 55.71% 3.039ms 3.039ms 0.000us 0.00% 0.000us 0.000us 1
4044
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4045
- Self CPU time total: 5.455ms
4046
- Self CUDA time total: 3.718ms
4047
 
4048
 
4049
 
@@ -4053,29 +4053,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L320_bfloat16
4053
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4054
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4055
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4056
- torch_flash_ma 4.83% 269.985us 44.74% 2.500ms 2.500ms 0.000us 0.00% 3.834ms 3.834ms 1
4057
- torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.786ms 100.29% 3.786ms 3.786ms 1
4058
- aten::scaled_dot_product_attention 0.43% 24.011us 3.55% 198.294us 66.098us 0.000us 0.00% 2.997ms 999.122us 3
4059
- aten::_scaled_dot_product_flash_attention 0.34% 19.010us 3.12% 174.283us 58.094us 0.000us 0.00% 2.997ms 999.122us 3
4060
- aten::_flash_attention_forward 0.79% 43.958us 2.36% 131.713us 43.904us 2.997ms 79.40% 2.997ms 999.122us 3
4061
- void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.997ms 79.40% 2.997ms 999.122us 3
4062
- aten::contiguous 0.20% 11.122us 35.53% 1.985ms 165.423us 0.000us 0.00% 837.094us 69.758us 12
4063
- aten::clone 0.53% 29.350us 35.33% 1.974ms 164.496us 0.000us 0.00% 837.094us 69.758us 12
4064
- aten::copy_ 1.44% 80.718us 33.66% 1.880ms 156.702us 777.862us 20.60% 837.094us 69.758us 12
4065
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 777.862us 20.60% 777.862us 64.822us 12
4066
- Activity Buffer Request 30.68% 1.714ms 30.68% 1.714ms 1.714ms 59.232us 1.57% 59.232us 59.232us 1
4067
- aten::transpose 0.92% 51.150us 1.25% 70.010us 2.917us 0.000us 0.00% 0.000us 0.000us 24
4068
- aten::as_strided 0.34% 18.860us 0.34% 18.860us 0.786us 0.000us 0.00% 0.000us 0.000us 24
4069
- aten::empty_like 0.37% 20.561us 1.52% 84.672us 5.645us 0.000us 0.00% 0.000us 0.000us 15
4070
- aten::empty 1.54% 85.833us 1.54% 85.833us 3.576us 0.000us 0.00% 0.000us 0.000us 24
4071
- cudaLaunchKernel 1.95% 109.214us 1.95% 109.214us 7.281us 0.000us 0.00% 0.000us 0.000us 15
4072
- aten::empty_strided 0.27% 15.280us 0.27% 15.280us 5.093us 0.000us 0.00% 0.000us 0.000us 3
4073
- cudaDeviceGetAttribute 0.04% 2.120us 0.04% 2.120us 0.353us 0.000us 0.00% 0.000us 0.000us 6
4074
- cudaFuncSetAttribute 0.08% 4.293us 0.08% 4.293us 1.431us 0.000us 0.00% 0.000us 0.000us 3
4075
- cudaDeviceSynchronize 55.26% 3.087ms 55.26% 3.087ms 3.087ms 0.000us 0.00% 0.000us 0.000us 1
4076
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4077
- Self CPU time total: 5.587ms
4078
- Self CUDA time total: 3.775ms
4079
 
4080
 
4081
 
@@ -4085,29 +4085,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L384_bfloat16
4085
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4086
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4087
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4088
- torch_flash_ma 4.54% 264.303us 45.63% 2.655ms 2.655ms 0.000us 0.00% 3.910ms 3.910ms 1
4089
- torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.865ms 100.29% 3.865ms 3.865ms 1
4090
- aten::scaled_dot_product_attention 0.44% 25.860us 3.27% 190.173us 63.391us 0.000us 0.00% 3.076ms 1.025ms 3
4091
- aten::_scaled_dot_product_flash_attention 0.31% 18.100us 2.82% 164.313us 54.771us 0.000us 0.00% 3.076ms 1.025ms 3
4092
- aten::_flash_attention_forward 0.70% 40.710us 2.10% 122.383us 40.794us 3.076ms 79.82% 3.076ms 1.025ms 3
4093
- void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.076ms 79.82% 3.076ms 1.025ms 3
4094
- aten::contiguous 0.17% 9.789us 37.00% 2.153ms 179.384us 0.000us 0.00% 833.826us 69.486us 12
4095
- aten::clone 0.51% 29.519us 36.83% 2.143ms 178.569us 0.000us 0.00% 833.826us 69.486us 12
4096
- aten::copy_ 1.40% 81.625us 35.17% 2.046ms 170.539us 777.953us 20.18% 833.826us 69.486us 12
4097
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 777.953us 20.18% 777.953us 64.829us 12
4098
- Activity Buffer Request 28.32% 1.648ms 28.32% 1.648ms 1.648ms 55.873us 1.45% 55.873us 55.873us 1
4099
- aten::transpose 0.90% 52.082us 1.23% 71.483us 2.978us 0.000us 0.00% 0.000us 0.000us 24
4100
- aten::as_strided 0.33% 19.401us 0.33% 19.401us 0.808us 0.000us 0.00% 0.000us 0.000us 24
4101
- aten::empty_like 0.38% 21.851us 1.50% 87.141us 5.809us 0.000us 0.00% 0.000us 0.000us 15
4102
- aten::empty 1.38% 80.371us 1.38% 80.371us 3.349us 0.000us 0.00% 0.000us 0.000us 24
4103
- cudaLaunchKernel 5.88% 342.407us 5.88% 342.407us 22.827us 0.000us 0.00% 0.000us 0.000us 15
4104
- aten::empty_strided 0.26% 14.910us 0.26% 14.910us 4.970us 0.000us 0.00% 0.000us 0.000us 3
4105
- cudaDeviceGetAttribute 0.03% 1.811us 0.03% 1.811us 0.302us 0.000us 0.00% 0.000us 0.000us 6
4106
- cudaFuncSetAttribute 0.07% 4.181us 0.07% 4.181us 1.394us 0.000us 0.00% 0.000us 0.000us 3
4107
- cudaDeviceSynchronize 54.37% 3.164ms 54.37% 3.164ms 3.164ms 0.000us 0.00% 0.000us 0.000us 1
4108
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4109
- Self CPU time total: 5.818ms
4110
- Self CUDA time total: 3.854ms
4111
 
4112
 
4113
 
@@ -4117,29 +4117,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L448_bfloat16
4117
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4118
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4119
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4120
- torch_flash_ma 4.87% 306.708us 43.18% 2.718ms 2.718ms 0.000us 0.00% 4.364ms 4.364ms 1
4121
- torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.314ms 100.24% 4.314ms 4.314ms 1
4122
- aten::scaled_dot_product_attention 0.42% 26.322us 3.04% 191.625us 63.875us 0.000us 0.00% 3.500ms 1.167ms 3
4123
- aten::_scaled_dot_product_flash_attention 0.31% 19.398us 2.63% 165.303us 55.101us 0.000us 0.00% 3.500ms 1.167ms 3
4124
- aten::_flash_attention_forward 0.65% 40.750us 1.93% 121.261us 40.420us 3.500ms 81.33% 3.500ms 1.167ms 3
4125
- void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.500ms 81.33% 3.500ms 1.167ms 3
4126
- aten::contiguous 0.18% 11.020us 34.50% 2.172ms 180.965us 0.000us 0.00% 863.467us 71.956us 12
4127
- aten::clone 0.46% 28.711us 34.33% 2.161ms 180.047us 0.000us 0.00% 863.467us 71.956us 12
4128
- aten::copy_ 1.29% 81.309us 32.83% 2.066ms 172.192us 803.338us 18.67% 863.467us 71.956us 12
4129
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 803.338us 18.67% 803.338us 66.945us 12
4130
- Activity Buffer Request 26.76% 1.684ms 26.76% 1.684ms 1.684ms 60.129us 1.40% 60.129us 60.129us 1
4131
- aten::transpose 0.83% 52.430us 1.15% 72.394us 3.016us 0.000us 0.00% 0.000us 0.000us 24
4132
- aten::as_strided 0.32% 19.964us 0.32% 19.964us 0.832us 0.000us 0.00% 0.000us 0.000us 24
4133
- aten::empty_like 0.32% 19.960us 1.35% 84.930us 5.662us 0.000us 0.00% 0.000us 0.000us 15
4134
- aten::empty 1.27% 80.061us 1.27% 80.061us 3.336us 0.000us 0.00% 0.000us 0.000us 24
4135
- cudaLaunchKernel 5.16% 325.017us 5.16% 325.017us 21.668us 0.000us 0.00% 0.000us 0.000us 15
4136
- aten::empty_strided 0.23% 14.460us 0.23% 14.460us 4.820us 0.000us 0.00% 0.000us 0.000us 3
4137
- cudaDeviceGetAttribute 0.04% 2.690us 0.04% 2.690us 0.448us 0.000us 0.00% 0.000us 0.000us 6
4138
- cudaFuncSetAttribute 0.07% 4.660us 0.07% 4.660us 1.553us 0.000us 0.00% 0.000us 0.000us 3
4139
- cudaDeviceSynchronize 56.82% 3.576ms 56.82% 3.576ms 3.576ms 0.000us 0.00% 0.000us 0.000us 1
4140
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4141
- Self CPU time total: 6.294ms
4142
- Self CUDA time total: 4.304ms
4143
 
4144
 
4145
 
@@ -4149,38 +4149,38 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L512_bfloat16
4149
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4150
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4151
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4152
- torch_flash_ma 3.61% 231.105us 41.57% 2.662ms 2.662ms 0.000us 0.00% 4.461ms 4.461ms 1
4153
- torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.411ms 100.25% 4.411ms 4.411ms 1
4154
- aten::scaled_dot_product_attention 0.40% 25.770us 2.78% 178.013us 59.338us 0.000us 0.00% 3.582ms 1.194ms 3
4155
- aten::_scaled_dot_product_flash_attention 0.28% 17.960us 2.38% 152.243us 50.748us 0.000us 0.00% 3.582ms 1.194ms 3
4156
- aten::_flash_attention_forward 0.51% 32.421us 1.73% 110.913us 36.971us 3.582ms 81.42% 3.582ms 1.194ms 3
4157
- void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.582ms 81.42% 3.582ms 1.194ms 3
4158
- aten::contiguous 0.14% 9.230us 34.45% 2.206ms 183.815us 0.000us 0.00% 878.374us 73.198us 12
4159
- aten::clone 0.41% 26.011us 34.30% 2.197ms 183.046us 0.000us 0.00% 878.374us 73.198us 12
4160
- aten::copy_ 1.29% 82.861us 32.91% 2.107ms 175.603us 817.702us 18.58% 878.374us 73.198us 12
4161
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 817.702us 18.58% 817.702us 68.142us 12
4162
- Activity Buffer Request 26.91% 1.723ms 26.91% 1.723ms 1.723ms 60.672us 1.38% 60.672us 60.672us 1
4163
- aten::transpose 0.81% 51.890us 1.10% 70.690us 2.945us 0.000us 0.00% 0.000us 0.000us 24
4164
- aten::as_strided 0.29% 18.800us 0.29% 18.800us 0.783us 0.000us 0.00% 0.000us 0.000us 24
4165
- aten::empty_like 0.29% 18.829us 1.29% 82.771us 5.518us 0.000us 0.00% 0.000us 0.000us 15
4166
- aten::empty 1.23% 78.733us 1.23% 78.733us 3.281us 0.000us 0.00% 0.000us 0.000us 24
4167
- cudaLaunchKernel 5.08% 325.239us 5.08% 325.239us 21.683us 0.000us 0.00% 0.000us 0.000us 15
4168
- aten::empty_strided 0.23% 14.690us 0.23% 14.690us 4.897us 0.000us 0.00% 0.000us 0.000us 3
4169
- cudaDeviceGetAttribute 0.03% 1.808us 0.03% 1.808us 0.301us 0.000us 0.00% 0.000us 0.000us 6
4170
- cudaFuncSetAttribute 0.06% 3.871us 0.06% 3.871us 1.290us 0.000us 0.00% 0.000us 0.000us 3
4171
- cudaDeviceSynchronize 58.43% 3.741ms 58.43% 3.741ms 3.741ms 0.000us 0.00% 0.000us 0.000us 1
4172
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4173
- Self CPU time total: 6.404ms
4174
- Self CUDA time total: 4.400ms
4175
 
4176
 
4177
  impl wl p50(ms) ok
4178
- torch_flash_ma cuda_attn_L128_bfloat16 1.20 True
4179
- torch_flash_ma cuda_attn_L256_bfloat16 1.26 True
4180
- torch_flash_ma cuda_attn_L320_bfloat16 1.29 True
4181
- torch_flash_ma cuda_attn_L384_bfloat16 1.32 True
4182
- torch_flash_ma cuda_attn_L448_bfloat16 1.47 True
4183
- torch_flash_ma cuda_attn_L512_bfloat16 1.50 True
4184
  </pre></div>
4185
  <div class="cell-artifacts">
4186
  <h4>Artifacts:</h4>
 
3888
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3890
  </span> |
3891
+ Cell: nv | 0.25s
3892
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3894
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
 
3905
  </div>
3906
  </div>
3907
  <div id="output-nv" class="cell-output">
3908
+ <div class="cell-stdout"><pre class="stdout-text">Fri Dec 19 19:41:23 2025
3909
  +-----------------------------------------------------------------------------------------+
3910
  | NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 |
3911
  +-----------------------------------------+------------------------+----------------------+
 
3914
  | | | MIG M. |
3915
  |=========================================+========================+======================|
3916
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3917
+ | N/A 31C P0 107W / 350W | 0MiB / 46068MiB | 100% Default |
3918
  | | | N/A |
3919
  +-----------------------------------------+------------------------+----------------------+
3920
 
 
3938
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3939
  <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3940
  </span> |
3941
+ Cell: benchmark | 4.12s
3942
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3943
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3944
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3989
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3990
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3991
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3992
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.560ms 101.41% 3.560ms 3.560ms 1
3993
+ torch_flash_ma 6.12% 330.406us 49.12% 2.651ms 2.651ms 0.000us 0.00% 3.550ms 3.550ms 1
3994
+ aten::scaled_dot_product_attention 0.76% 41.091us 4.12% 222.225us 74.075us 0.000us 0.00% 2.785ms 928.191us 3
3995
+ aten::_scaled_dot_product_flash_attention 0.57% 30.902us 3.36% 181.134us 60.378us 0.000us 0.00% 2.785ms 928.191us 3
3996
+ aten::_flash_attention_forward 0.74% 39.881us 2.41% 130.323us 43.441us 2.785ms 79.34% 2.785ms 928.191us 3
3997
+ void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.785ms 79.34% 2.785ms 928.191us 3
3998
+ aten::contiguous 0.24% 12.809us 37.68% 2.033ms 169.455us 0.000us 0.00% 765.791us 63.816us 12
3999
+ aten::clone 0.64% 34.521us 37.44% 2.021ms 168.387us 0.000us 0.00% 765.791us 63.816us 12
4000
+ aten::copy_ 1.67% 90.094us 35.26% 1.903ms 158.570us 725.311us 20.66% 765.791us 63.816us 12
4001
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 725.311us 20.66% 725.311us 60.443us 12
4002
+ Activity Buffer Request 31.66% 1.709ms 31.66% 1.709ms 1.709ms 40.480us 1.15% 40.480us 40.480us 1
4003
+ aten::transpose 1.17% 63.269us 1.58% 85.140us 3.548us 0.000us 0.00% 0.000us 0.000us 24
4004
+ aten::as_strided 0.41% 21.871us 0.41% 21.871us 0.911us 0.000us 0.00% 0.000us 0.000us 24
4005
+ aten::empty_like 0.47% 25.421us 1.97% 106.322us 7.088us 0.000us 0.00% 0.000us 0.000us 15
4006
+ aten::empty 1.76% 94.971us 1.76% 94.971us 3.957us 0.000us 0.00% 0.000us 0.000us 24
4007
+ cudaLaunchKernel 2.37% 128.144us 2.37% 128.144us 8.543us 0.000us 0.00% 0.000us 0.000us 15
4008
+ aten::empty_strided 0.32% 17.100us 0.32% 17.100us 5.700us 0.000us 0.00% 0.000us 0.000us 3
4009
+ cudaDeviceGetAttribute 0.04% 2.290us 0.04% 2.290us 0.382us 0.000us 0.00% 0.000us 0.000us 6
4010
+ cudaFuncSetAttribute 0.18% 9.631us 0.18% 9.631us 3.210us 0.000us 0.00% 0.000us 0.000us 3
4011
+ cudaDeviceSynchronize 50.88% 2.746ms 50.88% 2.746ms 2.746ms 0.000us 0.00% 0.000us 0.000us 1
4012
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4013
+ Self CPU time total: 5.397ms
4014
+ Self CUDA time total: 3.510ms
4015
 
4016
 
4017
 
 
4021
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4022
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4023
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4024
+ torch_flash_ma 4.59% 254.063us 44.59% 2.468ms 2.468ms 0.000us 0.00% 3.765ms 3.765ms 1
4025
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.721ms 100.30% 3.721ms 3.721ms 1
4026
+ aten::scaled_dot_product_attention 0.43% 23.691us 3.30% 182.385us 60.795us 0.000us 0.00% 2.950ms 983.280us 3
4027
+ aten::_scaled_dot_product_flash_attention 0.32% 17.969us 2.87% 158.694us 52.898us 0.000us 0.00% 2.950ms 983.280us 3
4028
+ aten::_flash_attention_forward 0.74% 40.930us 2.17% 120.223us 40.074us 2.950ms 79.52% 2.950ms 983.280us 3
4029
+ void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.950ms 79.52% 2.950ms 983.280us 3
4030
+ aten::contiguous 0.16% 8.922us 35.94% 1.989ms 165.775us 0.000us 0.00% 815.354us 67.946us 12
4031
+ aten::clone 0.46% 25.650us 35.78% 1.980ms 165.031us 0.000us 0.00% 815.354us 67.946us 12
4032
+ aten::copy_ 1.41% 78.081us 34.18% 1.891ms 157.619us 759.770us 20.48% 815.354us 67.946us 12
4033
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 759.770us 20.48% 759.770us 63.314us 12
4034
+ Activity Buffer Request 31.33% 1.734ms 31.33% 1.734ms 1.734ms 55.584us 1.50% 55.584us 55.584us 1
4035
+ aten::transpose 0.84% 46.272us 1.13% 62.592us 2.608us 0.000us 0.00% 0.000us 0.000us 24
4036
+ aten::as_strided 0.29% 16.320us 0.29% 16.320us 0.680us 0.000us 0.00% 0.000us 0.000us 24
4037
+ aten::empty_like 0.39% 21.392us 1.49% 82.711us 5.514us 0.000us 0.00% 0.000us 0.000us 15
4038
+ aten::empty 1.42% 78.721us 1.42% 78.721us 3.280us 0.000us 0.00% 0.000us 0.000us 24
4039
+ cudaLaunchKernel 1.84% 101.714us 1.84% 101.714us 6.781us 0.000us 0.00% 0.000us 0.000us 15
4040
+ aten::empty_strided 0.25% 13.930us 0.25% 13.930us 4.643us 0.000us 0.00% 0.000us 0.000us 3
4041
+ cudaDeviceGetAttribute 0.03% 1.700us 0.03% 1.700us 0.283us 0.000us 0.00% 0.000us 0.000us 6
4042
+ cudaFuncSetAttribute 0.08% 4.360us 0.08% 4.360us 1.453us 0.000us 0.00% 0.000us 0.000us 3
4043
+ cudaDeviceSynchronize 55.41% 3.067ms 55.41% 3.067ms 3.067ms 0.000us 0.00% 0.000us 0.000us 1
4044
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4045
+ Self CPU time total: 5.534ms
4046
+ Self CUDA time total: 3.710ms
4047
 
4048
 
4049
 
 
4053
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4054
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4055
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4056
+ torch_flash_ma 4.62% 254.756us 44.14% 2.433ms 2.433ms 0.000us 0.00% 3.774ms 3.774ms 1
4057
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.727ms 100.29% 3.727ms 3.727ms 1
4058
+ aten::scaled_dot_product_attention 0.43% 23.830us 3.33% 183.454us 61.151us 0.000us 0.00% 2.942ms 980.796us 3
4059
+ aten::_scaled_dot_product_flash_attention 0.32% 17.891us 2.90% 159.624us 53.208us 0.000us 0.00% 2.942ms 980.796us 3
4060
+ aten::_flash_attention_forward 0.73% 40.074us 2.20% 121.152us 40.384us 2.942ms 79.17% 2.942ms 980.796us 3
4061
+ void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.942ms 79.17% 2.942ms 980.796us 3
4062
+ aten::contiguous 0.16% 8.718us 35.43% 1.953ms 162.745us 0.000us 0.00% 831.581us 69.298us 12
4063
+ aten::clone 0.47% 25.749us 35.27% 1.944ms 162.019us 0.000us 0.00% 831.581us 69.298us 12
4064
+ aten::copy_ 1.40% 77.041us 33.64% 1.855ms 154.552us 774.142us 20.83% 831.581us 69.298us 12
4065
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 774.142us 20.83% 774.142us 64.512us 12
4066
+ Activity Buffer Request 30.83% 1.700ms 30.83% 1.700ms 1.700ms 57.439us 1.55% 57.439us 57.439us 1
4067
+ aten::transpose 0.84% 46.360us 1.13% 62.482us 2.603us 0.000us 0.00% 0.000us 0.000us 24
4068
+ aten::as_strided 0.29% 16.122us 0.29% 16.122us 0.672us 0.000us 0.00% 0.000us 0.000us 24
4069
+ aten::empty_like 0.36% 19.611us 1.53% 84.374us 5.625us 0.000us 0.00% 0.000us 0.000us 15
4070
+ aten::empty 1.44% 79.561us 1.44% 79.561us 3.315us 0.000us 0.00% 0.000us 0.000us 24
4071
+ cudaLaunchKernel 1.87% 102.913us 1.87% 102.913us 6.861us 0.000us 0.00% 0.000us 0.000us 15
4072
+ aten::empty_strided 0.28% 15.330us 0.28% 15.330us 5.110us 0.000us 0.00% 0.000us 0.000us 3
4073
+ cudaDeviceGetAttribute 0.03% 1.680us 0.03% 1.680us 0.280us 0.000us 0.00% 0.000us 0.000us 6
4074
+ cudaFuncSetAttribute 0.07% 3.840us 0.07% 3.840us 1.280us 0.000us 0.00% 0.000us 0.000us 3
4075
+ cudaDeviceSynchronize 55.86% 3.080ms 55.86% 3.080ms 3.080ms 0.000us 0.00% 0.000us 0.000us 1
4076
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4077
+ Self CPU time total: 5.513ms
4078
+ Self CUDA time total: 3.717ms
4079
 
4080
 
4081
 
 
4085
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4086
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4087
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4088
+ torch_flash_ma 4.28% 249.055us 45.91% 2.672ms 2.672ms 0.000us 0.00% 3.870ms 3.870ms 1
4089
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.822ms 100.28% 3.822ms 3.822ms 1
4090
+ aten::scaled_dot_product_attention 0.44% 25.342us 3.23% 187.955us 62.652us 0.000us 0.00% 3.022ms 1.007ms 3
4091
+ aten::_scaled_dot_product_flash_attention 0.30% 17.701us 2.79% 162.613us 54.204us 0.000us 0.00% 3.022ms 1.007ms 3
4092
+ aten::_flash_attention_forward 0.71% 41.280us 2.11% 122.541us 40.847us 3.022ms 79.29% 3.022ms 1.007ms 3
4093
+ void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.022ms 79.29% 3.022ms 1.007ms 3
4094
+ aten::contiguous 0.16% 9.081us 37.65% 2.191ms 182.597us 0.000us 0.00% 847.483us 70.624us 12
4095
+ aten::clone 0.47% 27.546us 37.50% 2.182ms 181.840us 0.000us 0.00% 847.483us 70.624us 12
4096
+ aten::copy_ 1.40% 81.736us 35.91% 2.090ms 174.156us 789.211us 20.71% 847.483us 70.624us 12
4097
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 789.211us 20.71% 789.211us 65.768us 12
4098
+ Activity Buffer Request 29.46% 1.714ms 29.46% 1.714ms 1.714ms 58.272us 1.53% 58.272us 58.272us 1
4099
+ aten::transpose 0.83% 48.521us 1.13% 65.981us 2.749us 0.000us 0.00% 0.000us 0.000us 24
4100
+ aten::as_strided 0.30% 17.460us 0.30% 17.460us 0.727us 0.000us 0.00% 0.000us 0.000us 24
4101
+ aten::empty_like 0.35% 20.461us 1.45% 84.343us 5.623us 0.000us 0.00% 0.000us 0.000us 15
4102
+ aten::empty 1.38% 80.070us 1.38% 80.070us 3.336us 0.000us 0.00% 0.000us 0.000us 24
4103
+ cudaLaunchKernel 5.47% 318.217us 5.47% 318.217us 21.214us 0.000us 0.00% 0.000us 0.000us 15
4104
+ aten::empty_strided 0.25% 14.521us 0.25% 14.521us 4.840us 0.000us 0.00% 0.000us 0.000us 3
4105
+ cudaDeviceGetAttribute 0.03% 1.689us 0.03% 1.689us 0.282us 0.000us 0.00% 0.000us 0.000us 6
4106
+ cudaFuncSetAttribute 0.08% 4.671us 0.08% 4.671us 1.557us 0.000us 0.00% 0.000us 0.000us 3
4107
+ cudaDeviceSynchronize 54.09% 3.147ms 54.09% 3.147ms 3.147ms 0.000us 0.00% 0.000us 0.000us 1
4108
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4109
+ Self CPU time total: 5.819ms
4110
+ Self CUDA time total: 3.811ms
4111
 
4112
 
4113
 
 
4117
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4118
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4119
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4120
+ torch_flash_ma 4.79% 300.628us 43.01% 2.699ms 2.699ms 0.000us 0.00% 4.340ms 4.340ms 1
4121
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.290ms 100.25% 4.290ms 4.290ms 1
4122
+ aten::scaled_dot_product_attention 0.40% 25.381us 2.96% 185.704us 61.901us 0.000us 0.00% 3.474ms 1.158ms 3
4123
+ aten::_scaled_dot_product_flash_attention 0.28% 17.780us 2.55% 160.323us 53.441us 0.000us 0.00% 3.474ms 1.158ms 3
4124
+ aten::_flash_attention_forward 0.64% 40.370us 1.93% 121.223us 40.408us 3.474ms 81.17% 3.474ms 1.158ms 3
4125
+ void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.474ms 81.17% 3.474ms 1.158ms 3
4126
+ aten::contiguous 0.14% 9.022us 34.56% 2.169ms 180.719us 0.000us 0.00% 866.336us 72.195us 12
4127
+ aten::clone 0.44% 27.858us 34.41% 2.160ms 179.967us 0.000us 0.00% 866.336us 72.195us 12
4128
+ aten::copy_ 1.24% 77.719us 32.91% 2.066ms 172.130us 806.048us 18.83% 866.336us 72.195us 12
4129
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 806.048us 18.83% 806.048us 67.171us 12
4130
+ Activity Buffer Request 27.70% 1.738ms 27.70% 1.738ms 1.738ms 60.288us 1.41% 60.288us 60.288us 1
4131
+ aten::transpose 0.77% 48.240us 1.05% 65.650us 2.735us 0.000us 0.00% 0.000us 0.000us 24
4132
+ aten::as_strided 0.28% 17.410us 0.28% 17.410us 0.725us 0.000us 0.00% 0.000us 0.000us 24
4133
+ aten::empty_like 0.34% 21.363us 1.38% 86.453us 5.764us 0.000us 0.00% 0.000us 0.000us 15
4134
+ aten::empty 1.28% 80.561us 1.28% 80.561us 3.357us 0.000us 0.00% 0.000us 0.000us 24
4135
+ cudaLaunchKernel 4.36% 273.888us 4.36% 273.888us 18.259us 0.000us 0.00% 0.000us 0.000us 15
4136
+ aten::empty_strided 0.24% 14.900us 0.24% 14.900us 4.967us 0.000us 0.00% 0.000us 0.000us 3
4137
+ cudaDeviceGetAttribute 0.03% 1.700us 0.03% 1.700us 0.283us 0.000us 0.00% 0.000us 0.000us 6
4138
+ cudaFuncSetAttribute 0.07% 4.100us 0.07% 4.100us 1.367us 0.000us 0.00% 0.000us 0.000us 3
4139
+ cudaDeviceSynchronize 56.99% 3.576ms 56.99% 3.576ms 3.576ms 0.000us 0.00% 0.000us 0.000us 1
4140
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4141
+ Self CPU time total: 6.275ms
4142
+ Self CUDA time total: 4.280ms
4143
 
4144
 
4145
 
 
4149
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4150
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4151
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4152
+ torch_flash_ma 4.01% 253.526us 41.16% 2.602ms 2.602ms 0.000us 0.00% 4.429ms 4.429ms 1
4153
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.378ms 100.24% 4.378ms 4.378ms 1
4154
+ aten::scaled_dot_product_attention 0.38% 23.889us 2.89% 182.483us 60.828us 0.000us 0.00% 3.556ms 1.185ms 3
4155
+ aten::_scaled_dot_product_flash_attention 0.27% 17.360us 2.51% 158.594us 52.865us 0.000us 0.00% 3.556ms 1.185ms 3
4156
+ aten::_flash_attention_forward 0.66% 42.013us 1.90% 120.422us 40.141us 3.556ms 81.42% 3.556ms 1.185ms 3
4157
+ void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.556ms 81.42% 3.556ms 1.185ms 3
4158
+ aten::contiguous 0.14% 8.630us 33.58% 2.122ms 176.875us 0.000us 0.00% 872.667us 72.722us 12
4159
+ aten::clone 0.41% 26.047us 33.44% 2.114ms 176.156us 0.000us 0.00% 872.667us 72.722us 12
4160
+ aten::copy_ 1.25% 79.082us 32.00% 2.023ms 168.597us 811.483us 18.58% 872.667us 72.722us 12
4161
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 811.483us 18.58% 811.483us 67.624us 12
4162
+ Activity Buffer Request 26.87% 1.699ms 26.87% 1.699ms 1.699ms 61.184us 1.40% 61.184us 61.184us 1
4163
+ aten::transpose 0.75% 47.653us 1.02% 64.533us 2.689us 0.000us 0.00% 0.000us 0.000us 24
4164
+ aten::as_strided 0.27% 16.880us 0.27% 16.880us 0.703us 0.000us 0.00% 0.000us 0.000us 24
4165
+ aten::empty_like 0.33% 20.879us 1.34% 84.642us 5.643us 0.000us 0.00% 0.000us 0.000us 15
4166
+ aten::empty 1.25% 79.031us 1.25% 79.031us 3.293us 0.000us 0.00% 0.000us 0.000us 24
4167
+ cudaLaunchKernel 4.24% 268.168us 4.24% 268.168us 17.878us 0.000us 0.00% 0.000us 0.000us 15
4168
+ aten::empty_strided 0.23% 14.592us 0.23% 14.592us 4.864us 0.000us 0.00% 0.000us 0.000us 3
4169
+ cudaDeviceGetAttribute 0.03% 1.679us 0.03% 1.679us 0.280us 0.000us 0.00% 0.000us 0.000us 6
4170
+ cudaFuncSetAttribute 0.06% 3.920us 0.06% 3.920us 1.307us 0.000us 0.00% 0.000us 0.000us 3
4171
+ cudaDeviceSynchronize 58.84% 3.719ms 58.84% 3.719ms 3.719ms 0.000us 0.00% 0.000us 0.000us 1
4172
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4173
+ Self CPU time total: 6.322ms
4174
+ Self CUDA time total: 4.367ms
4175
 
4176
 
4177
  impl wl p50(ms) ok
4178
+ torch_flash_ma cuda_attn_L128_bfloat16 1.21 True
4179
+ torch_flash_ma cuda_attn_L256_bfloat16 1.25 True
4180
+ torch_flash_ma cuda_attn_L320_bfloat16 1.28 True
4181
+ torch_flash_ma cuda_attn_L384_bfloat16 1.31 True
4182
+ torch_flash_ma cuda_attn_L448_bfloat16 1.45 True
4183
+ torch_flash_ma cuda_attn_L512_bfloat16 1.49 True
4184
  </pre></div>
4185
  <div class="cell-artifacts">
4186
  <h4>Artifacts:</h4>
flash_attn/impls/hf_kernels_flash_attn.html CHANGED
@@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content {
3888
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3890
  </span> |
3891
- Cell: benchmark | 6.12s
3892
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3894
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3943,21 +3943,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L128_bfloat16
3943
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3944
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3945
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3946
- hf_kernels_flash_attn 3.19% 147.591us 44.59% 2.062ms 2.062ms 0.000us 0.00% 3.719ms 3.719ms 1
3947
- _flash_attn_9e27194::fwd 1.32% 60.849us 41.40% 1.914ms 638.151us 2.771ms 100.00% 3.719ms 1.240ms 3
3948
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.773ms 100.06% 2.773ms 2.773ms 1
3949
- void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.771ms 100.00% 2.771ms 923.713us 3
3950
- Activity Buffer Request 37.16% 1.718ms 37.16% 1.718ms 1.718ms 947.777us 34.20% 947.777us 947.777us 1
3951
- cudaDeviceGetAttribute 0.09% 4.211us 0.09% 4.211us 0.281us 0.000us 0.00% 0.000us 0.000us 15
3952
- aten::empty_like 0.37% 16.891us 1.10% 50.702us 16.901us 0.000us 0.00% 0.000us 0.000us 3
3953
- aten::empty_strided 0.73% 33.811us 0.73% 33.811us 11.270us 0.000us 0.00% 0.000us 0.000us 3
3954
- aten::empty 0.54% 24.922us 0.54% 24.922us 2.769us 0.000us 0.00% 0.000us 0.000us 9
3955
- cudaFuncSetAttribute 0.27% 12.349us 0.27% 12.349us 4.116us 0.000us 0.00% 0.000us 0.000us 3
3956
- cudaLaunchKernel 0.93% 42.971us 0.93% 42.971us 14.324us 0.000us 0.00% 0.000us 0.000us 3
3957
- cudaDeviceSynchronize 55.41% 2.563ms 55.41% 2.563ms 2.563ms 0.000us 0.00% 0.000us 0.000us 1
3958
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3959
- Self CPU time total: 4.625ms
3960
- Self CUDA time total: 2.771ms
3961
 
3962
 
3963
 
@@ -3967,21 +3967,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L256_bfloat16
3967
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3968
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3969
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3970
- hf_kernels_flash_attn 1.95% 91.420us 40.89% 1.916ms 1.916ms 0.000us 0.00% 3.901ms 3.901ms 1
3971
- _flash_attn_9e27194::fwd 0.98% 45.792us 38.94% 1.825ms 608.181us 2.914ms 100.00% 3.901ms 1.300ms 3
3972
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.916ms 100.05% 2.916ms 2.916ms 1
3973
- void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.914ms 100.00% 2.914ms 971.481us 3
3974
- Activity Buffer Request 36.29% 1.700ms 36.29% 1.700ms 1.700ms 986.884us 33.86% 986.884us 986.884us 1
3975
- cudaDeviceGetAttribute 0.07% 3.500us 0.07% 3.500us 0.233us 0.000us 0.00% 0.000us 0.000us 15
3976
- aten::empty_like 0.15% 6.960us 0.52% 24.320us 8.107us 0.000us 0.00% 0.000us 0.000us 3
3977
- aten::empty_strided 0.37% 17.360us 0.37% 17.360us 5.787us 0.000us 0.00% 0.000us 0.000us 3
3978
- aten::empty 0.45% 21.021us 0.45% 21.021us 2.336us 0.000us 0.00% 0.000us 0.000us 9
3979
- cudaFuncSetAttribute 0.08% 3.519us 0.08% 3.519us 1.173us 0.000us 0.00% 0.000us 0.000us 3
3980
- cudaLaunchKernel 0.55% 25.931us 0.55% 25.931us 8.644us 0.000us 0.00% 0.000us 0.000us 3
3981
- cudaDeviceSynchronize 59.11% 2.770ms 59.11% 2.770ms 2.770ms 0.000us 0.00% 0.000us 0.000us 1
3982
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3983
- Self CPU time total: 4.686ms
3984
- Self CUDA time total: 2.914ms
3985
 
3986
 
3987
 
@@ -3991,21 +3991,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L320_bfloat16
3991
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3992
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3993
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3994
- hf_kernels_flash_attn 2.13% 103.462us 40.42% 1.967ms 1.967ms 0.000us 0.00% 4.069ms 4.069ms 1
3995
- _flash_attn_9e27194::fwd 0.94% 45.522us 38.30% 1.863ms 621.134us 3.040ms 100.00% 4.069ms 1.356ms 3
3996
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.041ms 100.05% 3.041ms 3.041ms 1
3997
- void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.040ms 100.00% 3.040ms 1.013ms 3
3998
- Activity Buffer Request 35.70% 1.737ms 35.70% 1.737ms 1.737ms 1.029ms 33.84% 1.029ms 1.029ms 1
3999
- cudaDeviceGetAttribute 0.07% 3.488us 0.07% 3.488us 0.233us 0.000us 0.00% 0.000us 0.000us 15
4000
- aten::empty_like 0.13% 6.550us 0.49% 24.010us 8.003us 0.000us 0.00% 0.000us 0.000us 3
4001
- aten::empty_strided 0.36% 17.460us 0.36% 17.460us 5.820us 0.000us 0.00% 0.000us 0.000us 3
4002
- aten::empty 0.47% 22.651us 0.47% 22.651us 2.517us 0.000us 0.00% 0.000us 0.000us 9
4003
- cudaFuncSetAttribute 0.07% 3.621us 0.07% 3.621us 1.207us 0.000us 0.00% 0.000us 0.000us 3
4004
- cudaLaunchKernel 0.55% 26.960us 0.55% 26.960us 8.987us 0.000us 0.00% 0.000us 0.000us 3
4005
- cudaDeviceSynchronize 59.58% 2.899ms 59.58% 2.899ms 2.899ms 0.000us 0.00% 0.000us 0.000us 1
4006
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4007
- Self CPU time total: 4.866ms
4008
- Self CUDA time total: 3.040ms
4009
 
4010
 
4011
 
@@ -4015,21 +4015,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L384_bfloat16
4015
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4016
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4017
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4018
- hf_kernels_flash_attn 2.03% 100.371us 41.00% 2.032ms 2.032ms 0.000us 0.00% 4.098ms 4.098ms 1
4019
- _flash_attn_9e27194::fwd 0.92% 45.401us 38.98% 1.931ms 643.821us 3.066ms 100.00% 4.098ms 1.366ms 3
4020
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.068ms 100.05% 3.068ms 3.068ms 1
4021
- void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.066ms 100.00% 3.066ms 1.022ms 3
4022
- Activity Buffer Request 32.94% 1.632ms 32.94% 1.632ms 1.632ms 1.032ms 33.68% 1.032ms 1.032ms 1
4023
- cudaDeviceGetAttribute 0.07% 3.502us 0.07% 3.502us 0.233us 0.000us 0.00% 0.000us 0.000us 15
4024
- aten::empty_like 0.14% 6.780us 0.47% 23.270us 7.757us 0.000us 0.00% 0.000us 0.000us 3
4025
- aten::empty_strided 0.33% 16.490us 0.33% 16.490us 5.497us 0.000us 0.00% 0.000us 0.000us 3
4026
- aten::empty 0.45% 22.299us 0.45% 22.299us 2.478us 0.000us 0.00% 0.000us 0.000us 9
4027
- cudaFuncSetAttribute 0.09% 4.220us 0.09% 4.220us 1.407us 0.000us 0.00% 0.000us 0.000us 3
4028
- cudaLaunchKernel 4.04% 200.304us 4.04% 200.304us 66.768us 0.000us 0.00% 0.000us 0.000us 3
4029
- cudaDeviceSynchronize 59.00% 2.924ms 59.00% 2.924ms 2.924ms 0.000us 0.00% 0.000us 0.000us 1
4030
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4031
- Self CPU time total: 4.956ms
4032
- Self CUDA time total: 3.066ms
4033
 
4034
 
4035
 
@@ -4039,21 +4039,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L448_bfloat16
4039
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4040
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4041
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4042
- hf_kernels_flash_attn 2.01% 110.531us 38.27% 2.104ms 2.104ms 0.000us 0.00% 4.721ms 4.721ms 1
4043
- _flash_attn_9e27194::fwd 0.85% 46.845us 36.26% 1.993ms 664.435us 3.536ms 100.00% 4.721ms 1.574ms 3
4044
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.537ms 100.04% 3.537ms 3.537ms 1
4045
- void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.536ms 100.00% 3.536ms 1.179ms 3
4046
- Activity Buffer Request 31.52% 1.733ms 31.52% 1.733ms 1.733ms 1.186ms 33.53% 1.186ms 1.186ms 1
4047
- cudaDeviceGetAttribute 0.07% 3.850us 0.07% 3.850us 0.257us 0.000us 0.00% 0.000us 0.000us 15
4048
- aten::empty_like 0.13% 7.081us 0.42% 23.120us 7.707us 0.000us 0.00% 0.000us 0.000us 3
4049
- aten::empty_strided 0.29% 16.039us 0.29% 16.039us 5.346us 0.000us 0.00% 0.000us 0.000us 3
4050
- aten::empty 0.38% 21.099us 0.38% 21.099us 2.344us 0.000us 0.00% 0.000us 0.000us 9
4051
- cudaFuncSetAttribute 0.07% 3.738us 0.07% 3.738us 1.246us 0.000us 0.00% 0.000us 0.000us 3
4052
- cudaLaunchKernel 2.95% 161.933us 2.95% 161.933us 53.978us 0.000us 0.00% 0.000us 0.000us 3
4053
- cudaDeviceSynchronize 61.73% 3.393ms 61.73% 3.393ms 3.393ms 0.000us 0.00% 0.000us 0.000us 1
4054
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4055
- Self CPU time total: 5.497ms
4056
- Self CUDA time total: 3.536ms
4057
 
4058
 
4059
 
@@ -4063,36 +4063,36 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L512_bfloat16
4063
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4064
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4065
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4066
- hf_kernels_flash_attn 1.92% 105.962us 36.83% 2.036ms 2.036ms 0.000us 0.00% 4.864ms 4.864ms 1
4067
- _flash_attn_9e27194::fwd 0.86% 47.350us 34.91% 1.930ms 643.481us 3.642ms 100.00% 4.864ms 1.621ms 3
4068
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.643ms 100.04% 3.643ms 3.643ms 1
4069
- void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.642ms 100.00% 3.642ms 1.214ms 3
4070
- Activity Buffer Request 30.16% 1.668ms 30.16% 1.668ms 1.668ms 1.222ms 33.55% 1.222ms 1.222ms 1
4071
- cudaDeviceGetAttribute 0.06% 3.551us 0.06% 3.551us 0.237us 0.000us 0.00% 0.000us 0.000us 15
4072
- aten::empty_like 0.12% 6.900us 0.42% 23.180us 7.727us 0.000us 0.00% 0.000us 0.000us 3
4073
- aten::empty_strided 0.29% 16.280us 0.29% 16.280us 5.427us 0.000us 0.00% 0.000us 0.000us 3
4074
- aten::empty 0.40% 21.939us 0.40% 21.939us 2.438us 0.000us 0.00% 0.000us 0.000us 9
4075
- cudaFuncSetAttribute 0.07% 3.861us 0.07% 3.861us 1.287us 0.000us 0.00% 0.000us 0.000us 3
4076
- cudaLaunchKernel 2.95% 163.043us 2.95% 163.043us 54.348us 0.000us 0.00% 0.000us 0.000us 3
4077
- cudaDeviceSynchronize 63.17% 3.493ms 63.17% 3.493ms 3.493ms 0.000us 0.00% 0.000us 0.000us 1
4078
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4079
- Self CPU time total: 5.529ms
4080
- Self CUDA time total: 3.642ms
4081
 
4082
 
4083
  impl wl p50(ms) ok
4084
- hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.96 True
4085
- hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.02 True
4086
- hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.05 True
4087
  hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.06 True
4088
- hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.21 True
4089
- hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.22 True
4090
  </pre></div>
4091
  <div class="cell-stderr">
4092
- Fetching 20 files: 0%| | 0/20 [00:00&lt;?, ?it/s]
4093
- Fetching 20 files: 5%|▌ | 1/20 [00:00&lt;00:03, 6.04it/s]
4094
- Fetching 20 files: 10%|█ | 2/20 [00:01&lt;00:20, 1.14s/it]
4095
- Fetching 20 files: 100%|██████████| 20/20 [00:01&lt;00:00, 10.05it/s]
4096
  </div>
4097
  <div class="cell-artifacts">
4098
  <h4>Artifacts:</h4>
 
3888
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3890
  </span> |
3891
+ Cell: benchmark | 5.91s
3892
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3894
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3943
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3944
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3945
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3946
+ hf_kernels_flash_attn 3.35% 155.232us 44.97% 2.082ms 2.082ms 0.000us 0.00% 3.704ms 3.704ms 1
3947
+ _flash_attn_9e27194::fwd 1.43% 66.152us 41.62% 1.927ms 642.264us 2.766ms 100.00% 3.704ms 1.235ms 3
3948
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.768ms 100.06% 2.768ms 2.768ms 1
3949
+ void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.766ms 100.00% 2.766ms 922.153us 3
3950
+ Activity Buffer Request 37.12% 1.719ms 37.12% 1.719ms 1.719ms 937.630us 33.89% 937.630us 937.630us 1
3951
+ cudaDeviceGetAttribute 0.12% 5.360us 0.12% 5.360us 0.357us 0.000us 0.00% 0.000us 0.000us 15
3952
+ aten::empty_like 0.39% 18.222us 1.18% 54.592us 18.197us 0.000us 0.00% 0.000us 0.000us 3
3953
+ aten::empty_strided 0.79% 36.370us 0.79% 36.370us 12.123us 0.000us 0.00% 0.000us 0.000us 3
3954
+ aten::empty 0.56% 25.741us 0.56% 25.741us 2.860us 0.000us 0.00% 0.000us 0.000us 9
3955
+ cudaFuncSetAttribute 0.30% 13.770us 0.30% 13.770us 4.590us 0.000us 0.00% 0.000us 0.000us 3
3956
+ cudaLaunchKernel 0.92% 42.401us 0.92% 42.401us 14.134us 0.000us 0.00% 0.000us 0.000us 3
3957
+ cudaDeviceSynchronize 55.03% 2.548ms 55.03% 2.548ms 2.548ms 0.000us 0.00% 0.000us 0.000us 1
3958
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3959
+ Self CPU time total: 4.630ms
3960
+ Self CUDA time total: 2.766ms
3961
 
3962
 
3963
 
 
3967
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3968
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3969
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3970
+ hf_kernels_flash_attn 1.95% 91.533us 41.78% 1.962ms 1.962ms 0.000us 0.00% 3.856ms 3.856ms 1
3971
+ _flash_attn_9e27194::fwd 1.04% 49.050us 39.83% 1.870ms 623.350us 2.882ms 100.00% 3.856ms 1.285ms 3
3972
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.884ms 100.05% 2.884ms 2.884ms 1
3973
+ void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.882ms 100.00% 2.882ms 960.764us 3
3974
+ Activity Buffer Request 36.88% 1.732ms 36.88% 1.732ms 1.732ms 973.756us 33.78% 973.756us 973.756us 1
3975
+ cudaDeviceGetAttribute 0.09% 4.030us 0.09% 4.030us 0.269us 0.000us 0.00% 0.000us 0.000us 15
3976
+ aten::empty_like 0.18% 8.460us 0.61% 28.490us 9.497us 0.000us 0.00% 0.000us 0.000us 3
3977
+ aten::empty_strided 0.43% 20.030us 0.43% 20.030us 6.677us 0.000us 0.00% 0.000us 0.000us 3
3978
+ aten::empty 0.55% 25.961us 0.55% 25.961us 2.885us 0.000us 0.00% 0.000us 0.000us 9
3979
+ cudaFuncSetAttribute 0.08% 3.700us 0.08% 3.700us 1.233us 0.000us 0.00% 0.000us 0.000us 3
3980
+ cudaLaunchKernel 0.58% 27.091us 0.58% 27.091us 9.030us 0.000us 0.00% 0.000us 0.000us 3
3981
+ cudaDeviceSynchronize 58.22% 2.734ms 58.22% 2.734ms 2.734ms 0.000us 0.00% 0.000us 0.000us 1
3982
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3983
+ Self CPU time total: 4.695ms
3984
+ Self CUDA time total: 2.882ms
3985
 
3986
 
3987
 
 
3991
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3992
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3993
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3994
+ hf_kernels_flash_attn 2.18% 107.861us 40.50% 2.008ms 2.008ms 0.000us 0.00% 4.125ms 4.125ms 1
3995
+ _flash_attn_9e27194::fwd 0.99% 48.872us 38.32% 1.900ms 633.314us 3.094ms 100.00% 4.125ms 1.375ms 3
3996
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.095ms 100.05% 3.095ms 3.095ms 1
3997
+ void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.094ms 100.00% 3.094ms 1.031ms 3
3998
+ Activity Buffer Request 35.67% 1.768ms 35.67% 1.768ms 1.768ms 1.032ms 33.35% 1.032ms 1.032ms 1
3999
+ cudaDeviceGetAttribute 0.09% 4.480us 0.09% 4.480us 0.299us 0.000us 0.00% 0.000us 0.000us 15
4000
+ aten::empty_like 0.13% 6.580us 0.45% 22.520us 7.507us 0.000us 0.00% 0.000us 0.000us 3
4001
+ aten::empty_strided 0.32% 15.940us 0.32% 15.940us 5.313us 0.000us 0.00% 0.000us 0.000us 3
4002
+ aten::empty 0.47% 23.250us 0.47% 23.250us 2.583us 0.000us 0.00% 0.000us 0.000us 9
4003
+ cudaFuncSetAttribute 0.08% 3.791us 0.08% 3.791us 1.264us 0.000us 0.00% 0.000us 0.000us 3
4004
+ cudaLaunchKernel 0.58% 28.541us 0.58% 28.541us 9.514us 0.000us 0.00% 0.000us 0.000us 3
4005
+ cudaDeviceSynchronize 59.50% 2.950ms 59.50% 2.950ms 2.950ms 0.000us 0.00% 0.000us 0.000us 1
4006
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4007
+ Self CPU time total: 4.958ms
4008
+ Self CUDA time total: 3.094ms
4009
 
4010
 
4011
 
 
4015
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4016
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4017
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4018
+ hf_kernels_flash_attn 2.18% 109.362us 41.99% 2.109ms 2.109ms 0.000us 0.00% 4.102ms 4.102ms 1
4019
+ _flash_attn_9e27194::fwd 1.01% 50.650us 39.81% 1.999ms 666.498us 3.061ms 100.00% 4.102ms 1.367ms 3
4020
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.062ms 100.05% 3.062ms 3.062ms 1
4021
+ void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.061ms 100.00% 3.061ms 1.020ms 3
4022
+ Activity Buffer Request 33.41% 1.678ms 33.41% 1.678ms 1.678ms 1.041ms 34.02% 1.041ms 1.041ms 1
4023
+ cudaDeviceGetAttribute 0.08% 4.070us 0.08% 4.070us 0.271us 0.000us 0.00% 0.000us 0.000us 15
4024
+ aten::empty_like 0.14% 6.851us 0.49% 24.381us 8.127us 0.000us 0.00% 0.000us 0.000us 3
4025
+ aten::empty_strided 0.35% 17.530us 0.35% 17.530us 5.843us 0.000us 0.00% 0.000us 0.000us 3
4026
+ aten::empty 0.44% 22.140us 0.44% 22.140us 2.460us 0.000us 0.00% 0.000us 0.000us 9
4027
+ cudaFuncSetAttribute 0.08% 3.810us 0.08% 3.810us 1.270us 0.000us 0.00% 0.000us 0.000us 3
4028
+ cudaLaunchKernel 4.31% 216.396us 4.31% 216.396us 72.132us 0.000us 0.00% 0.000us 0.000us 3
4029
+ cudaDeviceSynchronize 58.01% 2.914ms 58.01% 2.914ms 2.914ms 0.000us 0.00% 0.000us 0.000us 1
4030
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4031
+ Self CPU time total: 5.023ms
4032
+ Self CUDA time total: 3.061ms
4033
 
4034
 
4035
 
 
4039
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4040
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4041
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4042
+ hf_kernels_flash_attn 1.91% 108.693us 38.60% 2.193ms 2.193ms 0.000us 0.00% 4.850ms 4.850ms 1
4043
+ _flash_attn_9e27194::fwd 0.87% 49.481us 36.69% 2.084ms 694.644us 3.635ms 100.00% 4.850ms 1.617ms 3
4044
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.637ms 100.05% 3.637ms 3.637ms 1
4045
+ void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.635ms 100.00% 3.635ms 1.212ms 3
4046
+ Activity Buffer Request 31.43% 1.785ms 31.43% 1.785ms 1.785ms 1.215ms 33.41% 1.215ms 1.215ms 1
4047
+ cudaDeviceGetAttribute 0.07% 3.761us 0.07% 3.761us 0.251us 0.000us 0.00% 0.000us 0.000us 15
4048
+ aten::empty_like 0.12% 6.970us 0.43% 24.340us 8.113us 0.000us 0.00% 0.000us 0.000us 3
4049
+ aten::empty_strided 0.31% 17.370us 0.31% 17.370us 5.790us 0.000us 0.00% 0.000us 0.000us 3
4050
+ aten::empty 0.43% 24.270us 0.43% 24.270us 2.697us 0.000us 0.00% 0.000us 0.000us 9
4051
+ cudaFuncSetAttribute 0.07% 3.730us 0.07% 3.730us 1.243us 0.000us 0.00% 0.000us 0.000us 3
4052
+ cudaLaunchKernel 3.40% 193.224us 3.40% 193.224us 64.408us 0.000us 0.00% 0.000us 0.000us 3
4053
+ cudaDeviceSynchronize 61.40% 3.487ms 61.40% 3.487ms 3.487ms 0.000us 0.00% 0.000us 0.000us 1
4054
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4055
+ Self CPU time total: 5.680ms
4056
+ Self CUDA time total: 3.635ms
4057
 
4058
 
4059
 
 
4063
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4064
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4065
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4066
+ hf_kernels_flash_attn 1.90% 106.201us 36.85% 2.064ms 2.064ms 0.000us 0.00% 4.915ms 4.915ms 1
4067
+ _flash_attn_9e27194::fwd 0.89% 50.062us 34.96% 1.958ms 652.751us 3.682ms 100.00% 4.915ms 1.638ms 3
4068
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.684ms 100.05% 3.684ms 3.684ms 1
4069
+ void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.682ms 100.00% 3.682ms 1.227ms 3
4070
+ Activity Buffer Request 29.73% 1.666ms 29.73% 1.666ms 1.666ms 1.233ms 33.48% 1.233ms 1.233ms 1
4071
+ cudaDeviceGetAttribute 0.07% 4.189us 0.07% 4.189us 0.279us 0.000us 0.00% 0.000us 0.000us 15
4072
+ aten::empty_like 0.12% 6.851us 0.45% 25.301us 8.434us 0.000us 0.00% 0.000us 0.000us 3
4073
+ aten::empty_strided 0.33% 18.450us 0.33% 18.450us 6.150us 0.000us 0.00% 0.000us 0.000us 3
4074
+ aten::empty 0.40% 22.632us 0.40% 22.632us 2.515us 0.000us 0.00% 0.000us 0.000us 9
4075
+ cudaFuncSetAttribute 0.07% 3.850us 0.07% 3.850us 1.283us 0.000us 0.00% 0.000us 0.000us 3
4076
+ cudaLaunchKernel 3.33% 186.623us 3.33% 186.623us 62.208us 0.000us 0.00% 0.000us 0.000us 3
4077
+ cudaDeviceSynchronize 63.15% 3.537ms 63.15% 3.537ms 3.537ms 0.000us 0.00% 0.000us 0.000us 1
4078
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4079
+ Self CPU time total: 5.602ms
4080
+ Self CUDA time total: 3.682ms
4081
 
4082
 
4083
  impl wl p50(ms) ok
4084
+ hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.95 True
4085
+ hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.00 True
4086
+ hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.04 True
4087
  hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.06 True
4088
+ hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.22 True
4089
+ hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.24 True
4090
  </pre></div>
4091
  <div class="cell-stderr">
4092
+ Fetching 20 files: 0%| | 0/20 [00:00&lt;?, ?it/s]Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.
4093
+
4094
+ Fetching 20 files: 10%|█ | 2/20 [00:01&lt;00:15, 1.16it/s]
4095
+ Fetching 20 files: 100%|██████████| 20/20 [00:01&lt;00:00, 11.63it/s]
4096
  </div>
4097
  <div class="cell-artifacts">
4098
  <h4>Artifacts:</h4>
flash_attn/impls/hf_kernels_flash_attn3.html CHANGED
@@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content {
3888
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3890
  </span> |
3891
- Cell: benchmark | 6.42s
3892
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3894
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3942,19 +3942,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L128_bfloat16
3942
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3943
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3944
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3945
- hf_kernels_flash_attn3 3.60% 164.063us 47.53% 2.169ms 2.169ms 0.000us 0.00% 3.577ms 3.577ms 1
3946
- FlashAttnFunc 2.65% 121.151us 43.94% 2.005ms 668.341us 0.000us 0.00% 3.577ms 1.192ms 3
3947
- _flash_attn3_1d39a44::fwd 1.62% 73.763us 41.28% 1.884ms 627.958us 2.686ms 100.00% 3.577ms 1.192ms 3
3948
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.688ms 100.07% 2.688ms 2.688ms 1
3949
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.686ms 100.00% 2.686ms 895.374us 3
3950
- Activity Buffer Request 37.38% 1.706ms 37.38% 1.706ms 1.706ms 891.299us 33.18% 891.299us 891.299us 1
3951
- aten::empty 0.94% 42.930us 0.94% 42.930us 7.155us 0.000us 0.00% 0.000us 0.000us 6
3952
- cudaFuncSetAttribute 0.33% 14.999us 0.33% 14.999us 5.000us 0.000us 0.00% 0.000us 0.000us 3
3953
- cudaLaunchKernel 1.02% 46.432us 1.02% 46.432us 15.477us 0.000us 0.00% 0.000us 0.000us 3
3954
- cudaDeviceSynchronize 52.47% 2.394ms 52.47% 2.394ms 2.394ms 0.000us 0.00% 0.000us 0.000us 1
3955
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3956
- Self CPU time total: 4.563ms
3957
- Self CUDA time total: 2.686ms
3958
 
3959
 
3960
 
@@ -3964,19 +3964,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L256_bfloat16
3964
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3965
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3966
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3967
- hf_kernels_flash_attn3 2.68% 123.103us 45.27% 2.082ms 2.082ms 0.000us 0.00% 3.670ms 3.670ms 1
3968
- FlashAttnFunc 2.03% 93.300us 42.60% 1.959ms 653.024us 0.000us 0.00% 3.670ms 1.223ms 3
3969
- _flash_attn3_1d39a44::fwd 1.05% 48.412us 40.57% 1.866ms 621.924us 2.738ms 100.00% 3.670ms 1.223ms 3
3970
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.739ms 100.06% 2.739ms 2.739ms 1
3971
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.738ms 100.00% 2.738ms 912.629us 3
3972
- Activity Buffer Request 38.14% 1.754ms 38.14% 1.754ms 1.754ms 932.416us 34.06% 932.416us 932.416us 1
3973
- aten::empty 0.59% 27.041us 0.59% 27.041us 4.507us 0.000us 0.00% 0.000us 0.000us 6
3974
- cudaFuncSetAttribute 0.14% 6.480us 0.14% 6.480us 2.160us 0.000us 0.00% 0.000us 0.000us 3
3975
- cudaLaunchKernel 0.64% 29.621us 0.64% 29.621us 9.874us 0.000us 0.00% 0.000us 0.000us 3
3976
- cudaDeviceSynchronize 54.73% 2.517ms 54.73% 2.517ms 2.517ms 0.000us 0.00% 0.000us 0.000us 1
3977
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3978
- Self CPU time total: 4.599ms
3979
- Self CUDA time total: 2.738ms
3980
 
3981
 
3982
 
@@ -3986,19 +3986,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L320_bfloat16
3986
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3987
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3988
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3989
- hf_kernels_flash_attn3 2.66% 126.472us 43.74% 2.079ms 2.079ms 0.000us 0.00% 3.863ms 3.863ms 1
3990
- FlashAttnFunc 1.87% 89.050us 41.07% 1.952ms 650.694us 0.000us 0.00% 3.863ms 1.288ms 3
3991
- _flash_attn3_1d39a44::fwd 1.00% 47.600us 39.20% 1.863ms 621.011us 2.883ms 100.00% 3.863ms 1.288ms 3
3992
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.885ms 100.05% 2.885ms 2.885ms 1
3993
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.883ms 100.00% 2.883ms 961.034us 3
3994
- Activity Buffer Request 36.94% 1.756ms 36.94% 1.756ms 1.756ms 979.903us 33.99% 979.903us 979.903us 1
3995
- aten::empty 0.53% 25.081us 0.53% 25.081us 4.180us 0.000us 0.00% 0.000us 0.000us 6
3996
- cudaFuncSetAttribute 0.11% 5.050us 0.11% 5.050us 1.683us 0.000us 0.00% 0.000us 0.000us 3
3997
- cudaLaunchKernel 0.62% 29.612us 0.62% 29.612us 9.871us 0.000us 0.00% 0.000us 0.000us 3
3998
- cudaDeviceSynchronize 56.26% 2.674ms 56.26% 2.674ms 2.674ms 0.000us 0.00% 0.000us 0.000us 1
3999
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4000
- Self CPU time total: 4.753ms
4001
- Self CUDA time total: 2.883ms
4002
 
4003
 
4004
 
@@ -4008,19 +4008,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L384_bfloat16
4008
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4009
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4010
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4011
- hf_kernels_flash_attn3 2.48% 119.623us 44.91% 2.170ms 2.170ms 0.000us 0.00% 3.846ms 3.846ms 1
4012
- FlashAttnFunc 1.87% 90.201us 42.43% 2.050ms 683.325us 0.000us 0.00% 3.846ms 1.282ms 3
4013
- _flash_attn3_1d39a44::fwd 0.98% 47.571us 40.56% 1.960ms 653.258us 2.874ms 100.00% 3.846ms 1.282ms 3
4014
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.876ms 100.05% 2.876ms 2.876ms 1
4015
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.874ms 100.00% 2.874ms 957.983us 3
4016
- Activity Buffer Request 34.13% 1.649ms 34.13% 1.649ms 1.649ms 972.223us 33.83% 972.223us 972.223us 1
4017
- aten::empty 0.55% 26.410us 0.55% 26.410us 4.402us 0.000us 0.00% 0.000us 0.000us 6
4018
- cudaFuncSetAttribute 0.11% 5.420us 0.11% 5.420us 1.807us 0.000us 0.00% 0.000us 0.000us 3
4019
- cudaLaunchKernel 4.79% 231.213us 4.79% 231.213us 77.071us 0.000us 0.00% 0.000us 0.000us 3
4020
- cudaDeviceSynchronize 55.09% 2.662ms 55.09% 2.662ms 2.662ms 0.000us 0.00% 0.000us 0.000us 1
4021
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4022
- Self CPU time total: 4.831ms
4023
- Self CUDA time total: 2.874ms
4024
 
4025
 
4026
 
@@ -4030,19 +4030,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L448_bfloat16
4030
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4031
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4032
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4033
- hf_kernels_flash_attn3 2.24% 122.153us 41.67% 2.277ms 2.277ms 0.000us 0.00% 4.541ms 4.541ms 1
4034
- FlashAttnFunc 1.69% 92.610us 39.43% 2.155ms 718.395us 0.000us 0.00% 4.541ms 1.514ms 3
4035
- _flash_attn3_1d39a44::fwd 0.86% 47.089us 37.74% 2.063ms 687.525us 3.403ms 100.00% 4.541ms 1.514ms 3
4036
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.404ms 100.05% 3.404ms 3.404ms 1
4037
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.403ms 100.00% 3.403ms 1.134ms 3
4038
- Activity Buffer Request 32.14% 1.757ms 32.14% 1.757ms 1.757ms 1.138ms 33.45% 1.138ms 1.138ms 1
4039
- aten::empty 0.49% 26.951us 0.49% 26.951us 4.492us 0.000us 0.00% 0.000us 0.000us 6
4040
- cudaFuncSetAttribute 0.09% 4.812us 0.09% 4.812us 1.604us 0.000us 0.00% 0.000us 0.000us 3
4041
- cudaLaunchKernel 4.15% 227.044us 4.15% 227.044us 75.681us 0.000us 0.00% 0.000us 0.000us 3
4042
- cudaDeviceSynchronize 58.33% 3.188ms 58.33% 3.188ms 3.188ms 0.000us 0.00% 0.000us 0.000us 1
4043
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4044
- Self CPU time total: 5.465ms
4045
- Self CUDA time total: 3.403ms
4046
 
4047
 
4048
 
@@ -4052,38 +4052,40 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L512_bfloat16
4052
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4053
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4054
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4055
- hf_kernels_flash_attn3 2.06% 111.143us 40.98% 2.214ms 2.214ms 0.000us 0.00% 4.541ms 4.541ms 1
4056
- FlashAttnFunc 1.64% 88.581us 38.92% 2.103ms 700.975us 0.000us 0.00% 4.541ms 1.514ms 3
4057
- _flash_attn3_1d39a44::fwd 0.89% 48.319us 37.28% 2.014ms 671.448us 3.401ms 100.00% 4.541ms 1.514ms 3
4058
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.402ms 100.04% 3.402ms 3.402ms 1
4059
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.401ms 100.00% 3.401ms 1.134ms 3
4060
- Activity Buffer Request 31.65% 1.710ms 31.65% 1.710ms 1.710ms 1.140ms 33.52% 1.140ms 1.140ms 1
4061
- aten::empty 0.48% 25.892us 0.48% 25.892us 4.315us 0.000us 0.00% 0.000us 0.000us 6
4062
- cudaFuncSetAttribute 0.09% 4.710us 0.09% 4.710us 1.570us 0.000us 0.00% 0.000us 0.000us 3
4063
- cudaLaunchKernel 4.17% 225.304us 4.17% 225.304us 75.101us 0.000us 0.00% 0.000us 0.000us 3
4064
- cudaDeviceSynchronize 59.02% 3.189ms 59.02% 3.189ms 3.189ms 0.000us 0.00% 0.000us 0.000us 1
4065
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4066
- Self CPU time total: 5.403ms
4067
- Self CUDA time total: 3.401ms
4068
 
4069
 
4070
  impl wl p50(ms) ok
4071
- hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.92 True
4072
- hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.97 True
4073
- hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.00 True
4074
  hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.01 True
4075
- hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.16 True
4076
  hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.17 True
4077
  </pre></div>
4078
  <div class="uv-install-logs" id="uv-logs-benchmark">
4079
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4080
  <div class="uv-logs-content" style="display: none;">
4081
- Installed 14 packages in 11ms
4082
  </div>
4083
  </div>
4084
- <div class="cell-stderr">Fetching 5 files: 0%| | 0/5 [00:00&lt;?, ?it/s]
4085
- Fetching 5 files: 40%|████ | 2/5 [00:01&lt;00:02, 1.24it/s]
4086
- Fetching 5 files: 100%|██████████| 5/5 [00:01&lt;00:00, 3.09it/s]</div>
 
 
4087
  <div class="cell-artifacts">
4088
  <h4>Artifacts:</h4>
4089
  <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
 
3888
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3890
  </span> |
3891
+ Cell: benchmark | 6.33s
3892
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3894
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3942
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3943
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3944
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3945
+ hf_kernels_flash_attn3 3.59% 165.413us 48.47% 2.234ms 2.234ms 0.000us 0.00% 3.561ms 3.561ms 1
3946
+ FlashAttnFunc 2.69% 124.054us 44.88% 2.069ms 689.509us 0.000us 0.00% 3.561ms 1.187ms 3
3947
+ _flash_attn3_1d39a44::fwd 1.63% 74.991us 42.19% 1.944ms 648.158us 2.673ms 100.00% 3.561ms 1.187ms 3
3948
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.674ms 100.05% 2.674ms 2.674ms 1
3949
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.673ms 100.00% 2.673ms 890.896us 3
3950
+ Activity Buffer Request 38.25% 1.763ms 38.25% 1.763ms 1.763ms 888.250us 33.23% 888.250us 888.250us 1
3951
+ aten::empty 0.95% 43.951us 0.95% 43.951us 7.325us 0.000us 0.00% 0.000us 0.000us 6
3952
+ cudaFuncSetAttribute 0.32% 14.620us 0.32% 14.620us 4.873us 0.000us 0.00% 0.000us 0.000us 3
3953
+ cudaLaunchKernel 1.04% 47.991us 1.04% 47.991us 15.997us 0.000us 0.00% 0.000us 0.000us 3
3954
+ cudaDeviceSynchronize 51.53% 2.375ms 51.53% 2.375ms 2.375ms 0.000us 0.00% 0.000us 0.000us 1
3955
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3956
+ Self CPU time total: 4.609ms
3957
+ Self CUDA time total: 2.673ms
3958
 
3959
 
3960
 
 
3964
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3965
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3966
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3967
+ hf_kernels_flash_attn3 2.68% 124.013us 44.92% 2.080ms 2.080ms 0.000us 0.00% 3.716ms 3.716ms 1
3968
+ FlashAttnFunc 1.96% 90.863us 42.24% 1.956ms 652.078us 0.000us 0.00% 3.716ms 1.239ms 3
3969
+ _flash_attn3_1d39a44::fwd 1.06% 49.109us 40.28% 1.865ms 621.790us 2.770ms 100.00% 3.716ms 1.239ms 3
3970
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.772ms 100.05% 2.772ms 2.772ms 1
3971
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.770ms 100.00% 2.770ms 923.461us 3
3972
+ Activity Buffer Request 37.83% 1.752ms 37.83% 1.752ms 1.752ms 945.210us 34.12% 945.210us 945.210us 1
3973
+ aten::empty 0.60% 27.931us 0.60% 27.931us 4.655us 0.000us 0.00% 0.000us 0.000us 6
3974
+ cudaFuncSetAttribute 0.12% 5.520us 0.12% 5.520us 1.840us 0.000us 0.00% 0.000us 0.000us 3
3975
+ cudaLaunchKernel 0.67% 30.831us 0.67% 30.831us 10.277us 0.000us 0.00% 0.000us 0.000us 3
3976
+ cudaDeviceSynchronize 55.08% 2.551ms 55.08% 2.551ms 2.551ms 0.000us 0.00% 0.000us 0.000us 1
3977
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3978
+ Self CPU time total: 4.631ms
3979
+ Self CUDA time total: 2.770ms
3980
 
3981
 
3982
 
 
3986
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3987
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3988
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3989
+ hf_kernels_flash_attn3 2.68% 125.914us 44.02% 2.072ms 2.072ms 0.000us 0.00% 3.816ms 3.816ms 1
3990
+ FlashAttnFunc 1.89% 89.112us 41.34% 1.946ms 648.608us 0.000us 0.00% 3.816ms 1.272ms 3
3991
+ _flash_attn3_1d39a44::fwd 1.01% 47.500us 39.45% 1.857ms 618.904us 2.847ms 100.00% 3.816ms 1.272ms 3
3992
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.849ms 100.05% 2.849ms 2.849ms 1
3993
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.847ms 100.00% 2.847ms 949.087us 3
3994
+ Activity Buffer Request 37.07% 1.745ms 37.07% 1.745ms 1.745ms 968.895us 34.03% 968.895us 968.895us 1
3995
+ aten::empty 0.58% 27.171us 0.58% 27.171us 4.529us 0.000us 0.00% 0.000us 0.000us 6
3996
+ cudaFuncSetAttribute 0.12% 5.621us 0.12% 5.621us 1.874us 0.000us 0.00% 0.000us 0.000us 3
3997
+ cudaLaunchKernel 0.67% 31.690us 0.67% 31.690us 10.563us 0.000us 0.00% 0.000us 0.000us 3
3998
+ cudaDeviceSynchronize 55.98% 2.635ms 55.98% 2.635ms 2.635ms 0.000us 0.00% 0.000us 0.000us 1
3999
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4000
+ Self CPU time total: 4.706ms
4001
+ Self CUDA time total: 2.847ms
4002
 
4003
 
4004
 
 
4008
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4009
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4010
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4011
+ hf_kernels_flash_attn3 2.55% 127.134us 45.51% 2.268ms 2.268ms 0.000us 0.00% 3.920ms 3.920ms 1
4012
+ FlashAttnFunc 1.80% 89.881us 42.96% 2.141ms 713.505us 0.000us 0.00% 3.920ms 1.307ms 3
4013
+ _flash_attn3_1d39a44::fwd 0.97% 48.541us 41.15% 2.051ms 683.545us 2.930ms 100.00% 3.920ms 1.307ms 3
4014
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.932ms 100.05% 2.932ms 2.932ms 1
4015
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.930ms 100.00% 2.930ms 976.824us 3
4016
+ Activity Buffer Request 35.08% 1.748ms 35.08% 1.748ms 1.748ms 989.112us 33.75% 989.112us 989.112us 1
4017
+ aten::empty 0.54% 27.071us 0.54% 27.071us 4.512us 0.000us 0.00% 0.000us 0.000us 6
4018
+ cudaFuncSetAttribute 0.11% 5.498us 0.11% 5.498us 1.833us 0.000us 0.00% 0.000us 0.000us 3
4019
+ cudaLaunchKernel 4.45% 221.646us 4.45% 221.646us 73.882us 0.000us 0.00% 0.000us 0.000us 3
4020
+ cudaDeviceSynchronize 54.49% 2.715ms 54.49% 2.715ms 2.715ms 0.000us 0.00% 0.000us 0.000us 1
4021
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4022
+ Self CPU time total: 4.983ms
4023
+ Self CUDA time total: 2.930ms
4024
 
4025
 
4026
 
 
4030
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4031
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4032
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4033
+ hf_kernels_flash_attn3 2.34% 128.034us 40.76% 2.227ms 2.227ms 0.000us 0.00% 4.607ms 4.607ms 1
4034
+ FlashAttnFunc 1.67% 91.131us 38.42% 2.098ms 699.492us 0.000us 0.00% 4.607ms 1.536ms 3
4035
+ _flash_attn3_1d39a44::fwd 0.87% 47.661us 36.75% 2.007ms 669.115us 3.452ms 100.00% 4.607ms 1.536ms 3
4036
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.453ms 100.05% 3.453ms 3.453ms 1
4037
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.452ms 100.00% 3.452ms 1.151ms 3
4038
+ Activity Buffer Request 31.93% 1.744ms 31.93% 1.744ms 1.744ms 1.156ms 33.48% 1.156ms 1.156ms 1
4039
+ aten::empty 0.52% 28.231us 0.52% 28.231us 4.705us 0.000us 0.00% 0.000us 0.000us 6
4040
+ cudaFuncSetAttribute 0.10% 5.270us 0.10% 5.270us 1.757us 0.000us 0.00% 0.000us 0.000us 3
4041
+ cudaLaunchKernel 3.33% 181.994us 3.33% 181.994us 60.665us 0.000us 0.00% 0.000us 0.000us 3
4042
+ cudaDeviceSynchronize 59.24% 3.235ms 59.24% 3.235ms 3.235ms 0.000us 0.00% 0.000us 0.000us 1
4043
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4044
+ Self CPU time total: 5.462ms
4045
+ Self CUDA time total: 3.452ms
4046
 
4047
 
4048
 
 
4052
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4053
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4054
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4055
+ hf_kernels_flash_attn3 2.42% 135.303us 41.95% 2.345ms 2.345ms 0.000us 0.00% 4.617ms 4.617ms 1
4056
+ FlashAttnFunc 1.78% 99.322us 39.53% 2.210ms 736.513us 0.000us 0.00% 4.617ms 1.539ms 3
4057
+ _flash_attn3_1d39a44::fwd 0.92% 51.382us 37.75% 2.110ms 703.406us 3.463ms 100.00% 4.617ms 1.539ms 3
4058
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.464ms 100.05% 3.464ms 3.464ms 1
4059
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.463ms 100.00% 3.463ms 1.154ms 3
4060
+ Activity Buffer Request 33.12% 1.851ms 33.12% 1.851ms 1.851ms 1.155ms 33.34% 1.155ms 1.155ms 1
4061
+ aten::empty 0.54% 30.101us 0.54% 30.101us 5.017us 0.000us 0.00% 0.000us 0.000us 6
4062
+ cudaFuncSetAttribute 0.10% 5.430us 0.10% 5.430us 1.810us 0.000us 0.00% 0.000us 0.000us 3
4063
+ cudaLaunchKernel 3.08% 171.953us 3.08% 171.953us 57.318us 0.000us 0.00% 0.000us 0.000us 3
4064
+ cudaDeviceSynchronize 58.05% 3.245ms 58.05% 3.245ms 3.245ms 0.000us 0.00% 0.000us 0.000us 1
4065
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4066
+ Self CPU time total: 5.590ms
4067
+ Self CUDA time total: 3.463ms
4068
 
4069
 
4070
  impl wl p50(ms) ok
4071
+ hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.91 True
4072
+ hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.98 True
4073
+ hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.01 True
4074
  hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.01 True
4075
+ hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.18 True
4076
  hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.17 True
4077
  </pre></div>
4078
  <div class="uv-install-logs" id="uv-logs-benchmark">
4079
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4080
  <div class="uv-logs-content" style="display: none;">
4081
+ Installed 14 packages in 12ms
4082
  </div>
4083
  </div>
4084
+ <div class="cell-stderr">Fetching 5 files: 0%| | 0/5 [00:00&lt;?, ?it/s]Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.
4085
+
4086
+ Fetching 5 files: 20%|██ | 1/5 [00:00&lt;00:01, 3.45it/s]
4087
+ Fetching 5 files: 40%|████ | 2/5 [00:01&lt;00:02, 1.11it/s]
4088
+ Fetching 5 files: 100%|██████████| 5/5 [00:01&lt;00:00, 3.08it/s]</div>
4089
  <div class="cell-artifacts">
4090
  <h4>Artifacts:</h4>
4091
  <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
flash_attn/impls/mem_efficient_attention.html CHANGED
@@ -3886,9 +3886,9 @@ body[data-tool="eraser"] .main-content {
3886
  <span class="collapse-indicators">
3887
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3888
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3889
- <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3890
  </span> |
3891
- Cell: benchmark | 4.15s
3892
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3894
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3941,28 +3941,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L128_bfloat16
3941
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3942
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3943
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3944
- torch_mem_eff 4.11% 302.695us 35.19% 2.592ms 2.592ms 0.000us 0.00% 5.476ms 5.476ms 1
3945
- torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.440ms 100.33% 5.440ms 5.440ms 1
3946
- aten::scaled_dot_product_attention 0.40% 29.210us 2.30% 169.213us 56.404us 0.000us 0.00% 4.805ms 1.602ms 3
3947
- aten::_scaled_dot_product_efficient_attention 0.29% 21.719us 1.90% 140.003us 46.668us 0.000us 0.00% 4.805ms 1.602ms 3
3948
- aten::_efficient_attention_forward 0.48% 35.571us 1.32% 97.242us 32.414us 4.805ms 88.62% 4.805ms 1.602ms 3
3949
- fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 4.805ms 88.62% 4.805ms 1.602ms 3
3950
- aten::contiguous 0.13% 9.829us 27.98% 2.062ms 229.090us 0.000us 0.00% 670.404us 74.489us 9
3951
- aten::clone 0.35% 25.869us 27.85% 2.052ms 227.998us 0.000us 0.00% 670.404us 74.489us 9
3952
- aten::copy_ 0.98% 72.210us 26.54% 1.956ms 217.285us 616.836us 11.38% 670.404us 74.489us 9
3953
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 616.836us 11.38% 616.836us 68.537us 9
3954
- Activity Buffer Request 24.39% 1.797ms 24.39% 1.797ms 1.797ms 53.568us 0.99% 53.568us 53.568us 1
3955
- aten::transpose 0.81% 59.530us 1.08% 79.784us 3.324us 0.000us 0.00% 0.000us 0.000us 24
3956
- aten::as_strided 0.27% 20.254us 0.27% 20.254us 0.844us 0.000us 0.00% 0.000us 0.000us 24
3957
- aten::empty_like 0.20% 14.892us 0.96% 70.554us 7.839us 0.000us 0.00% 0.000us 0.000us 9
3958
- aten::empty 1.12% 82.341us 1.12% 82.341us 3.921us 0.000us 0.00% 0.000us 0.000us 21
3959
- cudaLaunchKernel 1.48% 109.241us 1.48% 109.241us 9.103us 0.000us 0.00% 0.000us 0.000us 12
3960
- cudaStreamIsCapturing 0.04% 3.240us 0.04% 3.240us 1.080us 0.000us 0.00% 0.000us 0.000us 3
3961
- cudaFuncSetAttribute 0.12% 9.162us 0.12% 9.162us 3.054us 0.000us 0.00% 0.000us 0.000us 3
3962
- cudaDeviceSynchronize 64.81% 4.776ms 64.81% 4.776ms 4.776ms 0.000us 0.00% 0.000us 0.000us 1
3963
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3964
- Self CPU time total: 7.368ms
3965
- Self CUDA time total: 5.422ms
3966
 
3967
 
3968
 
@@ -3972,28 +3972,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L256_bfloat16
3972
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3973
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3974
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3975
- torch_mem_eff 3.18% 243.704us 30.16% 2.312ms 2.312ms 0.000us 0.00% 5.946ms 5.946ms 1
3976
- torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.900ms 100.14% 5.900ms 5.900ms 1
3977
- aten::scaled_dot_product_attention 0.23% 17.410us 1.83% 139.893us 46.631us 0.000us 0.00% 5.256ms 1.752ms 3
3978
- aten::_scaled_dot_product_efficient_attention 0.24% 18.330us 1.60% 122.483us 40.828us 0.000us 0.00% 5.256ms 1.752ms 3
3979
- aten::_efficient_attention_forward 0.36% 27.350us 1.07% 81.803us 27.268us 5.256ms 89.21% 5.256ms 1.752ms 3
3980
- fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.256ms 89.21% 5.256ms 1.752ms 3
3981
- aten::contiguous 0.10% 7.470us 24.63% 1.888ms 209.765us 0.000us 0.00% 690.500us 76.722us 9
3982
- aten::clone 0.27% 20.522us 24.53% 1.880ms 208.935us 0.000us 0.00% 690.500us 76.722us 9
3983
- aten::copy_ 0.86% 65.740us 23.60% 1.809ms 200.963us 635.844us 10.79% 690.500us 76.722us 9
3984
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 635.844us 10.79% 635.844us 70.649us 9
3985
- Activity Buffer Request 21.87% 1.676ms 21.87% 1.676ms 1.676ms 54.656us 0.93% 54.656us 54.656us 1
3986
- aten::transpose 0.62% 47.210us 0.82% 62.900us 2.621us 0.000us 0.00% 0.000us 0.000us 24
3987
- aten::as_strided 0.20% 15.690us 0.20% 15.690us 0.654us 0.000us 0.00% 0.000us 0.000us 24
3988
- aten::empty_like 0.16% 11.901us 0.67% 51.221us 5.691us 0.000us 0.00% 0.000us 0.000us 9
3989
- aten::empty 0.85% 65.201us 0.85% 65.201us 3.105us 0.000us 0.00% 0.000us 0.000us 21
3990
- cudaLaunchKernel 1.16% 89.161us 1.16% 89.161us 7.430us 0.000us 0.00% 0.000us 0.000us 12
3991
- cudaStreamIsCapturing 0.03% 2.381us 0.03% 2.381us 0.794us 0.000us 0.00% 0.000us 0.000us 3
3992
- cudaFuncSetAttribute 0.05% 3.881us 0.05% 3.881us 1.294us 0.000us 0.00% 0.000us 0.000us 3
3993
- cudaDeviceSynchronize 69.84% 5.353ms 69.84% 5.353ms 5.353ms 0.000us 0.00% 0.000us 0.000us 1
3994
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3995
- Self CPU time total: 7.665ms
3996
- Self CUDA time total: 5.891ms
3997
 
3998
 
3999
 
@@ -4003,28 +4003,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L320_bfloat16
4003
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4004
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4005
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4006
- torch_mem_eff 3.05% 239.816us 30.60% 2.409ms 2.409ms 0.000us 0.00% 6.068ms 6.068ms 1
4007
- torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.021ms 100.14% 6.021ms 6.021ms 1
4008
- aten::scaled_dot_product_attention 0.23% 17.959us 1.79% 140.600us 46.867us 0.000us 0.00% 5.365ms 1.788ms 3
4009
- aten::_scaled_dot_product_efficient_attention 0.23% 18.141us 1.56% 122.641us 40.880us 0.000us 0.00% 5.365ms 1.788ms 3
4010
- aten::_efficient_attention_forward 0.36% 28.699us 1.04% 81.531us 27.177us 5.365ms 89.24% 5.365ms 1.788ms 3
4011
- fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.365ms 89.24% 5.365ms 1.788ms 3
4012
- aten::contiguous 0.10% 7.861us 25.24% 1.987ms 220.773us 0.000us 0.00% 702.468us 78.052us 9
4013
- aten::clone 0.26% 20.540us 25.14% 1.979ms 219.899us 0.000us 0.00% 702.468us 78.052us 9
4014
- aten::copy_ 0.92% 72.171us 24.24% 1.908ms 212.002us 646.884us 10.76% 702.468us 78.052us 9
4015
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 646.884us 10.76% 646.884us 71.876us 9
4016
- Activity Buffer Request 22.46% 1.768ms 22.46% 1.768ms 1.768ms 55.584us 0.92% 55.584us 55.584us 1
4017
- aten::transpose 0.60% 47.471us 0.81% 64.120us 2.672us 0.000us 0.00% 0.000us 0.000us 24
4018
- aten::as_strided 0.21% 16.649us 0.21% 16.649us 0.694us 0.000us 0.00% 0.000us 0.000us 24
4019
- aten::empty_like 0.15% 11.960us 0.64% 50.531us 5.615us 0.000us 0.00% 0.000us 0.000us 9
4020
- aten::empty 0.81% 63.971us 0.81% 63.971us 3.046us 0.000us 0.00% 0.000us 0.000us 21
4021
- cudaLaunchKernel 1.13% 89.282us 1.13% 89.282us 7.440us 0.000us 0.00% 0.000us 0.000us 12
4022
- cudaStreamIsCapturing 0.03% 2.660us 0.03% 2.660us 0.887us 0.000us 0.00% 0.000us 0.000us 3
4023
- cudaFuncSetAttribute 0.04% 3.150us 0.04% 3.150us 1.050us 0.000us 0.00% 0.000us 0.000us 3
4024
- cudaDeviceSynchronize 69.40% 5.462ms 69.40% 5.462ms 5.462ms 0.000us 0.00% 0.000us 0.000us 1
4025
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4026
- Self CPU time total: 7.871ms
4027
- Self CUDA time total: 6.012ms
4028
 
4029
 
4030
 
@@ -4034,28 +4034,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L384_bfloat16
4034
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4035
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4036
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4037
- torch_mem_eff 2.93% 240.625us 31.13% 2.555ms 2.555ms 0.000us 0.00% 6.259ms 6.259ms 1
4038
- torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.208ms 100.13% 6.208ms 6.208ms 1
4039
- aten::scaled_dot_product_attention 0.21% 17.361us 1.73% 142.203us 47.401us 0.000us 0.00% 5.537ms 1.846ms 3
4040
- aten::_scaled_dot_product_efficient_attention 0.22% 18.441us 1.52% 124.842us 41.614us 0.000us 0.00% 5.537ms 1.846ms 3
4041
- aten::_efficient_attention_forward 0.36% 29.601us 1.03% 84.471us 28.157us 5.537ms 89.30% 5.537ms 1.846ms 3
4042
- fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.537ms 89.30% 5.537ms 1.846ms 3
4043
- aten::contiguous 0.09% 7.769us 25.95% 2.130ms 236.658us 0.000us 0.00% 721.984us 80.220us 9
4044
- aten::clone 0.26% 21.609us 25.85% 2.122ms 235.795us 0.000us 0.00% 721.984us 80.220us 9
4045
- aten::copy_ 0.80% 65.822us 24.94% 2.047ms 227.475us 663.552us 10.70% 721.984us 80.220us 9
4046
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 663.552us 10.70% 663.552us 73.728us 9
4047
- Activity Buffer Request 21.30% 1.749ms 21.30% 1.749ms 1.749ms 58.432us 0.94% 58.432us 58.432us 1
4048
- aten::transpose 0.59% 48.680us 0.78% 64.131us 2.672us 0.000us 0.00% 0.000us 0.000us 24
4049
- aten::as_strided 0.19% 15.451us 0.19% 15.451us 0.644us 0.000us 0.00% 0.000us 0.000us 24
4050
- aten::empty_like 0.15% 12.591us 0.65% 53.271us 5.919us 0.000us 0.00% 0.000us 0.000us 9
4051
- aten::empty 0.81% 66.120us 0.81% 66.120us 3.149us 0.000us 0.00% 0.000us 0.000us 21
4052
- cudaLaunchKernel 3.12% 256.044us 3.12% 256.044us 21.337us 0.000us 0.00% 0.000us 0.000us 12
4053
- cudaStreamIsCapturing 0.03% 2.670us 0.03% 2.670us 0.890us 0.000us 0.00% 0.000us 0.000us 3
4054
- cudaFuncSetAttribute 0.04% 3.480us 0.04% 3.480us 1.160us 0.000us 0.00% 0.000us 0.000us 3
4055
- cudaDeviceSynchronize 68.87% 5.653ms 68.87% 5.653ms 5.653ms 0.000us 0.00% 0.000us 0.000us 1
4056
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4057
- Self CPU time total: 8.208ms
4058
- Self CUDA time total: 6.200ms
4059
 
4060
 
4061
 
@@ -4065,28 +4065,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L448_bfloat16
4065
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4066
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4067
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4068
- torch_mem_eff 2.93% 245.582us 31.52% 2.645ms 2.645ms 0.000us 0.00% 6.354ms 6.354ms 1
4069
- torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.303ms 100.13% 6.303ms 6.303ms 1
4070
- aten::scaled_dot_product_attention 0.20% 17.170us 1.68% 140.693us 46.898us 0.000us 0.00% 5.628ms 1.876ms 3
4071
- aten::_scaled_dot_product_efficient_attention 0.21% 17.520us 1.47% 123.523us 41.174us 0.000us 0.00% 5.628ms 1.876ms 3
4072
- aten::_efficient_attention_forward 0.35% 29.440us 1.00% 84.263us 28.088us 5.628ms 89.41% 5.628ms 1.876ms 3
4073
- fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.628ms 89.41% 5.628ms 1.876ms 3
4074
- aten::contiguous 0.09% 7.259us 26.43% 2.218ms 246.393us 0.000us 0.00% 726.309us 80.701us 9
4075
- aten::clone 0.25% 21.219us 26.34% 2.210ms 245.587us 0.000us 0.00% 726.309us 80.701us 9
4076
- aten::copy_ 0.78% 65.083us 25.46% 2.136ms 237.368us 666.948us 10.59% 726.309us 80.701us 9
4077
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 666.948us 10.59% 666.948us 74.105us 9
4078
- Activity Buffer Request 21.84% 1.833ms 21.84% 1.833ms 1.833ms 59.361us 0.94% 59.361us 59.361us 1
4079
- aten::transpose 0.56% 46.780us 0.75% 62.730us 2.614us 0.000us 0.00% 0.000us 0.000us 24
4080
- aten::as_strided 0.19% 15.950us 0.19% 15.950us 0.665us 0.000us 0.00% 0.000us 0.000us 24
4081
- aten::empty_like 0.14% 11.512us 0.63% 52.753us 5.861us 0.000us 0.00% 0.000us 0.000us 9
4082
- aten::empty 0.79% 66.642us 0.79% 66.642us 3.173us 0.000us 0.00% 0.000us 0.000us 21
4083
- cudaLaunchKernel 3.12% 261.945us 3.12% 261.945us 21.829us 0.000us 0.00% 0.000us 0.000us 12
4084
- cudaStreamIsCapturing 0.03% 2.500us 0.03% 2.500us 0.833us 0.000us 0.00% 0.000us 0.000us 3
4085
- cudaFuncSetAttribute 0.04% 3.581us 0.04% 3.581us 1.194us 0.000us 0.00% 0.000us 0.000us 3
4086
- cudaDeviceSynchronize 68.48% 5.745ms 68.48% 5.745ms 5.745ms 0.000us 0.00% 0.000us 0.000us 1
4087
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4088
- Self CPU time total: 8.390ms
4089
- Self CUDA time total: 6.295ms
4090
 
4091
 
4092
 
@@ -4096,38 +4096,44 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L512_bfloat16
4096
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4097
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4098
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4099
- torch_mem_eff 2.68% 234.298us 28.81% 2.516ms 2.516ms 0.000us 0.00% 6.820ms 6.820ms 1
4100
- torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.768ms 100.12% 6.768ms 6.768ms 1
4101
- aten::scaled_dot_product_attention 0.20% 17.618us 1.61% 140.900us 46.967us 0.000us 0.00% 6.087ms 2.029ms 3
4102
- aten::_scaled_dot_product_efficient_attention 0.21% 18.311us 1.41% 123.282us 41.094us 0.000us 0.00% 6.087ms 2.029ms 3
4103
- aten::_efficient_attention_forward 0.33% 29.191us 0.95% 82.621us 27.540us 6.087ms 90.04% 6.087ms 2.029ms 3
4104
- fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 6.087ms 90.04% 6.087ms 2.029ms 3
4105
- aten::contiguous 0.09% 7.641us 24.06% 2.101ms 233.417us 0.000us 0.00% 733.380us 81.487us 9
4106
- aten::clone 0.23% 20.279us 23.97% 2.093ms 232.568us 0.000us 0.00% 733.380us 81.487us 9
4107
- aten::copy_ 0.74% 64.431us 23.10% 2.017ms 224.097us 672.964us 9.96% 733.380us 81.487us 9
4108
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 672.964us 9.96% 672.964us 74.774us 9
4109
- Activity Buffer Request 19.61% 1.713ms 19.61% 1.713ms 1.713ms 60.416us 0.89% 60.416us 60.416us 1
4110
- aten::transpose 0.53% 46.410us 0.71% 62.109us 2.588us 0.000us 0.00% 0.000us 0.000us 24
4111
- aten::as_strided 0.18% 15.699us 0.18% 15.699us 0.654us 0.000us 0.00% 0.000us 0.000us 24
4112
- aten::empty_like 0.15% 12.751us 0.64% 55.961us 6.218us 0.000us 0.00% 0.000us 0.000us 9
4113
- aten::empty 0.79% 69.050us 0.79% 69.050us 3.288us 0.000us 0.00% 0.000us 0.000us 21
4114
- cudaLaunchKernel 2.99% 261.415us 2.99% 261.415us 21.785us 0.000us 0.00% 0.000us 0.000us 12
4115
- cudaStreamIsCapturing 0.03% 2.920us 0.03% 2.920us 0.973us 0.000us 0.00% 0.000us 0.000us 3
4116
- cudaFuncSetAttribute 0.03% 2.980us 0.03% 2.980us 0.993us 0.000us 0.00% 0.000us 0.000us 3
4117
- cudaDeviceSynchronize 71.19% 6.216ms 71.19% 6.216ms 6.216ms 0.000us 0.00% 0.000us 0.000us 1
4118
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4119
- Self CPU time total: 8.732ms
4120
- Self CUDA time total: 6.759ms
4121
 
4122
 
4123
  impl wl p50(ms) ok
4124
- torch_mem_eff cuda_attn_L128_bfloat16 1.84 True
4125
- torch_mem_eff cuda_attn_L256_bfloat16 1.95 True
4126
- torch_mem_eff cuda_attn_L320_bfloat16 1.97 True
4127
- torch_mem_eff cuda_attn_L384_bfloat16 2.08 True
4128
- torch_mem_eff cuda_attn_L448_bfloat16 2.04 True
4129
- torch_mem_eff cuda_attn_L512_bfloat16 2.25 True
4130
  </pre></div>
 
 
 
 
 
 
4131
  <div class="cell-artifacts">
4132
  <h4>Artifacts:</h4>
4133
  <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
 
3886
  <span class="collapse-indicators">
3887
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3888
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3889
+ <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3890
  </span> |
3891
+ Cell: benchmark | 8.14s
3892
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3894
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3941
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3942
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3943
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3944
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.462ms 101.52% 5.462ms 5.462ms 1
3945
+ torch_mem_eff 4.78% 351.785us 36.36% 2.675ms 2.675ms 0.000us 0.00% 5.434ms 5.434ms 1
3946
+ aten::scaled_dot_product_attention 0.44% 32.361us 3.09% 227.216us 75.739us 0.000us 0.00% 4.760ms 1.587ms 3
3947
+ aten::_scaled_dot_product_efficient_attention 0.32% 23.392us 2.65% 194.855us 64.952us 0.000us 0.00% 4.760ms 1.587ms 3
3948
+ aten::_efficient_attention_forward 0.47% 34.731us 1.98% 145.602us 48.534us 4.760ms 88.47% 4.760ms 1.587ms 3
3949
+ fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 4.760ms 88.47% 4.760ms 1.587ms 3
3950
+ aten::contiguous 0.14% 10.161us 27.51% 2.023ms 224.817us 0.000us 0.00% 673.947us 74.883us 9
3951
+ aten::clone 0.40% 29.063us 27.37% 2.013ms 223.688us 0.000us 0.00% 673.947us 74.883us 9
3952
+ aten::copy_ 1.06% 77.620us 25.90% 1.905ms 211.680us 620.444us 11.53% 673.947us 74.883us 9
3953
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 620.444us 11.53% 620.444us 68.938us 9
3954
+ Activity Buffer Request 23.68% 1.742ms 23.68% 1.742ms 1.742ms 53.503us 0.99% 53.503us 53.503us 1
3955
+ aten::transpose 0.99% 72.964us 1.33% 98.194us 4.091us 0.000us 0.00% 0.000us 0.000us 24
3956
+ aten::as_strided 0.34% 25.230us 0.34% 25.230us 1.051us 0.000us 0.00% 0.000us 0.000us 24
3957
+ aten::empty_like 0.25% 18.168us 1.07% 79.009us 8.779us 0.000us 0.00% 0.000us 0.000us 9
3958
+ aten::empty 1.28% 94.381us 1.28% 94.381us 4.494us 0.000us 0.00% 0.000us 0.000us 21
3959
+ cudaLaunchKernel 1.49% 109.573us 1.49% 109.573us 9.131us 0.000us 0.00% 0.000us 0.000us 12
3960
+ cudaStreamIsCapturing 0.05% 3.660us 0.05% 3.660us 1.220us 0.000us 0.00% 0.000us 0.000us 3
3961
+ cudaFuncSetAttribute 0.67% 49.491us 0.67% 49.491us 16.497us 0.000us 0.00% 0.000us 0.000us 3
3962
+ cudaDeviceSynchronize 63.64% 4.681ms 63.64% 4.681ms 4.681ms 0.000us 0.00% 0.000us 0.000us 1
3963
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3964
+ Self CPU time total: 7.356ms
3965
+ Self CUDA time total: 5.380ms
3966
 
3967
 
3968
 
 
3972
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3973
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3974
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3975
+ torch_mem_eff 2.99% 227.637us 31.17% 2.369ms 2.369ms 0.000us 0.00% 5.835ms 5.835ms 1
3976
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.790ms 100.14% 5.790ms 5.790ms 1
3977
+ aten::scaled_dot_product_attention 0.23% 17.721us 1.87% 142.143us 47.381us 0.000us 0.00% 5.146ms 1.715ms 3
3978
+ aten::_scaled_dot_product_efficient_attention 0.25% 18.819us 1.64% 124.422us 41.474us 0.000us 0.00% 5.146ms 1.715ms 3
3979
+ aten::_efficient_attention_forward 0.37% 28.141us 1.08% 82.262us 27.421us 5.146ms 89.01% 5.146ms 1.715ms 3
3980
+ fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.146ms 89.01% 5.146ms 1.715ms 3
3981
+ aten::contiguous 0.09% 6.739us 25.75% 1.957ms 217.483us 0.000us 0.00% 689.503us 76.611us 9
3982
+ aten::clone 0.27% 20.691us 25.66% 1.951ms 216.734us 0.000us 0.00% 689.503us 76.611us 9
3983
+ aten::copy_ 0.83% 62.851us 24.72% 1.879ms 208.808us 635.680us 10.99% 689.503us 76.611us 9
3984
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 635.680us 10.99% 635.680us 70.631us 9
3985
+ Activity Buffer Request 23.06% 1.753ms 23.06% 1.753ms 1.753ms 53.823us 0.93% 53.823us 53.823us 1
3986
+ aten::transpose 0.63% 47.890us 0.86% 65.431us 2.726us 0.000us 0.00% 0.000us 0.000us 24
3987
+ aten::as_strided 0.23% 17.541us 0.23% 17.541us 0.731us 0.000us 0.00% 0.000us 0.000us 24
3988
+ aten::empty_like 0.15% 11.310us 0.67% 50.641us 5.627us 0.000us 0.00% 0.000us 0.000us 9
3989
+ aten::empty 0.87% 66.232us 0.87% 66.232us 3.154us 0.000us 0.00% 0.000us 0.000us 21
3990
+ cudaLaunchKernel 1.12% 85.492us 1.12% 85.492us 7.124us 0.000us 0.00% 0.000us 0.000us 12
3991
+ cudaStreamIsCapturing 0.03% 2.460us 0.03% 2.460us 0.820us 0.000us 0.00% 0.000us 0.000us 3
3992
+ cudaFuncSetAttribute 0.04% 3.070us 0.04% 3.070us 1.023us 0.000us 0.00% 0.000us 0.000us 3
3993
+ cudaDeviceSynchronize 68.83% 5.232ms 68.83% 5.232ms 5.232ms 0.000us 0.00% 0.000us 0.000us 1
3994
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3995
+ Self CPU time total: 7.601ms
3996
+ Self CUDA time total: 5.782ms
3997
 
3998
 
3999
 
 
4003
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4004
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4005
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4006
+ torch_mem_eff 2.88% 222.044us 30.17% 2.327ms 2.327ms 0.000us 0.00% 5.986ms 5.986ms 1
4007
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.939ms 100.13% 5.939ms 5.939ms 1
4008
+ aten::scaled_dot_product_attention 0.24% 18.710us 1.85% 142.303us 47.434us 0.000us 0.00% 5.284ms 1.761ms 3
4009
+ aten::_scaled_dot_product_efficient_attention 0.25% 19.190us 1.60% 123.593us 41.198us 0.000us 0.00% 5.284ms 1.761ms 3
4010
+ aten::_efficient_attention_forward 0.36% 27.947us 1.05% 81.281us 27.094us 5.284ms 89.10% 5.284ms 1.761ms 3
4011
+ fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.284ms 89.10% 5.284ms 1.761ms 3
4012
+ aten::contiguous 0.09% 7.300us 24.90% 1.920ms 213.350us 0.000us 0.00% 702.238us 78.026us 9
4013
+ aten::clone 0.28% 21.930us 24.80% 1.913ms 212.539us 0.000us 0.00% 702.238us 78.026us 9
4014
+ aten::copy_ 0.79% 60.872us 23.86% 1.840ms 204.449us 646.526us 10.90% 702.238us 78.026us 9
4015
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 646.526us 10.90% 646.526us 71.836us 9
4016
+ Activity Buffer Request 22.23% 1.715ms 22.23% 1.715ms 1.715ms 55.712us 0.94% 55.712us 55.712us 1
4017
+ aten::transpose 0.63% 48.814us 0.85% 65.893us 2.746us 0.000us 0.00% 0.000us 0.000us 24
4018
+ aten::as_strided 0.22% 17.079us 0.22% 17.079us 0.712us 0.000us 0.00% 0.000us 0.000us 24
4019
+ aten::empty_like 0.15% 11.801us 0.66% 50.882us 5.654us 0.000us 0.00% 0.000us 0.000us 9
4020
+ aten::empty 0.85% 65.644us 0.85% 65.644us 3.126us 0.000us 0.00% 0.000us 0.000us 21
4021
+ cudaLaunchKernel 1.11% 85.622us 1.11% 85.622us 7.135us 0.000us 0.00% 0.000us 0.000us 12
4022
+ cudaStreamIsCapturing 0.03% 2.511us 0.03% 2.511us 0.837us 0.000us 0.00% 0.000us 0.000us 3
4023
+ cudaFuncSetAttribute 0.04% 3.110us 0.04% 3.110us 1.037us 0.000us 0.00% 0.000us 0.000us 3
4024
+ cudaDeviceSynchronize 69.83% 5.385ms 69.83% 5.385ms 5.385ms 0.000us 0.00% 0.000us 0.000us 1
4025
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4026
+ Self CPU time total: 7.713ms
4027
+ Self CUDA time total: 5.931ms
4028
 
4029
 
4030
 
 
4034
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4035
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4036
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4037
+ torch_mem_eff 3.05% 248.737us 32.15% 2.620ms 2.620ms 0.000us 0.00% 6.167ms 6.167ms 1
4038
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.117ms 100.13% 6.117ms 6.117ms 1
4039
+ aten::scaled_dot_product_attention 0.24% 19.380us 1.81% 147.173us 49.058us 0.000us 0.00% 5.450ms 1.817ms 3
4040
+ aten::_scaled_dot_product_efficient_attention 0.23% 19.059us 1.57% 127.793us 42.598us 0.000us 0.00% 5.450ms 1.817ms 3
4041
+ aten::_efficient_attention_forward 0.34% 28.111us 1.04% 84.373us 28.124us 5.450ms 89.21% 5.450ms 1.817ms 3
4042
+ fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.450ms 89.21% 5.450ms 1.817ms 3
4043
+ aten::contiguous 0.09% 7.070us 26.79% 2.183ms 242.545us 0.000us 0.00% 717.472us 79.719us 9
4044
+ aten::clone 0.26% 21.211us 26.70% 2.176ms 241.760us 0.000us 0.00% 717.472us 79.719us 9
4045
+ aten::copy_ 0.77% 62.427us 25.76% 2.100ms 233.287us 658.976us 10.79% 717.472us 79.719us 9
4046
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 658.976us 10.79% 658.976us 73.220us 9
4047
+ Activity Buffer Request 21.68% 1.767ms 21.68% 1.767ms 1.767ms 58.496us 0.96% 58.496us 58.496us 1
4048
+ aten::transpose 0.59% 47.765us 0.81% 65.883us 2.745us 0.000us 0.00% 0.000us 0.000us 24
4049
+ aten::as_strided 0.22% 18.118us 0.22% 18.118us 0.755us 0.000us 0.00% 0.000us 0.000us 24
4050
+ aten::empty_like 0.14% 11.420us 0.68% 55.041us 6.116us 0.000us 0.00% 0.000us 0.000us 9
4051
+ aten::empty 0.87% 71.281us 0.87% 71.281us 3.394us 0.000us 0.00% 0.000us 0.000us 21
4052
+ cudaLaunchKernel 3.59% 292.889us 3.59% 292.889us 24.407us 0.000us 0.00% 0.000us 0.000us 12
4053
+ cudaStreamIsCapturing 0.03% 2.781us 0.03% 2.781us 0.927us 0.000us 0.00% 0.000us 0.000us 3
4054
+ cudaFuncSetAttribute 0.04% 3.020us 0.04% 3.020us 1.007us 0.000us 0.00% 0.000us 0.000us 3
4055
+ cudaDeviceSynchronize 67.85% 5.529ms 67.85% 5.529ms 5.529ms 0.000us 0.00% 0.000us 0.000us 1
4056
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4057
+ Self CPU time total: 8.150ms
4058
+ Self CUDA time total: 6.109ms
4059
 
4060
 
4061
 
 
4065
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4066
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4067
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4068
+ torch_mem_eff 2.74% 222.904us 29.02% 2.363ms 2.363ms 0.000us 0.00% 6.392ms 6.392ms 1
4069
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.341ms 100.13% 6.341ms 6.341ms 1
4070
+ aten::scaled_dot_product_attention 0.23% 18.463us 1.76% 143.054us 47.685us 0.000us 0.00% 5.664ms 1.888ms 3
4071
+ aten::_scaled_dot_product_efficient_attention 0.23% 18.699us 1.53% 124.591us 41.530us 0.000us 0.00% 5.664ms 1.888ms 3
4072
+ aten::_efficient_attention_forward 0.35% 28.650us 1.01% 82.071us 27.357us 5.664ms 89.43% 5.664ms 1.888ms 3
4073
+ fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.664ms 89.43% 5.664ms 1.888ms 3
4074
+ aten::contiguous 0.09% 7.480us 24.00% 1.954ms 217.122us 0.000us 0.00% 727.838us 80.871us 9
4075
+ aten::clone 0.26% 21.231us 23.90% 1.947ms 216.290us 0.000us 0.00% 727.838us 80.871us 9
4076
+ aten::copy_ 0.78% 63.523us 23.01% 1.874ms 208.176us 669.182us 10.57% 727.838us 80.871us 9
4077
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 669.182us 10.57% 669.182us 74.354us 9
4078
+ Activity Buffer Request 19.19% 1.562ms 19.19% 1.562ms 1.562ms 58.656us 0.93% 58.656us 58.656us 1
4079
+ aten::transpose 0.60% 48.754us 0.82% 66.672us 2.778us 0.000us 0.00% 0.000us 0.000us 24
4080
+ aten::as_strided 0.22% 17.918us 0.22% 17.918us 0.747us 0.000us 0.00% 0.000us 0.000us 24
4081
+ aten::empty_like 0.14% 11.269us 0.64% 51.800us 5.756us 0.000us 0.00% 0.000us 0.000us 9
4082
+ aten::empty 0.81% 66.291us 0.81% 66.291us 3.157us 0.000us 0.00% 0.000us 0.000us 21
4083
+ cudaLaunchKernel 3.31% 269.756us 3.31% 269.756us 22.480us 0.000us 0.00% 0.000us 0.000us 12
4084
+ cudaStreamIsCapturing 0.03% 2.590us 0.03% 2.590us 0.863us 0.000us 0.00% 0.000us 0.000us 3
4085
+ cudaFuncSetAttribute 0.04% 2.940us 0.04% 2.940us 0.980us 0.000us 0.00% 0.000us 0.000us 3
4086
+ cudaDeviceSynchronize 70.98% 5.781ms 70.98% 5.781ms 5.781ms 0.000us 0.00% 0.000us 0.000us 1
4087
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4088
+ Self CPU time total: 8.144ms
4089
+ Self CUDA time total: 6.333ms
4090
 
4091
 
4092
 
 
4096
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4097
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4098
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4099
+ torch_mem_eff 2.91% 254.056us 31.19% 2.722ms 2.722ms 0.000us 0.00% 6.645ms 6.645ms 1
4100
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.592ms 100.12% 6.592ms 6.592ms 1
4101
+ aten::scaled_dot_product_attention 0.23% 20.440us 1.69% 147.533us 49.178us 0.000us 0.00% 5.910ms 1.970ms 3
4102
+ aten::_scaled_dot_product_efficient_attention 0.22% 19.250us 1.46% 127.093us 42.364us 0.000us 0.00% 5.910ms 1.970ms 3
4103
+ aten::_efficient_attention_forward 0.33% 28.899us 0.98% 85.242us 28.414us 5.910ms 89.76% 5.910ms 1.970ms 3
4104
+ fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.910ms 89.76% 5.910ms 1.970ms 3
4105
+ aten::contiguous 0.08% 7.268us 26.04% 2.272ms 252.404us 0.000us 0.00% 734.815us 81.646us 9
4106
+ aten::clone 0.28% 24.054us 25.95% 2.264ms 251.596us 0.000us 0.00% 734.815us 81.646us 9
4107
+ aten::copy_ 0.77% 66.891us 25.04% 2.185ms 242.745us 674.239us 10.24% 734.815us 81.646us 9
4108
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 674.239us 10.24% 674.239us 74.915us 9
4109
+ Activity Buffer Request 20.22% 1.764ms 20.22% 1.764ms 1.764ms 60.576us 0.92% 60.576us 60.576us 1
4110
+ aten::transpose 0.62% 53.860us 0.81% 70.972us 2.957us 0.000us 0.00% 0.000us 0.000us 24
4111
+ aten::as_strided 0.20% 17.112us 0.20% 17.112us 0.713us 0.000us 0.00% 0.000us 0.000us 24
4112
+ aten::empty_like 0.15% 12.910us 0.64% 55.601us 6.178us 0.000us 0.00% 0.000us 0.000us 9
4113
+ aten::empty 0.82% 71.503us 0.82% 71.503us 3.405us 0.000us 0.00% 0.000us 0.000us 21
4114
+ cudaLaunchKernel 4.30% 375.338us 4.30% 375.338us 31.278us 0.000us 0.00% 0.000us 0.000us 12
4115
+ cudaStreamIsCapturing 0.03% 2.571us 0.03% 2.571us 0.857us 0.000us 0.00% 0.000us 0.000us 3
4116
+ cudaFuncSetAttribute 0.03% 3.000us 0.03% 3.000us 1.000us 0.000us 0.00% 0.000us 0.000us 3
4117
+ cudaDeviceSynchronize 68.81% 6.003ms 68.81% 6.003ms 6.003ms 0.000us 0.00% 0.000us 0.000us 1
4118
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4119
+ Self CPU time total: 8.725ms
4120
+ Self CUDA time total: 6.584ms
4121
 
4122
 
4123
  impl wl p50(ms) ok
4124
+ torch_mem_eff cuda_attn_L128_bfloat16 1.83 True
4125
+ torch_mem_eff cuda_attn_L256_bfloat16 1.93 True
4126
+ torch_mem_eff cuda_attn_L320_bfloat16 1.95 True
4127
+ torch_mem_eff cuda_attn_L384_bfloat16 2.04 True
4128
+ torch_mem_eff cuda_attn_L448_bfloat16 2.08 True
4129
+ torch_mem_eff cuda_attn_L512_bfloat16 2.17 True
4130
  </pre></div>
4131
+ <div class="uv-install-logs" id="uv-logs-benchmark">
4132
+ <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4133
+ <div class="uv-logs-content" style="display: none;">
4134
+ Installed 37 packages in 340ms
4135
+ </div>
4136
+ </div>
4137
  <div class="cell-artifacts">
4138
  <h4>Artifacts:</h4>
4139
  <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
flash_attn/impls/sage_attention.html CHANGED
@@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content {
3888
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3890
  </span> |
3891
- Cell: benchmark | 4.58s
3892
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3894
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3938,22 +3938,24 @@ Cell: benchmark | 4.58s
3938
  <div class="cell-stdout"><pre class="stdout-text">Running attention benchmark on cuda with 6 workloads.
3939
  impl wl p50(ms) ok
3940
  sage_int8_fp16 cuda_attn_L128_bfloat16 FAIL False
3941
- Error: module &#x27;sage_attention_e8dcde4226fe38e6&#x27; has no attribute &#x27;fwd&#x27;
3942
  sage_int8_fp16 cuda_attn_L256_bfloat16 FAIL False
3943
- Error: module &#x27;sage_attention_e8dcde4226fe38e6&#x27; has no attribute &#x27;fwd&#x27;
3944
  sage_int8_fp16 cuda_attn_L320_bfloat16 FAIL False
3945
- Error: module &#x27;sage_attention_e8dcde4226fe38e6&#x27; has no attribute &#x27;fwd&#x27;
3946
  sage_int8_fp16 cuda_attn_L384_bfloat16 FAIL False
3947
- Error: module &#x27;sage_attention_e8dcde4226fe38e6&#x27; has no attribute &#x27;fwd&#x27;
3948
  sage_int8_fp16 cuda_attn_L448_bfloat16 FAIL False
3949
- Error: module &#x27;sage_attention_e8dcde4226fe38e6&#x27; has no attribute &#x27;fwd&#x27;
3950
  sage_int8_fp16 cuda_attn_L512_bfloat16 FAIL False
3951
- Error: module &#x27;sage_attention_e8dcde4226fe38e6&#x27; has no attribute &#x27;fwd&#x27;
3952
  </pre></div>
3953
  <div class="cell-stderr">
3954
- Fetching 8 files: 0%| | 0/8 [00:00&lt;?, ?it/s]
3955
- Fetching 8 files: 38%|███▊ | 3/8 [00:00&lt;00:00, 5.88it/s]
3956
- Fetching 8 files: 100%|██████████| 8/8 [00:00&lt;00:00, 15.67it/s]
 
 
3957
  </div>
3958
  <div class="cell-artifacts">
3959
  <h4>Artifacts:</h4>
 
3888
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3890
  </span> |
3891
+ Cell: benchmark | 4.72s
3892
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3894
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3938
  <div class="cell-stdout"><pre class="stdout-text">Running attention benchmark on cuda with 6 workloads.
3939
  impl wl p50(ms) ok
3940
  sage_int8_fp16 cuda_attn_L128_bfloat16 FAIL False
3941
+ Error: module &#x27;sage_attention_d202bc414c936d8&#x27; has no attribute &#x27;fwd&#x27;
3942
  sage_int8_fp16 cuda_attn_L256_bfloat16 FAIL False
3943
+ Error: module &#x27;sage_attention_d202bc414c936d8&#x27; has no attribute &#x27;fwd&#x27;
3944
  sage_int8_fp16 cuda_attn_L320_bfloat16 FAIL False
3945
+ Error: module &#x27;sage_attention_d202bc414c936d8&#x27; has no attribute &#x27;fwd&#x27;
3946
  sage_int8_fp16 cuda_attn_L384_bfloat16 FAIL False
3947
+ Error: module &#x27;sage_attention_d202bc414c936d8&#x27; has no attribute &#x27;fwd&#x27;
3948
  sage_int8_fp16 cuda_attn_L448_bfloat16 FAIL False
3949
+ Error: module &#x27;sage_attention_d202bc414c936d8&#x27; has no attribute &#x27;fwd&#x27;
3950
  sage_int8_fp16 cuda_attn_L512_bfloat16 FAIL False
3951
+ Error: module &#x27;sage_attention_d202bc414c936d8&#x27; has no attribute &#x27;fwd&#x27;
3952
  </pre></div>
3953
  <div class="cell-stderr">
3954
+ Fetching 8 files: 0%| | 0/8 [00:00&lt;?, ?it/s]Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.
3955
+
3956
+ Fetching 8 files: 12%|█▎ | 1/8 [00:00&lt;00:00, 7.67it/s]
3957
+ Fetching 8 files: 38%|███▊ | 3/8 [00:00&lt;00:01, 3.86it/s]
3958
+ Fetching 8 files: 100%|██████████| 8/8 [00:00&lt;00:00, 10.82it/s]
3959
  </div>
3960
  <div class="cell-artifacts">
3961
  <h4>Artifacts:</h4>
flash_attn/impls/xformers.html CHANGED
@@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content {
3888
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3890
  </span> |
3891
- Cell: benchmark | 8.92s
3892
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3894
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3940,21 +3940,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L128_bfloat16
3940
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3941
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3942
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3943
- xformers_meff 9.64% 463.299us 53.77% 2.584ms 2.584ms 0.000us 0.00% 3.636ms 3.636ms 1
3944
- xformers_flash3::flash_fwd 3.92% 188.192us 43.38% 2.085ms 694.978us 0.000us 0.00% 3.636ms 1.212ms 3
3945
- flash_attn_3::fwd 1.40% 67.082us 39.46% 1.897ms 632.248us 2.748ms 100.00% 3.636ms 1.212ms 3
3946
- xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.749ms 100.05% 2.749ms 2.749ms 1
3947
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.748ms 100.00% 2.748ms 915.935us 3
3948
- Activity Buffer Request 36.10% 1.735ms 36.10% 1.735ms 1.735ms 887.807us 32.31% 887.807us 887.807us 1
3949
- aten::empty 0.82% 39.381us 0.82% 39.381us 6.563us 0.000us 0.00% 0.000us 0.000us 6
3950
- cudaFuncSetAttribute 0.26% 12.540us 0.26% 12.540us 4.180us 0.000us 0.00% 0.000us 0.000us 3
3951
- cudaLaunchKernel 0.88% 42.510us 0.88% 42.510us 14.170us 0.000us 0.00% 0.000us 0.000us 3
3952
- aten::reshape 0.25% 12.121us 0.75% 35.870us 5.978us 0.000us 0.00% 0.000us 0.000us 6
3953
- aten::view 0.49% 23.749us 0.49% 23.749us 3.958us 0.000us 0.00% 0.000us 0.000us 6
3954
- cudaDeviceSynchronize 46.23% 2.222ms 46.23% 2.222ms 2.222ms 0.000us 0.00% 0.000us 0.000us 1
3955
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3956
- Self CPU time total: 4.806ms
3957
- Self CUDA time total: 2.748ms
3958
 
3959
 
3960
 
@@ -3964,21 +3964,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L256_bfloat16
3964
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3965
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3966
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3967
- xformers_meff 6.94% 327.436us 51.65% 2.436ms 2.436ms 0.000us 0.00% 3.659ms 3.659ms 1
3968
- xformers_flash3::flash_fwd 3.29% 155.063us 44.22% 2.085ms 695.085us 0.000us 0.00% 3.659ms 1.220ms 3
3969
- flash_attn_3::fwd 1.15% 54.292us 40.93% 1.930ms 643.398us 2.737ms 100.00% 3.659ms 1.220ms 3
3970
- xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.738ms 100.05% 2.738ms 2.738ms 1
3971
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.737ms 100.00% 2.737ms 912.235us 3
3972
- Activity Buffer Request 38.21% 1.802ms 38.21% 1.802ms 1.802ms 922.336us 33.70% 922.336us 922.336us 1
3973
- aten::empty 0.70% 32.930us 0.70% 32.930us 5.488us 0.000us 0.00% 0.000us 0.000us 6
3974
- cudaFuncSetAttribute 0.12% 5.760us 0.12% 5.760us 1.920us 0.000us 0.00% 0.000us 0.000us 3
3975
- cudaLaunchKernel 0.75% 35.410us 0.75% 35.410us 11.803us 0.000us 0.00% 0.000us 0.000us 3
3976
- aten::reshape 0.20% 9.409us 0.49% 22.989us 3.831us 0.000us 0.00% 0.000us 0.000us 6
3977
- aten::view 0.29% 13.580us 0.29% 13.580us 2.263us 0.000us 0.00% 0.000us 0.000us 6
3978
- cudaDeviceSynchronize 48.35% 2.280ms 48.35% 2.280ms 2.280ms 0.000us 0.00% 0.000us 0.000us 1
3979
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3980
- Self CPU time total: 4.715ms
3981
- Self CUDA time total: 2.737ms
3982
 
3983
 
3984
 
@@ -3988,21 +3988,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L320_bfloat16
3988
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3989
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3990
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3991
- xformers_meff 6.35% 296.325us 47.95% 2.238ms 2.238ms 0.000us 0.00% 3.787ms 3.787ms 1
3992
- xformers_flash3::flash_fwd 2.95% 137.473us 41.12% 1.919ms 639.648us 0.000us 0.00% 3.787ms 1.262ms 3
3993
- flash_attn_3::fwd 1.09% 50.850us 38.17% 1.781ms 593.823us 2.829ms 100.00% 3.787ms 1.262ms 3
3994
- xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.831ms 100.05% 2.831ms 2.831ms 1
3995
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.829ms 100.00% 2.829ms 943.127us 3
3996
- Activity Buffer Request 35.64% 1.663ms 35.64% 1.663ms 1.663ms 957.186us 33.83% 957.186us 957.186us 1
3997
- aten::empty 0.63% 29.301us 0.63% 29.301us 4.884us 0.000us 0.00% 0.000us 0.000us 6
3998
- cudaFuncSetAttribute 0.11% 5.090us 0.11% 5.090us 1.697us 0.000us 0.00% 0.000us 0.000us 3
3999
- cudaLaunchKernel 0.71% 33.151us 0.71% 33.151us 11.050us 0.000us 0.00% 0.000us 0.000us 3
4000
- aten::reshape 0.18% 8.531us 0.48% 22.580us 3.763us 0.000us 0.00% 0.000us 0.000us 6
4001
- aten::view 0.30% 14.049us 0.30% 14.049us 2.341us 0.000us 0.00% 0.000us 0.000us 6
4002
- cudaDeviceSynchronize 52.05% 2.429ms 52.05% 2.429ms 2.429ms 0.000us 0.00% 0.000us 0.000us 1
4003
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4004
- Self CPU time total: 4.667ms
4005
- Self CUDA time total: 2.829ms
4006
 
4007
 
4008
 
@@ -4012,21 +4012,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L384_bfloat16
4012
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4013
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4014
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4015
- xformers_meff 6.11% 304.138us 50.43% 2.511ms 2.511ms 0.000us 0.00% 3.860ms 3.860ms 1
4016
- xformers_flash3::flash_fwd 3.07% 152.860us 43.87% 2.184ms 727.989us 0.000us 0.00% 3.860ms 1.287ms 3
4017
- flash_attn_3::fwd 1.07% 53.395us 40.80% 2.031ms 677.035us 2.883ms 100.00% 3.860ms 1.287ms 3
4018
- xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.885ms 100.05% 2.885ms 2.885ms 1
4019
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.883ms 100.00% 2.883ms 961.001us 3
4020
- Activity Buffer Request 34.97% 1.741ms 34.97% 1.741ms 1.741ms 977.086us 33.89% 977.086us 977.086us 1
4021
- aten::empty 0.66% 32.699us 0.66% 32.699us 5.450us 0.000us 0.00% 0.000us 0.000us 6
4022
- cudaFuncSetAttribute 0.12% 6.109us 0.12% 6.109us 2.036us 0.000us 0.00% 0.000us 0.000us 3
4023
- cudaLaunchKernel 3.98% 197.963us 3.98% 197.963us 65.988us 0.000us 0.00% 0.000us 0.000us 3
4024
- aten::reshape 0.17% 8.489us 0.45% 22.539us 3.757us 0.000us 0.00% 0.000us 0.000us 6
4025
- aten::view 0.28% 14.050us 0.28% 14.050us 2.342us 0.000us 0.00% 0.000us 0.000us 6
4026
- cudaDeviceSynchronize 49.57% 2.468ms 49.57% 2.468ms 2.468ms 0.000us 0.00% 0.000us 0.000us 1
4027
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4028
- Self CPU time total: 4.978ms
4029
- Self CUDA time total: 2.883ms
4030
 
4031
 
4032
 
@@ -4036,21 +4036,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L448_bfloat16
4036
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4037
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4038
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4039
- xformers_meff 5.45% 299.105us 45.26% 2.482ms 2.482ms 0.000us 0.00% 4.556ms 4.556ms 1
4040
- xformers_flash3::flash_fwd 2.57% 140.761us 39.42% 2.162ms 720.685us 0.000us 0.00% 4.556ms 1.519ms 3
4041
- flash_attn_3::fwd 0.92% 50.555us 36.85% 2.021ms 673.765us 3.406ms 100.00% 4.556ms 1.519ms 3
4042
- xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.408ms 100.05% 3.408ms 3.408ms 1
4043
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.406ms 100.00% 3.406ms 1.135ms 3
4044
- Activity Buffer Request 31.74% 1.741ms 31.74% 1.741ms 1.741ms 1.150ms 33.76% 1.150ms 1.150ms 1
4045
- aten::empty 0.52% 28.258us 0.52% 28.258us 4.710us 0.000us 0.00% 0.000us 0.000us 6
4046
- cudaFuncSetAttribute 0.10% 5.340us 0.10% 5.340us 1.780us 0.000us 0.00% 0.000us 0.000us 3
4047
- cudaLaunchKernel 3.58% 196.453us 3.58% 196.453us 65.484us 0.000us 0.00% 0.000us 0.000us 3
4048
- aten::reshape 0.14% 7.863us 0.39% 21.181us 3.530us 0.000us 0.00% 0.000us 0.000us 6
4049
- aten::view 0.24% 13.318us 0.24% 13.318us 2.220us 0.000us 0.00% 0.000us 0.000us 6
4050
- cudaDeviceSynchronize 54.74% 3.003ms 54.74% 3.003ms 3.003ms 0.000us 0.00% 0.000us 0.000us 1
4051
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4052
- Self CPU time total: 5.485ms
4053
- Self CUDA time total: 3.406ms
4054
 
4055
 
4056
 
@@ -4060,37 +4060,37 @@ PROFILE TRACE: xformers_meff | cuda_attn_L512_bfloat16
4060
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4061
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4062
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4063
- xformers_meff 5.08% 273.484us 44.98% 2.423ms 2.423ms 0.000us 0.00% 4.494ms 4.494ms 1
4064
- xformers_flash3::flash_fwd 2.55% 137.253us 39.52% 2.129ms 709.536us 0.000us 0.00% 4.494ms 1.498ms 3
4065
- flash_attn_3::fwd 0.94% 50.440us 36.97% 1.991ms 663.785us 3.366ms 100.00% 4.494ms 1.498ms 3
4066
- xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.368ms 100.05% 3.368ms 3.368ms 1
4067
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.366ms 100.00% 3.366ms 1.122ms 3
4068
- Activity Buffer Request 31.81% 1.713ms 31.81% 1.713ms 1.713ms 1.127ms 33.48% 1.127ms 1.127ms 1
4069
- aten::empty 0.56% 30.302us 0.56% 30.302us 5.050us 0.000us 0.00% 0.000us 0.000us 6
4070
- cudaFuncSetAttribute 0.10% 5.300us 0.10% 5.300us 1.767us 0.000us 0.00% 0.000us 0.000us 3
4071
- cudaLaunchKernel 3.56% 191.983us 3.56% 191.983us 63.994us 0.000us 0.00% 0.000us 0.000us 3
4072
- aten::reshape 0.15% 8.029us 0.39% 20.930us 3.488us 0.000us 0.00% 0.000us 0.000us 6
4073
- aten::view 0.24% 12.901us 0.24% 12.901us 2.150us 0.000us 0.00% 0.000us 0.000us 6
4074
- cudaDeviceSynchronize 55.02% 2.964ms 55.02% 2.964ms 2.964ms 0.000us 0.00% 0.000us 0.000us 1
4075
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4076
- Self CPU time total: 5.387ms
4077
- Self CUDA time total: 3.366ms
4078
 
4079
 
4080
  impl wl p50(ms) ok
4081
  xformers_meff cuda_attn_L128_bfloat16 0.98 True
4082
- xformers_meff cuda_attn_L256_bfloat16 1.03 True
4083
  xformers_meff cuda_attn_L320_bfloat16 1.06 True
4084
- xformers_meff cuda_attn_L384_bfloat16 1.06 True
4085
- xformers_meff cuda_attn_L448_bfloat16 1.25 True
4086
- xformers_meff cuda_attn_L512_bfloat16 1.23 True
4087
  </pre></div>
4088
  <div class="uv-install-logs" id="uv-logs-benchmark">
4089
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4090
  <div class="uv-logs-content" style="display: none;">
4091
  Downloading xformers (111.8MiB)
4092
  Downloaded xformers
4093
- Installed 38 packages in 217ms
4094
  </div>
4095
  </div>
4096
  <div class="cell-artifacts">
 
3888
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3890
  </span> |
3891
+ Cell: benchmark | 5.49s
3892
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3894
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3940
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3941
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3942
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3943
+ xformers_meff 9.60% 449.460us 54.45% 2.550ms 2.550ms 0.000us 0.00% 3.540ms 3.540ms 1
3944
+ xformers_flash3::flash_fwd 4.00% 187.356us 44.14% 2.067ms 689.137us 0.000us 0.00% 3.540ms 1.180ms 3
3945
+ flash_attn_3::fwd 1.48% 69.234us 40.14% 1.880ms 626.685us 2.646ms 100.00% 3.540ms 1.180ms 3
3946
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.648ms 100.06% 2.648ms 2.648ms 1
3947
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.646ms 100.00% 2.646ms 882.010us 3
3948
+ Activity Buffer Request 36.74% 1.721ms 36.74% 1.721ms 1.721ms 894.309us 33.80% 894.309us 894.309us 1
3949
+ aten::empty 0.73% 34.410us 0.73% 34.410us 5.735us 0.000us 0.00% 0.000us 0.000us 6
3950
+ cudaFuncSetAttribute 0.25% 11.780us 0.25% 11.780us 3.927us 0.000us 0.00% 0.000us 0.000us 3
3951
+ cudaLaunchKernel 0.93% 43.670us 0.93% 43.670us 14.557us 0.000us 0.00% 0.000us 0.000us 3
3952
+ aten::reshape 0.24% 11.301us 0.72% 33.571us 5.595us 0.000us 0.00% 0.000us 0.000us 6
3953
+ aten::view 0.48% 22.270us 0.48% 22.270us 3.712us 0.000us 0.00% 0.000us 0.000us 6
3954
+ cudaDeviceSynchronize 45.55% 2.133ms 45.55% 2.133ms 2.133ms 0.000us 0.00% 0.000us 0.000us 1
3955
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3956
+ Self CPU time total: 4.684ms
3957
+ Self CUDA time total: 2.646ms
3958
 
3959
 
3960
 
 
3964
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3965
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3966
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3967
+ xformers_meff 6.53% 314.780us 50.80% 2.448ms 2.448ms 0.000us 0.00% 3.745ms 3.745ms 1
3968
+ xformers_flash3::flash_fwd 2.99% 144.051us 43.78% 2.110ms 703.226us 0.000us 0.00% 3.745ms 1.248ms 3
3969
+ flash_attn_3::fwd 1.06% 51.161us 40.79% 1.966ms 655.209us 2.793ms 100.00% 3.745ms 1.248ms 3
3970
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.795ms 100.06% 2.795ms 2.795ms 1
3971
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.793ms 100.00% 2.793ms 931.037us 3
3972
+ Activity Buffer Request 38.27% 1.844ms 38.27% 1.844ms 1.844ms 952.158us 34.09% 952.158us 952.158us 1
3973
+ aten::empty 0.59% 28.641us 0.59% 28.641us 4.774us 0.000us 0.00% 0.000us 0.000us 6
3974
+ cudaFuncSetAttribute 0.11% 5.380us 0.11% 5.380us 1.793us 0.000us 0.00% 0.000us 0.000us 3
3975
+ cudaLaunchKernel 0.75% 36.051us 0.75% 36.051us 12.017us 0.000us 0.00% 0.000us 0.000us 3
3976
+ aten::reshape 0.19% 9.170us 0.49% 23.510us 3.918us 0.000us 0.00% 0.000us 0.000us 6
3977
+ aten::view 0.30% 14.340us 0.30% 14.340us 2.390us 0.000us 0.00% 0.000us 0.000us 6
3978
+ cudaDeviceSynchronize 49.20% 2.371ms 49.20% 2.371ms 2.371ms 0.000us 0.00% 0.000us 0.000us 1
3979
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3980
+ Self CPU time total: 4.819ms
3981
+ Self CUDA time total: 2.793ms
3982
 
3983
 
3984
 
 
3988
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3989
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3990
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3991
+ xformers_meff 6.41% 306.378us 48.23% 2.306ms 2.306ms 0.000us 0.00% 3.879ms 3.879ms 1
3992
+ xformers_flash3::flash_fwd 2.97% 141.954us 41.36% 1.977ms 659.046us 0.000us 0.00% 3.879ms 1.293ms 3
3993
+ flash_attn_3::fwd 1.09% 51.910us 38.39% 1.835ms 611.728us 2.892ms 100.00% 3.879ms 1.293ms 3
3994
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.893ms 100.06% 2.893ms 2.893ms 1
3995
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.892ms 100.00% 2.892ms 963.882us 3
3996
+ Activity Buffer Request 35.83% 1.713ms 35.83% 1.713ms 1.713ms 986.975us 34.13% 986.975us 986.975us 1
3997
+ aten::empty 0.60% 28.840us 0.60% 28.840us 4.807us 0.000us 0.00% 0.000us 0.000us 6
3998
+ cudaFuncSetAttribute 0.11% 5.330us 0.11% 5.330us 1.777us 0.000us 0.00% 0.000us 0.000us 3
3999
+ cudaLaunchKernel 0.75% 36.082us 0.75% 36.082us 12.027us 0.000us 0.00% 0.000us 0.000us 3
4000
+ aten::reshape 0.17% 8.059us 0.47% 22.400us 3.733us 0.000us 0.00% 0.000us 0.000us 6
4001
+ aten::view 0.30% 14.341us 0.30% 14.341us 2.390us 0.000us 0.00% 0.000us 0.000us 6
4002
+ cudaDeviceSynchronize 51.77% 2.475ms 51.77% 2.475ms 2.475ms 0.000us 0.00% 0.000us 0.000us 1
4003
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4004
+ Self CPU time total: 4.781ms
4005
+ Self CUDA time total: 2.892ms
4006
 
4007
 
4008
 
 
4012
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4013
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4014
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4015
+ xformers_meff 6.14% 305.279us 50.00% 2.487ms 2.487ms 0.000us 0.00% 3.889ms 3.889ms 1
4016
+ xformers_flash3::flash_fwd 2.94% 146.052us 43.42% 2.159ms 719.674us 0.000us 0.00% 3.889ms 1.296ms 3
4017
+ flash_attn_3::fwd 1.05% 52.012us 40.48% 2.013ms 670.990us 2.906ms 100.00% 3.889ms 1.296ms 3
4018
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.907ms 100.06% 2.907ms 2.907ms 1
4019
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.906ms 100.00% 2.906ms 968.605us 3
4020
+ Activity Buffer Request 34.76% 1.728ms 34.76% 1.728ms 1.728ms 983.453us 33.84% 983.453us 983.453us 1
4021
+ aten::empty 0.63% 31.322us 0.63% 31.322us 5.220us 0.000us 0.00% 0.000us 0.000us 6
4022
+ cudaFuncSetAttribute 0.11% 5.389us 0.11% 5.389us 1.796us 0.000us 0.00% 0.000us 0.000us 3
4023
+ cudaLaunchKernel 3.94% 195.844us 3.94% 195.844us 65.281us 0.000us 0.00% 0.000us 0.000us 3
4024
+ aten::reshape 0.17% 8.560us 0.45% 22.331us 3.722us 0.000us 0.00% 0.000us 0.000us 6
4025
+ aten::view 0.28% 13.771us 0.28% 13.771us 2.295us 0.000us 0.00% 0.000us 0.000us 6
4026
+ cudaDeviceSynchronize 50.00% 2.486ms 50.00% 2.486ms 2.486ms 0.000us 0.00% 0.000us 0.000us 1
4027
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4028
+ Self CPU time total: 4.973ms
4029
+ Self CUDA time total: 2.906ms
4030
 
4031
 
4032
 
 
4036
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4037
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4038
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4039
+ xformers_meff 5.54% 306.968us 45.05% 2.496ms 2.496ms 0.000us 0.00% 4.618ms 4.618ms 1
4040
+ xformers_flash3::flash_fwd 2.62% 145.024us 39.11% 2.167ms 722.434us 0.000us 0.00% 4.618ms 1.539ms 3
4041
+ flash_attn_3::fwd 0.92% 51.181us 36.50% 2.022ms 674.093us 3.463ms 100.00% 4.618ms 1.539ms 3
4042
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.465ms 100.05% 3.465ms 3.465ms 1
4043
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.463ms 100.00% 3.463ms 1.154ms 3
4044
+ Activity Buffer Request 31.42% 1.741ms 31.42% 1.741ms 1.741ms 1.155ms 33.34% 1.155ms 1.155ms 1
4045
+ aten::empty 0.54% 29.990us 0.54% 29.990us 4.998us 0.000us 0.00% 0.000us 0.000us 6
4046
+ cudaFuncSetAttribute 0.10% 5.350us 0.10% 5.350us 1.783us 0.000us 0.00% 0.000us 0.000us 3
4047
+ cudaLaunchKernel 3.51% 194.715us 3.51% 194.715us 64.905us 0.000us 0.00% 0.000us 0.000us 3
4048
+ aten::reshape 0.15% 8.420us 0.40% 22.040us 3.673us 0.000us 0.00% 0.000us 0.000us 6
4049
+ aten::view 0.25% 13.620us 0.25% 13.620us 2.270us 0.000us 0.00% 0.000us 0.000us 6
4050
+ cudaDeviceSynchronize 54.95% 3.045ms 54.95% 3.045ms 3.045ms 0.000us 0.00% 0.000us 0.000us 1
4051
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4052
+ Self CPU time total: 5.541ms
4053
+ Self CUDA time total: 3.463ms
4054
 
4055
 
4056
 
 
4060
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4061
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4062
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4063
+ xformers_meff 5.16% 304.966us 48.93% 2.893ms 2.893ms 0.000us 0.00% 4.598ms 4.598ms 1
4064
+ xformers_flash3::flash_fwd 9.37% 553.844us 43.37% 2.564ms 854.584us 0.000us 0.00% 4.598ms 1.533ms 3
4065
+ flash_attn_3::fwd 0.88% 52.300us 34.00% 2.010ms 669.970us 3.443ms 100.00% 4.598ms 1.533ms 3
4066
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.445ms 100.05% 3.445ms 3.445ms 1
4067
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.443ms 100.00% 3.443ms 1.148ms 3
4068
+ Activity Buffer Request 28.71% 1.697ms 28.71% 1.697ms 1.697ms 1.155ms 33.53% 1.155ms 1.155ms 1
4069
+ aten::empty 0.52% 30.653us 0.52% 30.653us 5.109us 0.000us 0.00% 0.000us 0.000us 6
4070
+ cudaFuncSetAttribute 0.09% 5.400us 0.09% 5.400us 1.800us 0.000us 0.00% 0.000us 0.000us 3
4071
+ cudaLaunchKernel 3.80% 224.365us 3.80% 224.365us 74.788us 0.000us 0.00% 0.000us 0.000us 3
4072
+ aten::reshape 0.15% 8.918us 0.40% 23.921us 3.987us 0.000us 0.00% 0.000us 0.000us 6
4073
+ aten::view 0.25% 15.003us 0.25% 15.003us 2.501us 0.000us 0.00% 0.000us 0.000us 6
4074
+ cudaDeviceSynchronize 51.07% 3.019ms 51.07% 3.019ms 3.019ms 0.000us 0.00% 0.000us 0.000us 1
4075
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4076
+ Self CPU time total: 5.912ms
4077
+ Self CUDA time total: 3.443ms
4078
 
4079
 
4080
  impl wl p50(ms) ok
4081
  xformers_meff cuda_attn_L128_bfloat16 0.98 True
4082
+ xformers_meff cuda_attn_L256_bfloat16 1.04 True
4083
  xformers_meff cuda_attn_L320_bfloat16 1.06 True
4084
+ xformers_meff cuda_attn_L384_bfloat16 1.09 True
4085
+ xformers_meff cuda_attn_L448_bfloat16 1.26 True
4086
+ xformers_meff cuda_attn_L512_bfloat16 1.24 True
4087
  </pre></div>
4088
  <div class="uv-install-logs" id="uv-logs-benchmark">
4089
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4090
  <div class="uv-logs-content" style="display: none;">
4091
  Downloading xformers (111.8MiB)
4092
  Downloaded xformers
4093
+ Installed 1 package in 12ms
4094
  </div>
4095
  </div>
4096
  <div class="cell-artifacts">
flash_attn/results/artifacts/combine/latency.svg CHANGED

Git LFS Details

  • SHA256: 42ecd5306fef7c29b246aeecde0a12e51ef4139ff514ca508f9c74968d64ef13
  • Pointer size: 130 Bytes
  • Size of remote file: 24.8 kB

Git LFS Details

  • SHA256: 0f160f3f11d41b3a388cb9ab3a3ed23dc9ca473cb8531e7d3dc53c94cc97ebd0
  • Pointer size: 130 Bytes
  • Size of remote file: 24.8 kB
flash_attn/results/combined_results.html CHANGED
@@ -3889,7 +3889,7 @@ body[data-tool="eraser"] .main-content {
3889
  <rdf:RDF>
3890
  <ns2:Work>
3891
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
3892
- <dc:date>2025-12-19T19:09:55.297355</dc:date>
3893
  <dc:format>image/svg+xml</dc:format>
3894
  <dc:creator>
3895
  <ns2:Agent>
@@ -3999,96 +3999,96 @@ body[data-tool="eraser"] .main-content {
3999
  <g id="matplotlib.axis_2">
4000
  <g id="ytick_1">
4001
  <g id="grid-y--2" class="grid grid-y">
4002
- <path d="M 47.81 404.469232 L 835.361742 404.469232 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4003
  </g>
4004
  <g id="line2d_7">
4005
  <defs>
4006
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4007
  </defs>
4008
  <g>
4009
- <use ns4:href="#m0fca2865ba" x="47.81" y="404.469232" style="stroke: #000000; stroke-width: 0.8" />
4010
  </g>
4011
  </g>
4012
  <g id="text_7">
4013
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="408.26845" transform="rotate(-0 40.81 408.26845)">1.0</text>
4014
  </g>
4015
  </g>
4016
  <g id="ytick_2">
4017
  <g id="grid-y--3" class="grid grid-y">
4018
- <path d="M 47.81 347.147903 L 835.361742 347.147903 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4019
  </g>
4020
  <g id="line2d_8">
4021
  <g>
4022
- <use ns4:href="#m0fca2865ba" x="47.81" y="347.147903" style="stroke: #000000; stroke-width: 0.8" />
4023
  </g>
4024
  </g>
4025
  <g id="text_8">
4026
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="350.947122" transform="rotate(-0 40.81 350.947122)">1.2</text>
4027
  </g>
4028
  </g>
4029
  <g id="ytick_3">
4030
  <g id="grid-y--4" class="grid grid-y">
4031
- <path d="M 47.81 289.826575 L 835.361742 289.826575 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4032
  </g>
4033
  <g id="line2d_9">
4034
  <g>
4035
- <use ns4:href="#m0fca2865ba" x="47.81" y="289.826575" style="stroke: #000000; stroke-width: 0.8" />
4036
  </g>
4037
  </g>
4038
  <g id="text_9">
4039
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="293.625794" transform="rotate(-0 40.81 293.625794)">1.4</text>
4040
  </g>
4041
  </g>
4042
  <g id="ytick_4">
4043
  <g id="grid-y--5" class="grid grid-y">
4044
- <path d="M 47.81 232.505247 L 835.361742 232.505247 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4045
  </g>
4046
  <g id="line2d_10">
4047
  <g>
4048
- <use ns4:href="#m0fca2865ba" x="47.81" y="232.505247" style="stroke: #000000; stroke-width: 0.8" />
4049
  </g>
4050
  </g>
4051
  <g id="text_10">
4052
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="236.304466" transform="rotate(-0 40.81 236.304466)">1.6</text>
4053
  </g>
4054
  </g>
4055
  <g id="ytick_5">
4056
  <g id="grid-y--6" class="grid grid-y">
4057
- <path d="M 47.81 175.183919 L 835.361742 175.183919 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4058
  </g>
4059
  <g id="line2d_11">
4060
  <g>
4061
- <use ns4:href="#m0fca2865ba" x="47.81" y="175.183919" style="stroke: #000000; stroke-width: 0.8" />
4062
  </g>
4063
  </g>
4064
  <g id="text_11">
4065
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="178.983137" transform="rotate(-0 40.81 178.983137)">1.8</text>
4066
  </g>
4067
  </g>
4068
  <g id="ytick_6">
4069
  <g id="grid-y--7" class="grid grid-y">
4070
- <path d="M 47.81 117.86259 L 835.361742 117.86259 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4071
  </g>
4072
  <g id="line2d_12">
4073
  <g>
4074
- <use ns4:href="#m0fca2865ba" x="47.81" y="117.86259" style="stroke: #000000; stroke-width: 0.8" />
4075
  </g>
4076
  </g>
4077
  <g id="text_12">
4078
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="121.661809" transform="rotate(-0 40.81 121.661809)">2.0</text>
4079
  </g>
4080
  </g>
4081
  <g id="ytick_7">
4082
  <g id="grid-y--8" class="grid grid-y">
4083
- <path d="M 47.81 60.541262 L 835.361742 60.541262 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4084
  </g>
4085
  <g id="line2d_13">
4086
  <g>
4087
- <use ns4:href="#m0fca2865ba" x="47.81" y="60.541262" style="stroke: #000000; stroke-width: 0.8" />
4088
  </g>
4089
  </g>
4090
  <g id="text_13">
4091
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="64.340481" transform="rotate(-0 40.81 64.340481)">2.2</text>
4092
  </g>
4093
  </g>
4094
  <g id="label--y" class="ylabel">
@@ -4096,73 +4096,73 @@ body[data-tool="eraser"] .main-content {
4096
  </g>
4097
  </g>
4098
  <g id="series--torch-flash-ma" class="series">
4099
- <path d="M 83.607806 346.603064 L 226.799032 331.148661 L 369.990258 322.7508 L 513.181484 313.642154 L 656.37271 270.506995 L 799.563935 259.742049 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4100
  <defs>
4101
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4102
  </defs>
4103
  <g clip-path="url(#p09feef2583)">
4104
- <use ns4:href="#md7efaf3aec" x="83.607806" y="346.603064" style="fill: #1f77b4; stroke: #1f77b4" />
4105
- <use ns4:href="#md7efaf3aec" x="226.799032" y="331.148661" style="fill: #1f77b4; stroke: #1f77b4" />
4106
- <use ns4:href="#md7efaf3aec" x="369.990258" y="322.7508" style="fill: #1f77b4; stroke: #1f77b4" />
4107
- <use ns4:href="#md7efaf3aec" x="513.181484" y="313.642154" style="fill: #1f77b4; stroke: #1f77b4" />
4108
- <use ns4:href="#md7efaf3aec" x="656.37271" y="270.506995" style="fill: #1f77b4; stroke: #1f77b4" />
4109
- <use ns4:href="#md7efaf3aec" x="799.563935" y="259.742049" style="fill: #1f77b4; stroke: #1f77b4" />
4110
  </g>
4111
  </g>
4112
  <g id="series--torch-mem-eff" class="series">
4113
- <path d="M 83.607806 162.593002 L 226.799032 131.641491 L 369.990258 126.594348 L 513.181484 96.170767 L 656.37271 105.428161 L 799.563935 45.999414 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4114
  <defs>
4115
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4116
  </defs>
4117
  <g clip-path="url(#p09feef2583)">
4118
- <use ns4:href="#m9b8c54d372" x="83.607806" y="162.593002" style="fill: #ff7f0e; stroke: #ff7f0e" />
4119
- <use ns4:href="#m9b8c54d372" x="226.799032" y="131.641491" style="fill: #ff7f0e; stroke: #ff7f0e" />
4120
- <use ns4:href="#m9b8c54d372" x="369.990258" y="126.594348" style="fill: #ff7f0e; stroke: #ff7f0e" />
4121
- <use ns4:href="#m9b8c54d372" x="513.181484" y="96.170767" style="fill: #ff7f0e; stroke: #ff7f0e" />
4122
- <use ns4:href="#m9b8c54d372" x="656.37271" y="105.428161" style="fill: #ff7f0e; stroke: #ff7f0e" />
4123
  <use ns4:href="#m9b8c54d372" x="799.563935" y="45.999414" style="fill: #ff7f0e; stroke: #ff7f0e" />
4124
  </g>
4125
  </g>
4126
  <g id="series--xformers-meff" class="series">
4127
- <path d="M 83.607806 410.706939 L 226.799032 396.737158 L 369.990258 386.568354 L 513.181484 386.536541 L 656.37271 333.774551 L 799.563935 337.388661 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #2ca02c; stroke-width: 1.5; stroke-linecap: square" />
4128
  <defs>
4129
  <path id="mc655281e0b" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #2ca02c" />
4130
  </defs>
4131
  <g clip-path="url(#p09feef2583)">
4132
- <use ns4:href="#mc655281e0b" x="83.607806" y="410.706939" style="fill: #2ca02c; stroke: #2ca02c" />
4133
- <use ns4:href="#mc655281e0b" x="226.799032" y="396.737158" style="fill: #2ca02c; stroke: #2ca02c" />
4134
- <use ns4:href="#mc655281e0b" x="369.990258" y="386.568354" style="fill: #2ca02c; stroke: #2ca02c" />
4135
- <use ns4:href="#mc655281e0b" x="513.181484" y="386.536541" style="fill: #2ca02c; stroke: #2ca02c" />
4136
- <use ns4:href="#mc655281e0b" x="656.37271" y="333.774551" style="fill: #2ca02c; stroke: #2ca02c" />
4137
- <use ns4:href="#mc655281e0b" x="799.563935" y="337.388661" style="fill: #2ca02c; stroke: #2ca02c" />
4138
  </g>
4139
  </g>
4140
  <g id="series--hf-kernels-flash-attn" class="series">
4141
- <path d="M 83.607806 416.940633 L 226.799032 399.984697 L 369.990258 390.841946 L 513.181484 387.029791 L 656.37271 344.433452 L 799.563935 341.857145 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #d62728; stroke-width: 1.5; stroke-linecap: square" />
4142
  <defs>
4143
  <path id="m61c8040d7e" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #d62728" />
4144
  </defs>
4145
  <g clip-path="url(#p09feef2583)">
4146
- <use ns4:href="#m61c8040d7e" x="83.607806" y="416.940633" style="fill: #d62728; stroke: #d62728" />
4147
- <use ns4:href="#m61c8040d7e" x="226.799032" y="399.984697" style="fill: #d62728; stroke: #d62728" />
4148
- <use ns4:href="#m61c8040d7e" x="369.990258" y="390.841946" style="fill: #d62728; stroke: #d62728" />
4149
- <use ns4:href="#m61c8040d7e" x="513.181484" y="387.029791" style="fill: #d62728; stroke: #d62728" />
4150
- <use ns4:href="#m61c8040d7e" x="656.37271" y="344.433452" style="fill: #d62728; stroke: #d62728" />
4151
- <use ns4:href="#m61c8040d7e" x="799.563935" y="341.857145" style="fill: #d62728; stroke: #d62728" />
4152
  </g>
4153
  </g>
4154
  <g id="series--hf-kernels-flash-attn3" class="series">
4155
- <path d="M 83.607806 428.387702 L 226.799032 412.171498 L 369.990258 404.997448 L 513.181484 400.314295 L 656.37271 358.798463 L 799.563935 355.895138 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #9467bd; stroke-width: 1.5; stroke-linecap: square" />
4156
  <defs>
4157
  <path id="m7cd35be9cc" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #9467bd" />
4158
  </defs>
4159
  <g clip-path="url(#p09feef2583)">
4160
  <use ns4:href="#m7cd35be9cc" x="83.607806" y="428.387702" style="fill: #9467bd; stroke: #9467bd" />
4161
- <use ns4:href="#m7cd35be9cc" x="226.799032" y="412.171498" style="fill: #9467bd; stroke: #9467bd" />
4162
- <use ns4:href="#m7cd35be9cc" x="369.990258" y="404.997448" style="fill: #9467bd; stroke: #9467bd" />
4163
- <use ns4:href="#m7cd35be9cc" x="513.181484" y="400.314295" style="fill: #9467bd; stroke: #9467bd" />
4164
- <use ns4:href="#m7cd35be9cc" x="656.37271" y="358.798463" style="fill: #9467bd; stroke: #9467bd" />
4165
- <use ns4:href="#m7cd35be9cc" x="799.563935" y="355.895138" style="fill: #9467bd; stroke: #9467bd" />
4166
  </g>
4167
  </g>
4168
  <g id="patch_3">
@@ -4247,7 +4247,7 @@ body[data-tool="eraser"] .main-content {
4247
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
4248
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
4249
  </span> |
4250
- Cell: combine | 4.87s
4251
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
4252
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
4253
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
@@ -4356,48 +4356,48 @@ Summary: 6 found, 0 skipped, 0 missing
4356
  COMBINED BENCHMARK SUMMARY
4357
 
4358
  impl wl p50(ms) ok
4359
- hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.96 True
4360
- hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.02 True
4361
- hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.05 True
4362
  hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.06 True
4363
- hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.21 True
4364
- hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.22 True
4365
- hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.92 True
4366
- hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.97 True
4367
- hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.00 True
4368
  hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.01 True
4369
- hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.16 True
4370
  hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.17 True
4371
  sage_int8_fp16 cuda_attn_L128_bfloat16 FAIL False
4372
- Error: module &#x27;sage_attention_e8dcde4226fe38e6&#x27; has no attribute &#x27;fwd&#x27;
4373
  sage_int8_fp16 cuda_attn_L256_bfloat16 FAIL False
4374
- Error: module &#x27;sage_attention_e8dcde4226fe38e6&#x27; has no attribute &#x27;fwd&#x27;
4375
  sage_int8_fp16 cuda_attn_L320_bfloat16 FAIL False
4376
- Error: module &#x27;sage_attention_e8dcde4226fe38e6&#x27; has no attribute &#x27;fwd&#x27;
4377
  sage_int8_fp16 cuda_attn_L384_bfloat16 FAIL False
4378
- Error: module &#x27;sage_attention_e8dcde4226fe38e6&#x27; has no attribute &#x27;fwd&#x27;
4379
  sage_int8_fp16 cuda_attn_L448_bfloat16 FAIL False
4380
- Error: module &#x27;sage_attention_e8dcde4226fe38e6&#x27; has no attribute &#x27;fwd&#x27;
4381
  sage_int8_fp16 cuda_attn_L512_bfloat16 FAIL False
4382
- Error: module &#x27;sage_attention_e8dcde4226fe38e6&#x27; has no attribute &#x27;fwd&#x27;
4383
- torch_flash_ma cuda_attn_L128_bfloat16 1.20 True
4384
- torch_flash_ma cuda_attn_L256_bfloat16 1.26 True
4385
- torch_flash_ma cuda_attn_L320_bfloat16 1.29 True
4386
- torch_flash_ma cuda_attn_L384_bfloat16 1.32 True
4387
- torch_flash_ma cuda_attn_L448_bfloat16 1.47 True
4388
- torch_flash_ma cuda_attn_L512_bfloat16 1.50 True
4389
- torch_mem_eff cuda_attn_L128_bfloat16 1.84 True
4390
- torch_mem_eff cuda_attn_L256_bfloat16 1.95 True
4391
- torch_mem_eff cuda_attn_L320_bfloat16 1.97 True
4392
- torch_mem_eff cuda_attn_L384_bfloat16 2.08 True
4393
- torch_mem_eff cuda_attn_L448_bfloat16 2.04 True
4394
- torch_mem_eff cuda_attn_L512_bfloat16 2.25 True
4395
  xformers_meff cuda_attn_L128_bfloat16 0.98 True
4396
- xformers_meff cuda_attn_L256_bfloat16 1.03 True
4397
  xformers_meff cuda_attn_L320_bfloat16 1.06 True
4398
- xformers_meff cuda_attn_L384_bfloat16 1.06 True
4399
- xformers_meff cuda_attn_L448_bfloat16 1.25 True
4400
- xformers_meff cuda_attn_L512_bfloat16 1.23 True
4401
 
4402
  GENERATING COMBINED VISUALIZATION
4403
 
@@ -4421,7 +4421,7 @@ Implementations included:
4421
  <div class="uv-install-logs" id="uv-logs-combine">
4422
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4423
  <div class="uv-logs-content" style="display: none;">
4424
- Installed 37 packages in 315ms
4425
  </div>
4426
  </div>
4427
  <div class="cell-artifacts">
@@ -4434,7 +4434,7 @@ Installed 37 packages in 315ms
4434
  <rdf:RDF>
4435
  <ns2:Work>
4436
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4437
- <dc:date>2025-12-19T19:09:55.297355</dc:date>
4438
  <dc:format>image/svg+xml</dc:format>
4439
  <dc:creator>
4440
  <ns2:Agent>
@@ -4544,96 +4544,96 @@ Installed 37 packages in 315ms
4544
  <g id="matplotlib.axis_2">
4545
  <g id="ytick_1">
4546
  <g id="grid-y--2" class="grid grid-y">
4547
- <path d="M 47.81 404.469232 L 835.361742 404.469232 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4548
  </g>
4549
  <g id="line2d_7">
4550
  <defs>
4551
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4552
  </defs>
4553
  <g>
4554
- <use ns4:href="#m0fca2865ba" x="47.81" y="404.469232" style="stroke: #000000; stroke-width: 0.8" />
4555
  </g>
4556
  </g>
4557
  <g id="text_7">
4558
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="408.26845" transform="rotate(-0 40.81 408.26845)">1.0</text>
4559
  </g>
4560
  </g>
4561
  <g id="ytick_2">
4562
  <g id="grid-y--3" class="grid grid-y">
4563
- <path d="M 47.81 347.147903 L 835.361742 347.147903 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4564
  </g>
4565
  <g id="line2d_8">
4566
  <g>
4567
- <use ns4:href="#m0fca2865ba" x="47.81" y="347.147903" style="stroke: #000000; stroke-width: 0.8" />
4568
  </g>
4569
  </g>
4570
  <g id="text_8">
4571
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="350.947122" transform="rotate(-0 40.81 350.947122)">1.2</text>
4572
  </g>
4573
  </g>
4574
  <g id="ytick_3">
4575
  <g id="grid-y--4" class="grid grid-y">
4576
- <path d="M 47.81 289.826575 L 835.361742 289.826575 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4577
  </g>
4578
  <g id="line2d_9">
4579
  <g>
4580
- <use ns4:href="#m0fca2865ba" x="47.81" y="289.826575" style="stroke: #000000; stroke-width: 0.8" />
4581
  </g>
4582
  </g>
4583
  <g id="text_9">
4584
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="293.625794" transform="rotate(-0 40.81 293.625794)">1.4</text>
4585
  </g>
4586
  </g>
4587
  <g id="ytick_4">
4588
  <g id="grid-y--5" class="grid grid-y">
4589
- <path d="M 47.81 232.505247 L 835.361742 232.505247 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4590
  </g>
4591
  <g id="line2d_10">
4592
  <g>
4593
- <use ns4:href="#m0fca2865ba" x="47.81" y="232.505247" style="stroke: #000000; stroke-width: 0.8" />
4594
  </g>
4595
  </g>
4596
  <g id="text_10">
4597
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="236.304466" transform="rotate(-0 40.81 236.304466)">1.6</text>
4598
  </g>
4599
  </g>
4600
  <g id="ytick_5">
4601
  <g id="grid-y--6" class="grid grid-y">
4602
- <path d="M 47.81 175.183919 L 835.361742 175.183919 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4603
  </g>
4604
  <g id="line2d_11">
4605
  <g>
4606
- <use ns4:href="#m0fca2865ba" x="47.81" y="175.183919" style="stroke: #000000; stroke-width: 0.8" />
4607
  </g>
4608
  </g>
4609
  <g id="text_11">
4610
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="178.983137" transform="rotate(-0 40.81 178.983137)">1.8</text>
4611
  </g>
4612
  </g>
4613
  <g id="ytick_6">
4614
  <g id="grid-y--7" class="grid grid-y">
4615
- <path d="M 47.81 117.86259 L 835.361742 117.86259 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4616
  </g>
4617
  <g id="line2d_12">
4618
  <g>
4619
- <use ns4:href="#m0fca2865ba" x="47.81" y="117.86259" style="stroke: #000000; stroke-width: 0.8" />
4620
  </g>
4621
  </g>
4622
  <g id="text_12">
4623
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="121.661809" transform="rotate(-0 40.81 121.661809)">2.0</text>
4624
  </g>
4625
  </g>
4626
  <g id="ytick_7">
4627
  <g id="grid-y--8" class="grid grid-y">
4628
- <path d="M 47.81 60.541262 L 835.361742 60.541262 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4629
  </g>
4630
  <g id="line2d_13">
4631
  <g>
4632
- <use ns4:href="#m0fca2865ba" x="47.81" y="60.541262" style="stroke: #000000; stroke-width: 0.8" />
4633
  </g>
4634
  </g>
4635
  <g id="text_13">
4636
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="64.340481" transform="rotate(-0 40.81 64.340481)">2.2</text>
4637
  </g>
4638
  </g>
4639
  <g id="label--y" class="ylabel">
@@ -4641,73 +4641,73 @@ Installed 37 packages in 315ms
4641
  </g>
4642
  </g>
4643
  <g id="series--torch-flash-ma" class="series">
4644
- <path d="M 83.607806 346.603064 L 226.799032 331.148661 L 369.990258 322.7508 L 513.181484 313.642154 L 656.37271 270.506995 L 799.563935 259.742049 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4645
  <defs>
4646
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4647
  </defs>
4648
  <g clip-path="url(#p09feef2583)">
4649
- <use ns4:href="#md7efaf3aec" x="83.607806" y="346.603064" style="fill: #1f77b4; stroke: #1f77b4" />
4650
- <use ns4:href="#md7efaf3aec" x="226.799032" y="331.148661" style="fill: #1f77b4; stroke: #1f77b4" />
4651
- <use ns4:href="#md7efaf3aec" x="369.990258" y="322.7508" style="fill: #1f77b4; stroke: #1f77b4" />
4652
- <use ns4:href="#md7efaf3aec" x="513.181484" y="313.642154" style="fill: #1f77b4; stroke: #1f77b4" />
4653
- <use ns4:href="#md7efaf3aec" x="656.37271" y="270.506995" style="fill: #1f77b4; stroke: #1f77b4" />
4654
- <use ns4:href="#md7efaf3aec" x="799.563935" y="259.742049" style="fill: #1f77b4; stroke: #1f77b4" />
4655
  </g>
4656
  </g>
4657
  <g id="series--torch-mem-eff" class="series">
4658
- <path d="M 83.607806 162.593002 L 226.799032 131.641491 L 369.990258 126.594348 L 513.181484 96.170767 L 656.37271 105.428161 L 799.563935 45.999414 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4659
  <defs>
4660
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4661
  </defs>
4662
  <g clip-path="url(#p09feef2583)">
4663
- <use ns4:href="#m9b8c54d372" x="83.607806" y="162.593002" style="fill: #ff7f0e; stroke: #ff7f0e" />
4664
- <use ns4:href="#m9b8c54d372" x="226.799032" y="131.641491" style="fill: #ff7f0e; stroke: #ff7f0e" />
4665
- <use ns4:href="#m9b8c54d372" x="369.990258" y="126.594348" style="fill: #ff7f0e; stroke: #ff7f0e" />
4666
- <use ns4:href="#m9b8c54d372" x="513.181484" y="96.170767" style="fill: #ff7f0e; stroke: #ff7f0e" />
4667
- <use ns4:href="#m9b8c54d372" x="656.37271" y="105.428161" style="fill: #ff7f0e; stroke: #ff7f0e" />
4668
  <use ns4:href="#m9b8c54d372" x="799.563935" y="45.999414" style="fill: #ff7f0e; stroke: #ff7f0e" />
4669
  </g>
4670
  </g>
4671
  <g id="series--xformers-meff" class="series">
4672
- <path d="M 83.607806 410.706939 L 226.799032 396.737158 L 369.990258 386.568354 L 513.181484 386.536541 L 656.37271 333.774551 L 799.563935 337.388661 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #2ca02c; stroke-width: 1.5; stroke-linecap: square" />
4673
  <defs>
4674
  <path id="mc655281e0b" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #2ca02c" />
4675
  </defs>
4676
  <g clip-path="url(#p09feef2583)">
4677
- <use ns4:href="#mc655281e0b" x="83.607806" y="410.706939" style="fill: #2ca02c; stroke: #2ca02c" />
4678
- <use ns4:href="#mc655281e0b" x="226.799032" y="396.737158" style="fill: #2ca02c; stroke: #2ca02c" />
4679
- <use ns4:href="#mc655281e0b" x="369.990258" y="386.568354" style="fill: #2ca02c; stroke: #2ca02c" />
4680
- <use ns4:href="#mc655281e0b" x="513.181484" y="386.536541" style="fill: #2ca02c; stroke: #2ca02c" />
4681
- <use ns4:href="#mc655281e0b" x="656.37271" y="333.774551" style="fill: #2ca02c; stroke: #2ca02c" />
4682
- <use ns4:href="#mc655281e0b" x="799.563935" y="337.388661" style="fill: #2ca02c; stroke: #2ca02c" />
4683
  </g>
4684
  </g>
4685
  <g id="series--hf-kernels-flash-attn" class="series">
4686
- <path d="M 83.607806 416.940633 L 226.799032 399.984697 L 369.990258 390.841946 L 513.181484 387.029791 L 656.37271 344.433452 L 799.563935 341.857145 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #d62728; stroke-width: 1.5; stroke-linecap: square" />
4687
  <defs>
4688
  <path id="m61c8040d7e" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #d62728" />
4689
  </defs>
4690
  <g clip-path="url(#p09feef2583)">
4691
- <use ns4:href="#m61c8040d7e" x="83.607806" y="416.940633" style="fill: #d62728; stroke: #d62728" />
4692
- <use ns4:href="#m61c8040d7e" x="226.799032" y="399.984697" style="fill: #d62728; stroke: #d62728" />
4693
- <use ns4:href="#m61c8040d7e" x="369.990258" y="390.841946" style="fill: #d62728; stroke: #d62728" />
4694
- <use ns4:href="#m61c8040d7e" x="513.181484" y="387.029791" style="fill: #d62728; stroke: #d62728" />
4695
- <use ns4:href="#m61c8040d7e" x="656.37271" y="344.433452" style="fill: #d62728; stroke: #d62728" />
4696
- <use ns4:href="#m61c8040d7e" x="799.563935" y="341.857145" style="fill: #d62728; stroke: #d62728" />
4697
  </g>
4698
  </g>
4699
  <g id="series--hf-kernels-flash-attn3" class="series">
4700
- <path d="M 83.607806 428.387702 L 226.799032 412.171498 L 369.990258 404.997448 L 513.181484 400.314295 L 656.37271 358.798463 L 799.563935 355.895138 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #9467bd; stroke-width: 1.5; stroke-linecap: square" />
4701
  <defs>
4702
  <path id="m7cd35be9cc" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #9467bd" />
4703
  </defs>
4704
  <g clip-path="url(#p09feef2583)">
4705
  <use ns4:href="#m7cd35be9cc" x="83.607806" y="428.387702" style="fill: #9467bd; stroke: #9467bd" />
4706
- <use ns4:href="#m7cd35be9cc" x="226.799032" y="412.171498" style="fill: #9467bd; stroke: #9467bd" />
4707
- <use ns4:href="#m7cd35be9cc" x="369.990258" y="404.997448" style="fill: #9467bd; stroke: #9467bd" />
4708
- <use ns4:href="#m7cd35be9cc" x="513.181484" y="400.314295" style="fill: #9467bd; stroke: #9467bd" />
4709
- <use ns4:href="#m7cd35be9cc" x="656.37271" y="358.798463" style="fill: #9467bd; stroke: #9467bd" />
4710
- <use ns4:href="#m7cd35be9cc" x="799.563935" y="355.895138" style="fill: #9467bd; stroke: #9467bd" />
4711
  </g>
4712
  </g>
4713
  <g id="patch_3">
 
3889
  <rdf:RDF>
3890
  <ns2:Work>
3891
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
3892
+ <dc:date>2025-12-19T19:55:48.469348</dc:date>
3893
  <dc:format>image/svg+xml</dc:format>
3894
  <dc:creator>
3895
  <ns2:Agent>
 
3999
  <g id="matplotlib.axis_2">
4000
  <g id="ytick_1">
4001
  <g id="grid-y--2" class="grid grid-y">
4002
+ <path d="M 47.81 402.388331 L 835.361742 402.388331 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4003
  </g>
4004
  <g id="line2d_7">
4005
  <defs>
4006
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4007
  </defs>
4008
  <g>
4009
+ <use ns4:href="#m0fca2865ba" x="47.81" y="402.388331" style="stroke: #000000; stroke-width: 0.8" />
4010
  </g>
4011
  </g>
4012
  <g id="text_7">
4013
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="406.18755" transform="rotate(-0 40.81 406.18755)">1.0</text>
4014
  </g>
4015
  </g>
4016
  <g id="ytick_2">
4017
  <g id="grid-y--3" class="grid grid-y">
4018
+ <path d="M 47.81 341.392024 L 835.361742 341.392024 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4019
  </g>
4020
  <g id="line2d_8">
4021
  <g>
4022
+ <use ns4:href="#m0fca2865ba" x="47.81" y="341.392024" style="stroke: #000000; stroke-width: 0.8" />
4023
  </g>
4024
  </g>
4025
  <g id="text_8">
4026
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="345.191243" transform="rotate(-0 40.81 345.191243)">1.2</text>
4027
  </g>
4028
  </g>
4029
  <g id="ytick_3">
4030
  <g id="grid-y--4" class="grid grid-y">
4031
+ <path d="M 47.81 280.395718 L 835.361742 280.395718 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4032
  </g>
4033
  <g id="line2d_9">
4034
  <g>
4035
+ <use ns4:href="#m0fca2865ba" x="47.81" y="280.395718" style="stroke: #000000; stroke-width: 0.8" />
4036
  </g>
4037
  </g>
4038
  <g id="text_9">
4039
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="284.194936" transform="rotate(-0 40.81 284.194936)">1.4</text>
4040
  </g>
4041
  </g>
4042
  <g id="ytick_4">
4043
  <g id="grid-y--5" class="grid grid-y">
4044
+ <path d="M 47.81 219.399411 L 835.361742 219.399411 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4045
  </g>
4046
  <g id="line2d_10">
4047
  <g>
4048
+ <use ns4:href="#m0fca2865ba" x="47.81" y="219.399411" style="stroke: #000000; stroke-width: 0.8" />
4049
  </g>
4050
  </g>
4051
  <g id="text_10">
4052
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="223.198629" transform="rotate(-0 40.81 223.198629)">1.6</text>
4053
  </g>
4054
  </g>
4055
  <g id="ytick_5">
4056
  <g id="grid-y--6" class="grid grid-y">
4057
+ <path d="M 47.81 158.403104 L 835.361742 158.403104 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4058
  </g>
4059
  <g id="line2d_11">
4060
  <g>
4061
+ <use ns4:href="#m0fca2865ba" x="47.81" y="158.403104" style="stroke: #000000; stroke-width: 0.8" />
4062
  </g>
4063
  </g>
4064
  <g id="text_11">
4065
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="162.202323" transform="rotate(-0 40.81 162.202323)">1.8</text>
4066
  </g>
4067
  </g>
4068
  <g id="ytick_6">
4069
  <g id="grid-y--7" class="grid grid-y">
4070
+ <path d="M 47.81 97.406797 L 835.361742 97.406797 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4071
  </g>
4072
  <g id="line2d_12">
4073
  <g>
4074
+ <use ns4:href="#m0fca2865ba" x="47.81" y="97.406797" style="stroke: #000000; stroke-width: 0.8" />
4075
  </g>
4076
  </g>
4077
  <g id="text_12">
4078
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="101.206016" transform="rotate(-0 40.81 101.206016)">2.0</text>
4079
  </g>
4080
  </g>
4081
  <g id="ytick_7">
4082
  <g id="grid-y--8" class="grid grid-y">
4083
+ <path d="M 47.81 36.41049 L 835.361742 36.41049 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4084
  </g>
4085
  <g id="line2d_13">
4086
  <g>
4087
+ <use ns4:href="#m0fca2865ba" x="47.81" y="36.41049" style="stroke: #000000; stroke-width: 0.8" />
4088
  </g>
4089
  </g>
4090
  <g id="text_13">
4091
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="40.209709" transform="rotate(-0 40.81 40.209709)">2.2</text>
4092
  </g>
4093
  </g>
4094
  <g id="label--y" class="ylabel">
 
4096
  </g>
4097
  </g>
4098
  <g id="series--torch-flash-ma" class="series">
4099
+ <path d="M 83.607806 337.885652 L 226.799032 325.671141 L 369.990258 317.408887 L 513.181484 307.237447 L 656.37271 265.438813 L 799.563935 253.93491 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4100
  <defs>
4101
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4102
  </defs>
4103
  <g clip-path="url(#p09feef2583)">
4104
+ <use ns4:href="#md7efaf3aec" x="83.607806" y="337.885652" style="fill: #1f77b4; stroke: #1f77b4" />
4105
+ <use ns4:href="#md7efaf3aec" x="226.799032" y="325.671141" style="fill: #1f77b4; stroke: #1f77b4" />
4106
+ <use ns4:href="#md7efaf3aec" x="369.990258" y="317.408887" style="fill: #1f77b4; stroke: #1f77b4" />
4107
+ <use ns4:href="#md7efaf3aec" x="513.181484" y="307.237447" style="fill: #1f77b4; stroke: #1f77b4" />
4108
+ <use ns4:href="#md7efaf3aec" x="656.37271" y="265.438813" style="fill: #1f77b4; stroke: #1f77b4" />
4109
+ <use ns4:href="#md7efaf3aec" x="799.563935" y="253.93491" style="fill: #1f77b4; stroke: #1f77b4" />
4110
  </g>
4111
  </g>
4112
  <g id="series--torch-mem-eff" class="series">
4113
+ <path d="M 83.607806 150.723364 L 226.799032 118.266314 L 369.990258 111.819309 L 513.181484 85.541185 L 656.37271 73.225726 L 799.563935 45.999414 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4114
  <defs>
4115
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4116
  </defs>
4117
  <g clip-path="url(#p09feef2583)">
4118
+ <use ns4:href="#m9b8c54d372" x="83.607806" y="150.723364" style="fill: #ff7f0e; stroke: #ff7f0e" />
4119
+ <use ns4:href="#m9b8c54d372" x="226.799032" y="118.266314" style="fill: #ff7f0e; stroke: #ff7f0e" />
4120
+ <use ns4:href="#m9b8c54d372" x="369.990258" y="111.819309" style="fill: #ff7f0e; stroke: #ff7f0e" />
4121
+ <use ns4:href="#m9b8c54d372" x="513.181484" y="85.541185" style="fill: #ff7f0e; stroke: #ff7f0e" />
4122
+ <use ns4:href="#m9b8c54d372" x="656.37271" y="73.225726" style="fill: #ff7f0e; stroke: #ff7f0e" />
4123
  <use ns4:href="#m9b8c54d372" x="799.563935" y="45.999414" style="fill: #ff7f0e; stroke: #ff7f0e" />
4124
  </g>
4125
  </g>
4126
  <g id="series--xformers-meff" class="series">
4127
+ <path d="M 83.607806 408.996061 L 226.799032 390.330886 L 369.990258 383.840574 L 513.181484 376.133386 L 656.37271 322.193132 L 799.563935 327.713603 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #2ca02c; stroke-width: 1.5; stroke-linecap: square" />
4128
  <defs>
4129
  <path id="mc655281e0b" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #2ca02c" />
4130
  </defs>
4131
  <g clip-path="url(#p09feef2583)">
4132
+ <use ns4:href="#mc655281e0b" x="83.607806" y="408.996061" style="fill: #2ca02c; stroke: #2ca02c" />
4133
+ <use ns4:href="#mc655281e0b" x="226.799032" y="390.330886" style="fill: #2ca02c; stroke: #2ca02c" />
4134
+ <use ns4:href="#mc655281e0b" x="369.990258" y="383.840574" style="fill: #2ca02c; stroke: #2ca02c" />
4135
+ <use ns4:href="#mc655281e0b" x="513.181484" y="376.133386" style="fill: #2ca02c; stroke: #2ca02c" />
4136
+ <use ns4:href="#mc655281e0b" x="656.37271" y="322.193132" style="fill: #2ca02c; stroke: #2ca02c" />
4137
+ <use ns4:href="#mc655281e0b" x="799.563935" y="327.713603" style="fill: #2ca02c; stroke: #2ca02c" />
4138
  </g>
4139
  </g>
4140
  <g id="series--hf-kernels-flash-attn" class="series">
4141
+ <path d="M 83.607806 417.466618 L 226.799032 402.775353 L 369.990258 391.490731 L 513.181484 383.472462 L 656.37271 336.492496 L 799.563935 330.032377 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #d62728; stroke-width: 1.5; stroke-linecap: square" />
4142
  <defs>
4143
  <path id="m61c8040d7e" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #d62728" />
4144
  </defs>
4145
  <g clip-path="url(#p09feef2583)">
4146
+ <use ns4:href="#m61c8040d7e" x="83.607806" y="417.466618" style="fill: #d62728; stroke: #d62728" />
4147
+ <use ns4:href="#m61c8040d7e" x="226.799032" y="402.775353" style="fill: #d62728; stroke: #d62728" />
4148
+ <use ns4:href="#m61c8040d7e" x="369.990258" y="391.490731" style="fill: #d62728; stroke: #d62728" />
4149
+ <use ns4:href="#m61c8040d7e" x="513.181484" y="383.472462" style="fill: #d62728; stroke: #d62728" />
4150
+ <use ns4:href="#m61c8040d7e" x="656.37271" y="336.492496" style="fill: #d62728; stroke: #d62728" />
4151
+ <use ns4:href="#m61c8040d7e" x="799.563935" y="330.032377" style="fill: #d62728; stroke: #d62728" />
4152
  </g>
4153
  </g>
4154
  <g id="series--hf-kernels-flash-attn3" class="series">
4155
+ <path d="M 83.607806 428.387702 L 226.799032 409.68288 L 369.990258 400.234552 L 513.181484 400.520929 L 656.37271 346.614528 L 799.563935 349.227915 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #9467bd; stroke-width: 1.5; stroke-linecap: square" />
4156
  <defs>
4157
  <path id="m7cd35be9cc" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #9467bd" />
4158
  </defs>
4159
  <g clip-path="url(#p09feef2583)">
4160
  <use ns4:href="#m7cd35be9cc" x="83.607806" y="428.387702" style="fill: #9467bd; stroke: #9467bd" />
4161
+ <use ns4:href="#m7cd35be9cc" x="226.799032" y="409.68288" style="fill: #9467bd; stroke: #9467bd" />
4162
+ <use ns4:href="#m7cd35be9cc" x="369.990258" y="400.234552" style="fill: #9467bd; stroke: #9467bd" />
4163
+ <use ns4:href="#m7cd35be9cc" x="513.181484" y="400.520929" style="fill: #9467bd; stroke: #9467bd" />
4164
+ <use ns4:href="#m7cd35be9cc" x="656.37271" y="346.614528" style="fill: #9467bd; stroke: #9467bd" />
4165
+ <use ns4:href="#m7cd35be9cc" x="799.563935" y="349.227915" style="fill: #9467bd; stroke: #9467bd" />
4166
  </g>
4167
  </g>
4168
  <g id="patch_3">
 
4247
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
4248
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
4249
  </span> |
4250
+ Cell: combine | 4.68s
4251
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
4252
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
4253
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
 
4356
  COMBINED BENCHMARK SUMMARY
4357
 
4358
  impl wl p50(ms) ok
4359
+ hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.95 True
4360
+ hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.00 True
4361
+ hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.04 True
4362
  hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.06 True
4363
+ hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.22 True
4364
+ hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.24 True
4365
+ hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.91 True
4366
+ hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.98 True
4367
+ hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.01 True
4368
  hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.01 True
4369
+ hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.18 True
4370
  hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.17 True
4371
  sage_int8_fp16 cuda_attn_L128_bfloat16 FAIL False
4372
+ Error: module &#x27;sage_attention_d202bc414c936d8&#x27; has no attribute &#x27;fwd&#x27;
4373
  sage_int8_fp16 cuda_attn_L256_bfloat16 FAIL False
4374
+ Error: module &#x27;sage_attention_d202bc414c936d8&#x27; has no attribute &#x27;fwd&#x27;
4375
  sage_int8_fp16 cuda_attn_L320_bfloat16 FAIL False
4376
+ Error: module &#x27;sage_attention_d202bc414c936d8&#x27; has no attribute &#x27;fwd&#x27;
4377
  sage_int8_fp16 cuda_attn_L384_bfloat16 FAIL False
4378
+ Error: module &#x27;sage_attention_d202bc414c936d8&#x27; has no attribute &#x27;fwd&#x27;
4379
  sage_int8_fp16 cuda_attn_L448_bfloat16 FAIL False
4380
+ Error: module &#x27;sage_attention_d202bc414c936d8&#x27; has no attribute &#x27;fwd&#x27;
4381
  sage_int8_fp16 cuda_attn_L512_bfloat16 FAIL False
4382
+ Error: module &#x27;sage_attention_d202bc414c936d8&#x27; has no attribute &#x27;fwd&#x27;
4383
+ torch_flash_ma cuda_attn_L128_bfloat16 1.21 True
4384
+ torch_flash_ma cuda_attn_L256_bfloat16 1.25 True
4385
+ torch_flash_ma cuda_attn_L320_bfloat16 1.28 True
4386
+ torch_flash_ma cuda_attn_L384_bfloat16 1.31 True
4387
+ torch_flash_ma cuda_attn_L448_bfloat16 1.45 True
4388
+ torch_flash_ma cuda_attn_L512_bfloat16 1.49 True
4389
+ torch_mem_eff cuda_attn_L128_bfloat16 1.83 True
4390
+ torch_mem_eff cuda_attn_L256_bfloat16 1.93 True
4391
+ torch_mem_eff cuda_attn_L320_bfloat16 1.95 True
4392
+ torch_mem_eff cuda_attn_L384_bfloat16 2.04 True
4393
+ torch_mem_eff cuda_attn_L448_bfloat16 2.08 True
4394
+ torch_mem_eff cuda_attn_L512_bfloat16 2.17 True
4395
  xformers_meff cuda_attn_L128_bfloat16 0.98 True
4396
+ xformers_meff cuda_attn_L256_bfloat16 1.04 True
4397
  xformers_meff cuda_attn_L320_bfloat16 1.06 True
4398
+ xformers_meff cuda_attn_L384_bfloat16 1.09 True
4399
+ xformers_meff cuda_attn_L448_bfloat16 1.26 True
4400
+ xformers_meff cuda_attn_L512_bfloat16 1.24 True
4401
 
4402
  GENERATING COMBINED VISUALIZATION
4403
 
 
4421
  <div class="uv-install-logs" id="uv-logs-combine">
4422
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4423
  <div class="uv-logs-content" style="display: none;">
4424
+ Installed 37 packages in 206ms
4425
  </div>
4426
  </div>
4427
  <div class="cell-artifacts">
 
4434
  <rdf:RDF>
4435
  <ns2:Work>
4436
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4437
+ <dc:date>2025-12-19T19:55:48.469348</dc:date>
4438
  <dc:format>image/svg+xml</dc:format>
4439
  <dc:creator>
4440
  <ns2:Agent>
 
4544
  <g id="matplotlib.axis_2">
4545
  <g id="ytick_1">
4546
  <g id="grid-y--2" class="grid grid-y">
4547
+ <path d="M 47.81 402.388331 L 835.361742 402.388331 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4548
  </g>
4549
  <g id="line2d_7">
4550
  <defs>
4551
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4552
  </defs>
4553
  <g>
4554
+ <use ns4:href="#m0fca2865ba" x="47.81" y="402.388331" style="stroke: #000000; stroke-width: 0.8" />
4555
  </g>
4556
  </g>
4557
  <g id="text_7">
4558
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="406.18755" transform="rotate(-0 40.81 406.18755)">1.0</text>
4559
  </g>
4560
  </g>
4561
  <g id="ytick_2">
4562
  <g id="grid-y--3" class="grid grid-y">
4563
+ <path d="M 47.81 341.392024 L 835.361742 341.392024 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4564
  </g>
4565
  <g id="line2d_8">
4566
  <g>
4567
+ <use ns4:href="#m0fca2865ba" x="47.81" y="341.392024" style="stroke: #000000; stroke-width: 0.8" />
4568
  </g>
4569
  </g>
4570
  <g id="text_8">
4571
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="345.191243" transform="rotate(-0 40.81 345.191243)">1.2</text>
4572
  </g>
4573
  </g>
4574
  <g id="ytick_3">
4575
  <g id="grid-y--4" class="grid grid-y">
4576
+ <path d="M 47.81 280.395718 L 835.361742 280.395718 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4577
  </g>
4578
  <g id="line2d_9">
4579
  <g>
4580
+ <use ns4:href="#m0fca2865ba" x="47.81" y="280.395718" style="stroke: #000000; stroke-width: 0.8" />
4581
  </g>
4582
  </g>
4583
  <g id="text_9">
4584
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="284.194936" transform="rotate(-0 40.81 284.194936)">1.4</text>
4585
  </g>
4586
  </g>
4587
  <g id="ytick_4">
4588
  <g id="grid-y--5" class="grid grid-y">
4589
+ <path d="M 47.81 219.399411 L 835.361742 219.399411 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4590
  </g>
4591
  <g id="line2d_10">
4592
  <g>
4593
+ <use ns4:href="#m0fca2865ba" x="47.81" y="219.399411" style="stroke: #000000; stroke-width: 0.8" />
4594
  </g>
4595
  </g>
4596
  <g id="text_10">
4597
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="223.198629" transform="rotate(-0 40.81 223.198629)">1.6</text>
4598
  </g>
4599
  </g>
4600
  <g id="ytick_5">
4601
  <g id="grid-y--6" class="grid grid-y">
4602
+ <path d="M 47.81 158.403104 L 835.361742 158.403104 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4603
  </g>
4604
  <g id="line2d_11">
4605
  <g>
4606
+ <use ns4:href="#m0fca2865ba" x="47.81" y="158.403104" style="stroke: #000000; stroke-width: 0.8" />
4607
  </g>
4608
  </g>
4609
  <g id="text_11">
4610
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="162.202323" transform="rotate(-0 40.81 162.202323)">1.8</text>
4611
  </g>
4612
  </g>
4613
  <g id="ytick_6">
4614
  <g id="grid-y--7" class="grid grid-y">
4615
+ <path d="M 47.81 97.406797 L 835.361742 97.406797 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4616
  </g>
4617
  <g id="line2d_12">
4618
  <g>
4619
+ <use ns4:href="#m0fca2865ba" x="47.81" y="97.406797" style="stroke: #000000; stroke-width: 0.8" />
4620
  </g>
4621
  </g>
4622
  <g id="text_12">
4623
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="101.206016" transform="rotate(-0 40.81 101.206016)">2.0</text>
4624
  </g>
4625
  </g>
4626
  <g id="ytick_7">
4627
  <g id="grid-y--8" class="grid grid-y">
4628
+ <path d="M 47.81 36.41049 L 835.361742 36.41049 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4629
  </g>
4630
  <g id="line2d_13">
4631
  <g>
4632
+ <use ns4:href="#m0fca2865ba" x="47.81" y="36.41049" style="stroke: #000000; stroke-width: 0.8" />
4633
  </g>
4634
  </g>
4635
  <g id="text_13">
4636
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="40.209709" transform="rotate(-0 40.81 40.209709)">2.2</text>
4637
  </g>
4638
  </g>
4639
  <g id="label--y" class="ylabel">
 
4641
  </g>
4642
  </g>
4643
  <g id="series--torch-flash-ma" class="series">
4644
+ <path d="M 83.607806 337.885652 L 226.799032 325.671141 L 369.990258 317.408887 L 513.181484 307.237447 L 656.37271 265.438813 L 799.563935 253.93491 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4645
  <defs>
4646
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4647
  </defs>
4648
  <g clip-path="url(#p09feef2583)">
4649
+ <use ns4:href="#md7efaf3aec" x="83.607806" y="337.885652" style="fill: #1f77b4; stroke: #1f77b4" />
4650
+ <use ns4:href="#md7efaf3aec" x="226.799032" y="325.671141" style="fill: #1f77b4; stroke: #1f77b4" />
4651
+ <use ns4:href="#md7efaf3aec" x="369.990258" y="317.408887" style="fill: #1f77b4; stroke: #1f77b4" />
4652
+ <use ns4:href="#md7efaf3aec" x="513.181484" y="307.237447" style="fill: #1f77b4; stroke: #1f77b4" />
4653
+ <use ns4:href="#md7efaf3aec" x="656.37271" y="265.438813" style="fill: #1f77b4; stroke: #1f77b4" />
4654
+ <use ns4:href="#md7efaf3aec" x="799.563935" y="253.93491" style="fill: #1f77b4; stroke: #1f77b4" />
4655
  </g>
4656
  </g>
4657
  <g id="series--torch-mem-eff" class="series">
4658
+ <path d="M 83.607806 150.723364 L 226.799032 118.266314 L 369.990258 111.819309 L 513.181484 85.541185 L 656.37271 73.225726 L 799.563935 45.999414 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4659
  <defs>
4660
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4661
  </defs>
4662
  <g clip-path="url(#p09feef2583)">
4663
+ <use ns4:href="#m9b8c54d372" x="83.607806" y="150.723364" style="fill: #ff7f0e; stroke: #ff7f0e" />
4664
+ <use ns4:href="#m9b8c54d372" x="226.799032" y="118.266314" style="fill: #ff7f0e; stroke: #ff7f0e" />
4665
+ <use ns4:href="#m9b8c54d372" x="369.990258" y="111.819309" style="fill: #ff7f0e; stroke: #ff7f0e" />
4666
+ <use ns4:href="#m9b8c54d372" x="513.181484" y="85.541185" style="fill: #ff7f0e; stroke: #ff7f0e" />
4667
+ <use ns4:href="#m9b8c54d372" x="656.37271" y="73.225726" style="fill: #ff7f0e; stroke: #ff7f0e" />
4668
  <use ns4:href="#m9b8c54d372" x="799.563935" y="45.999414" style="fill: #ff7f0e; stroke: #ff7f0e" />
4669
  </g>
4670
  </g>
4671
  <g id="series--xformers-meff" class="series">
4672
+ <path d="M 83.607806 408.996061 L 226.799032 390.330886 L 369.990258 383.840574 L 513.181484 376.133386 L 656.37271 322.193132 L 799.563935 327.713603 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #2ca02c; stroke-width: 1.5; stroke-linecap: square" />
4673
  <defs>
4674
  <path id="mc655281e0b" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #2ca02c" />
4675
  </defs>
4676
  <g clip-path="url(#p09feef2583)">
4677
+ <use ns4:href="#mc655281e0b" x="83.607806" y="408.996061" style="fill: #2ca02c; stroke: #2ca02c" />
4678
+ <use ns4:href="#mc655281e0b" x="226.799032" y="390.330886" style="fill: #2ca02c; stroke: #2ca02c" />
4679
+ <use ns4:href="#mc655281e0b" x="369.990258" y="383.840574" style="fill: #2ca02c; stroke: #2ca02c" />
4680
+ <use ns4:href="#mc655281e0b" x="513.181484" y="376.133386" style="fill: #2ca02c; stroke: #2ca02c" />
4681
+ <use ns4:href="#mc655281e0b" x="656.37271" y="322.193132" style="fill: #2ca02c; stroke: #2ca02c" />
4682
+ <use ns4:href="#mc655281e0b" x="799.563935" y="327.713603" style="fill: #2ca02c; stroke: #2ca02c" />
4683
  </g>
4684
  </g>
4685
  <g id="series--hf-kernels-flash-attn" class="series">
4686
+ <path d="M 83.607806 417.466618 L 226.799032 402.775353 L 369.990258 391.490731 L 513.181484 383.472462 L 656.37271 336.492496 L 799.563935 330.032377 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #d62728; stroke-width: 1.5; stroke-linecap: square" />
4687
  <defs>
4688
  <path id="m61c8040d7e" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #d62728" />
4689
  </defs>
4690
  <g clip-path="url(#p09feef2583)">
4691
+ <use ns4:href="#m61c8040d7e" x="83.607806" y="417.466618" style="fill: #d62728; stroke: #d62728" />
4692
+ <use ns4:href="#m61c8040d7e" x="226.799032" y="402.775353" style="fill: #d62728; stroke: #d62728" />
4693
+ <use ns4:href="#m61c8040d7e" x="369.990258" y="391.490731" style="fill: #d62728; stroke: #d62728" />
4694
+ <use ns4:href="#m61c8040d7e" x="513.181484" y="383.472462" style="fill: #d62728; stroke: #d62728" />
4695
+ <use ns4:href="#m61c8040d7e" x="656.37271" y="336.492496" style="fill: #d62728; stroke: #d62728" />
4696
+ <use ns4:href="#m61c8040d7e" x="799.563935" y="330.032377" style="fill: #d62728; stroke: #d62728" />
4697
  </g>
4698
  </g>
4699
  <g id="series--hf-kernels-flash-attn3" class="series">
4700
+ <path d="M 83.607806 428.387702 L 226.799032 409.68288 L 369.990258 400.234552 L 513.181484 400.520929 L 656.37271 346.614528 L 799.563935 349.227915 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #9467bd; stroke-width: 1.5; stroke-linecap: square" />
4701
  <defs>
4702
  <path id="m7cd35be9cc" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #9467bd" />
4703
  </defs>
4704
  <g clip-path="url(#p09feef2583)">
4705
  <use ns4:href="#m7cd35be9cc" x="83.607806" y="428.387702" style="fill: #9467bd; stroke: #9467bd" />
4706
+ <use ns4:href="#m7cd35be9cc" x="226.799032" y="409.68288" style="fill: #9467bd; stroke: #9467bd" />
4707
+ <use ns4:href="#m7cd35be9cc" x="369.990258" y="400.234552" style="fill: #9467bd; stroke: #9467bd" />
4708
+ <use ns4:href="#m7cd35be9cc" x="513.181484" y="400.520929" style="fill: #9467bd; stroke: #9467bd" />
4709
+ <use ns4:href="#m7cd35be9cc" x="656.37271" y="346.614528" style="fill: #9467bd; stroke: #9467bd" />
4710
+ <use ns4:href="#m7cd35be9cc" x="799.563935" y="349.227915" style="fill: #9467bd; stroke: #9467bd" />
4711
  </g>
4712
  </g>
4713
  <g id="patch_3">
index.html CHANGED
@@ -3928,8 +3928,9 @@ uvx https://github.com/drbh/uvnote.git build benches
3928
  </div>
3929
 
3930
  <h2>ACTIVATION FUNCTIONS</h2>
 
3931
  <div class="artifact-preview">
3932
- <img src="activation/results/artifacts/combine/latency.svg" alt="Activation Latency" width="800">
3933
  </div>
3934
 
3935
  <table>
@@ -3959,6 +3960,31 @@ uvx https://github.com/drbh/uvnote.git build benches
3959
  </tr>
3960
  </tbody>
3961
  </table>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3962
  <p align="center">
3963
  <button
3964
  onclick="window.location.href='activation/'"
 
3928
  </div>
3929
 
3930
  <h2>ACTIVATION FUNCTIONS</h2>
3931
+ <h3>Linux (CUDA)</h3>
3932
  <div class="artifact-preview">
3933
+ <img src="activation/results_linux/artifacts/combine/latency.svg" alt="Activation Latency (Linux)" width="800">
3934
  </div>
3935
 
3936
  <table>
 
3960
  </tr>
3961
  </tbody>
3962
  </table>
3963
+ <h3>macOS (MPS/CPU)</h3>
3964
+ <div class="artifact-preview">
3965
+ <img src="activation/results_darwin/artifacts/combine/latency.svg" alt="Activation Latency (macOS)" width="800">
3966
+ </div>
3967
+
3968
+ <table>
3969
+ <thead>
3970
+ <tr>
3971
+ <th>Implementation</th>
3972
+ <th>Description</th>
3973
+ <th>Source</th>
3974
+ <th>HF</th>
3975
+ <th>Bench</th>
3976
+ </tr>
3977
+ </thead>
3978
+ <tbody>
3979
+ <tr>
3980
+ <td>PyTorch SwiGLU (macOS)</td>
3981
+ <td>PyTorch native SwiGLU on macOS</td>
3982
+ <td>-</td>
3983
+ <td>-</td>
3984
+ <td><a href="activation/impls/torch_swiglu_darwin.html">Bench</a></td>
3985
+ </tr>
3986
+ </tbody>
3987
+ </table>
3988
  <p align="center">
3989
  <button
3990
  onclick="window.location.href='activation/'"
layer_norm/impls/artifacts/benchmark/layer_norm.jsonl CHANGED
@@ -1,4 +1,4 @@
1
- {"ts": "2025-12-19T18:57:07Z", "run": "8a911691677c4be4b2377923d73cef2c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D4096", "batch": 16, "seq_len": 2048, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.8283059999598663, "p50": 0.8335360000160108, "p90": 0.8356760000083341, "mean": 0.8340919999909602, "iqr": 0.0024800000346658635, "raw_times": [0.8356760000083341, 0.8397459999969215, 0.8331959999736682, 0.8335360000160108, 0.8283059999598663], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.8370360000071742, "peak_bytes": 2415935488, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null}
2
- {"ts": "2025-12-19T18:57:07Z", "run": "8a911691677c4be4b2377923d73cef2c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D8192", "batch": 16, "seq_len": 2048, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.6452309999976933, "p50": 1.6598209999756364, "p90": 1.6613920000168036, "mean": 1.6622576000031586, "iqr": 0.0022199999989425123, "raw_times": [1.6591720000178611, 1.6613920000168036, 1.6598209999756364, 1.6856720000077985, 1.6452309999976933], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.654771999994864, "peak_bytes": 4831870976, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1086463928222656e-05, "ref": "layer_norm_ref"}, "err": null}
3
- {"ts": "2025-12-19T18:57:07Z", "run": "8a911691677c4be4b2377923d73cef2c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D4096", "batch": 16, "seq_len": 4096, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.6495710000299368, "p50": 1.651621000007708, "p90": 1.6563920000294274, "mean": 1.6539776000172424, "iqr": 0.0065000000404324965, "raw_times": [1.649891999988995, 1.6624120000301446, 1.6563920000294274, 1.6495710000299368, 1.651621000007708], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.6589109999927132, "peak_bytes": 4831854592, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null}
4
- {"ts": "2025-12-19T18:57:07Z", "run": "8a911691677c4be4b2377923d73cef2c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D8192", "batch": 16, "seq_len": 4096, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 3.243421000036051, "p50": 3.2525119999604613, "p90": 3.2605619999799274, "mean": 3.252856000005977, "iqr": 0.017038999942542432, "raw_times": [3.2525119999604613, 3.2642620000160605, 3.2605619999799274, 3.243421000036051, 3.243523000037385], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.250041000001147, "peak_bytes": 9663709184, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1026859283447266e-05, "ref": "layer_norm_ref"}, "err": null}
 
1
+ {"ts": "2025-12-19T19:41:22Z", "run": "e4072b52508346c79afed4185ddfbd8a", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D4096", "batch": 16, "seq_len": 2048, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.8300210000129482, "p50": 0.8342720000200643, "p90": 0.83692099997279, "mean": 0.8337814000014987, "iqr": 0.006369000004724512, "raw_times": [0.8300210000129482, 0.83692099997279, 0.8342720000200643, 0.8305519999680655, 0.8371410000336255], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.8391320000100677, "peak_bytes": 2415935488, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null}
2
+ {"ts": "2025-12-19T19:41:22Z", "run": "e4072b52508346c79afed4185ddfbd8a", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D8192", "batch": 16, "seq_len": 2048, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.6468019999820172, "p50": 1.6505419999930382, "p90": 1.6509619999851566, "mean": 1.650563999987753, "iqr": 0.0014700000292577897, "raw_times": [1.6509619999851566, 1.6505419999930382, 1.6468019999820172, 1.6494919999558988, 1.6550220000226545], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.6566229999739335, "peak_bytes": 4831870976, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1086463928222656e-05, "ref": "layer_norm_ref"}, "err": null}
3
+ {"ts": "2025-12-19T19:41:22Z", "run": "e4072b52508346c79afed4185ddfbd8a", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D4096", "batch": 16, "seq_len": 4096, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.643002000037086, "p50": 1.6493729999638163, "p90": 1.6502219999665613, "mean": 1.6475743999876613, "iqr": 0.0068199999532225775, "raw_times": [1.643002000037086, 1.6434020000133387, 1.6502219999665613, 1.6493729999638163, 1.6518729999575044], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.6485120000311326, "peak_bytes": 4831854592, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null}
4
+ {"ts": "2025-12-19T19:41:22Z", "run": "e4072b52508346c79afed4185ddfbd8a", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D8192", "batch": 16, "seq_len": 4096, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 3.2460340000284305, "p50": 3.2577039999637236, "p90": 3.260522999994464, "mean": 3.2551376000014898, "iqr": 0.011509999978898122, "raw_times": [3.260522999994464, 3.2460340000284305, 3.249013000015566, 3.2577039999637236, 3.262414000005265], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.2401029999959974, "peak_bytes": 9663709184, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1026859283447266e-05, "ref": "layer_norm_ref"}, "err": null}
layer_norm/impls/hf_kernels_layer_norm.html CHANGED
@@ -3889,7 +3889,7 @@ body[data-tool="eraser"] .main-content {
3889
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3890
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3891
  </span> |
3892
- Cell: benchmark | 6.26s
3893
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3894
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3895
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3961,19 +3961,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S2048_D4096
3961
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3962
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3963
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3964
- hf_kernels_layer_norm 4.17% 177.304us 48.13% 2.048ms 2.048ms 0.000us 0.00% 3.167ms 3.167ms 1
3965
- _layer_norm_f8ec252::dropout_add_ln_fwd 1.47% 62.693us 43.45% 1.849ms 616.229us 2.429ms 100.00% 3.167ms 1.056ms 3
3966
- hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.430ms 100.06% 2.430ms 2.430ms 1
3967
- void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 2.429ms 100.00% 2.429ms 809.553us 3
3968
- Activity Buffer Request 39.70% 1.689ms 39.70% 1.689ms 1.689ms 738.629us 30.41% 738.629us 738.629us 1
3969
- aten::view 0.51% 21.739us 0.51% 21.739us 3.623us 0.000us 0.00% 0.000us 0.000us 6
3970
- aten::empty 1.04% 44.400us 1.04% 44.400us 4.933us 0.000us 0.00% 0.000us 0.000us 9
3971
- cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.22% 9.310us 0.22% 9.310us 3.103us 0.000us 0.00% 0.000us 0.000us 3
3972
- cudaLaunchKernel 1.01% 43.131us 1.01% 43.131us 14.377us 0.000us 0.00% 0.000us 0.000us 3
3973
- cudaDeviceSynchronize 51.87% 2.207ms 51.87% 2.207ms 2.207ms 0.000us 0.00% 0.000us 0.000us 1
3974
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3975
- Self CPU time total: 4.255ms
3976
- Self CUDA time total: 2.429ms
3977
 
3978
 
3979
 
@@ -3983,19 +3983,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S2048_D8192
3983
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3984
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3985
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3986
- hf_kernels_layer_norm 2.14% 140.133us 29.32% 1.923ms 1.923ms 0.000us 0.00% 6.388ms 6.388ms 1
3987
- _layer_norm_f8ec252::dropout_add_ln_fwd 0.69% 45.053us 27.01% 1.772ms 590.648us 4.807ms 100.00% 6.388ms 2.129ms 3
3988
- hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.808ms 100.03% 4.808ms 4.808ms 1
3989
- void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.807ms 100.00% 4.807ms 1.602ms 3
3990
- Activity Buffer Request 25.34% 1.663ms 25.34% 1.663ms 1.663ms 1.581ms 32.89% 1.581ms 1.581ms 1
3991
- aten::view 0.17% 11.390us 0.17% 11.390us 1.898us 0.000us 0.00% 0.000us 0.000us 6
3992
- aten::empty 0.45% 29.620us 0.45% 29.620us 3.291us 0.000us 0.00% 0.000us 0.000us 9
3993
- cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.07% 4.820us 0.07% 4.820us 1.607us 0.000us 0.00% 0.000us 0.000us 3
3994
- cudaLaunchKernel 0.46% 29.860us 0.46% 29.860us 9.953us 0.000us 0.00% 0.000us 0.000us 3
3995
- cudaDeviceSynchronize 70.68% 4.637ms 70.68% 4.637ms 4.637ms 0.000us 0.00% 0.000us 0.000us 1
3996
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3997
- Self CPU time total: 6.560ms
3998
- Self CUDA time total: 4.807ms
3999
 
4000
 
4001
 
@@ -4005,19 +4005,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S4096_D4096
4005
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4006
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4007
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4008
- hf_kernels_layer_norm 1.98% 129.253us 29.33% 1.919ms 1.919ms 0.000us 0.00% 6.330ms 6.330ms 1
4009
- _layer_norm_f8ec252::dropout_add_ln_fwd 0.71% 46.780us 27.18% 1.779ms 592.854us 4.774ms 100.00% 6.330ms 2.110ms 3
4010
- hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.775ms 100.03% 4.775ms 4.775ms 1
4011
- void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.774ms 100.00% 4.774ms 1.591ms 3
4012
- Activity Buffer Request 25.49% 1.668ms 25.49% 1.668ms 1.668ms 1.556ms 32.59% 1.556ms 1.556ms 1
4013
- aten::view 0.17% 11.271us 0.17% 11.271us 1.879us 0.000us 0.00% 0.000us 0.000us 6
4014
- aten::empty 0.45% 29.221us 0.45% 29.221us 3.247us 0.000us 0.00% 0.000us 0.000us 9
4015
- cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.08% 4.980us 0.08% 4.980us 1.660us 0.000us 0.00% 0.000us 0.000us 3
4016
- cudaLaunchKernel 0.45% 29.470us 0.45% 29.470us 9.823us 0.000us 0.00% 0.000us 0.000us 3
4017
- cudaDeviceSynchronize 70.67% 4.624ms 70.67% 4.624ms 4.624ms 0.000us 0.00% 0.000us 0.000us 1
4018
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4019
- Self CPU time total: 6.543ms
4020
- Self CUDA time total: 4.774ms
4021
 
4022
 
4023
 
@@ -4027,36 +4027,40 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S4096_D8192
4027
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4028
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4029
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4030
- hf_kernels_layer_norm 1.22% 142.314us 18.53% 2.155ms 2.155ms 0.000us 0.00% 12.836ms 12.836ms 1
4031
- _layer_norm_f8ec252::dropout_add_ln_fwd 0.38% 44.492us 17.20% 2.000ms 666.802us 9.636ms 100.00% 12.836ms 4.279ms 3
4032
- hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 9.637ms 100.02% 9.637ms 9.637ms 1
4033
- void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 9.636ms 100.00% 9.636ms 3.212ms 3
4034
- Activity Buffer Request 14.57% 1.694ms 14.57% 1.694ms 1.694ms 3.200ms 33.21% 3.200ms 3.200ms 1
4035
- aten::view 0.10% 12.130us 0.10% 12.130us 2.022us 0.000us 0.00% 0.000us 0.000us 6
4036
- aten::empty 0.25% 29.499us 0.25% 29.499us 3.278us 0.000us 0.00% 0.000us 0.000us 9
4037
- cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.04% 4.820us 0.04% 4.820us 1.607us 0.000us 0.00% 0.000us 0.000us 3
4038
- cudaLaunchKernel 1.96% 227.814us 1.96% 227.814us 75.938us 0.000us 0.00% 0.000us 0.000us 3
4039
- cudaDeviceSynchronize 81.47% 9.472ms 81.47% 9.472ms 9.472ms 0.000us 0.00% 0.000us 0.000us 1
4040
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4041
- Self CPU time total: 11.627ms
4042
- Self CUDA time total: 9.636ms
4043
 
4044
 
4045
  impl wl p50(ms) ok
4046
  hf_kernels_layer_norm LN_B16_S2048_D4096 0.83 True
4047
- hf_kernels_layer_norm LN_B16_S2048_D8192 1.66 True
4048
  hf_kernels_layer_norm LN_B16_S4096_D4096 1.65 True
4049
- hf_kernels_layer_norm LN_B16_S4096_D8192 3.25 True
4050
  </pre></div>
4051
  <div class="uv-install-logs" id="uv-logs-benchmark">
4052
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4053
  <div class="uv-logs-content" style="display: none;">
 
 
4054
  Installed 14 packages in 12ms
4055
  </div>
4056
  </div>
4057
- <div class="cell-stderr">Fetching 4 files: 0%| | 0/4 [00:00&lt;?, ?it/s]
4058
- Fetching 4 files: 50%|█████ | 2/4 [00:01&lt;00:01, 1.35it/s]
4059
- Fetching 4 files: 100%|██████████| 4/4 [00:01&lt;00:00, 2.71it/s]</div>
 
 
4060
  <div class="cell-artifacts">
4061
  <h4>Artifacts:</h4>
4062
  <a href="artifacts/benchmark/layer_norm.jsonl" class="artifact" target="_blank">layer_norm.jsonl</a>
 
3889
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3890
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3891
  </span> |
3892
+ Cell: benchmark | 6.61s
3893
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3894
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3895
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3961
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3962
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3963
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3964
+ hf_kernels_layer_norm 4.34% 182.243us 49.12% 2.065ms 2.065ms 0.000us 0.00% 3.103ms 3.103ms 1
3965
+ _layer_norm_f8ec252::dropout_add_ln_fwd 1.54% 64.542us 44.23% 1.860ms 619.846us 2.366ms 100.00% 3.103ms 1.034ms 3
3966
+ hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.367ms 100.06% 2.367ms 2.367ms 1
3967
+ void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 2.366ms 100.00% 2.366ms 788.551us 3
3968
+ Activity Buffer Request 40.34% 1.696ms 40.34% 1.696ms 1.696ms 737.372us 31.17% 737.372us 737.372us 1
3969
+ aten::view 0.55% 23.192us 0.55% 23.192us 3.865us 0.000us 0.00% 0.000us 0.000us 6
3970
+ aten::empty 1.11% 46.641us 1.11% 46.641us 5.182us 0.000us 0.00% 0.000us 0.000us 9
3971
+ cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.21% 8.950us 0.21% 8.950us 2.983us 0.000us 0.00% 0.000us 0.000us 3
3972
+ cudaLaunchKernel 1.04% 43.741us 1.04% 43.741us 14.580us 0.000us 0.00% 0.000us 0.000us 3
3973
+ cudaDeviceSynchronize 50.88% 2.139ms 50.88% 2.139ms 2.139ms 0.000us 0.00% 0.000us 0.000us 1
3974
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3975
+ Self CPU time total: 4.204ms
3976
+ Self CUDA time total: 2.366ms
3977
 
3978
 
3979
 
 
3983
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3984
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3985
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3986
+ hf_kernels_layer_norm 2.14% 142.004us 28.99% 1.924ms 1.924ms 0.000us 0.00% 6.477ms 6.477ms 1
3987
+ _layer_norm_f8ec252::dropout_add_ln_fwd 0.66% 43.639us 26.68% 1.771ms 590.278us 4.886ms 100.00% 6.477ms 2.159ms 3
3988
+ hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.887ms 100.03% 4.887ms 4.887ms 1
3989
+ void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.886ms 100.00% 4.886ms 1.629ms 3
3990
+ Activity Buffer Request 25.01% 1.660ms 25.01% 1.660ms 1.660ms 1.591ms 32.57% 1.591ms 1.591ms 1
3991
+ aten::view 0.17% 11.341us 0.17% 11.341us 1.890us 0.000us 0.00% 0.000us 0.000us 6
3992
+ aten::empty 0.47% 31.442us 0.47% 31.442us 3.494us 0.000us 0.00% 0.000us 0.000us 9
3993
+ cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.07% 4.640us 0.07% 4.640us 1.547us 0.000us 0.00% 0.000us 0.000us 3
3994
+ cudaLaunchKernel 0.46% 30.730us 0.46% 30.730us 10.243us 0.000us 0.00% 0.000us 0.000us 3
3995
+ cudaDeviceSynchronize 71.01% 4.714ms 71.01% 4.714ms 4.714ms 0.000us 0.00% 0.000us 0.000us 1
3996
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3997
+ Self CPU time total: 6.638ms
3998
+ Self CUDA time total: 4.886ms
3999
 
4000
 
4001
 
 
4005
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4006
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4007
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4008
+ hf_kernels_layer_norm 1.93% 128.176us 30.23% 2.007ms 2.007ms 0.000us 0.00% 6.371ms 6.371ms 1
4009
+ _layer_norm_f8ec252::dropout_add_ln_fwd 0.67% 44.789us 28.12% 1.867ms 622.462us 4.799ms 100.00% 6.371ms 2.124ms 3
4010
+ hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.800ms 100.03% 4.800ms 4.800ms 1
4011
+ void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.799ms 100.00% 4.799ms 1.600ms 3
4012
+ Activity Buffer Request 26.44% 1.756ms 26.44% 1.756ms 1.756ms 1.572ms 32.76% 1.572ms 1.572ms 1
4013
+ aten::view 0.18% 11.888us 0.18% 11.888us 1.981us 0.000us 0.00% 0.000us 0.000us 6
4014
+ aten::empty 0.47% 31.493us 0.47% 31.493us 3.499us 0.000us 0.00% 0.000us 0.000us 9
4015
+ cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.07% 4.790us 0.07% 4.790us 1.597us 0.000us 0.00% 0.000us 0.000us 3
4016
+ cudaLaunchKernel 0.46% 30.490us 0.46% 30.490us 10.163us 0.000us 0.00% 0.000us 0.000us 3
4017
+ cudaDeviceSynchronize 69.77% 4.633ms 69.77% 4.633ms 4.633ms 0.000us 0.00% 0.000us 0.000us 1
4018
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4019
+ Self CPU time total: 6.641ms
4020
+ Self CUDA time total: 4.799ms
4021
 
4022
 
4023
 
 
4027
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4028
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4029
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4030
+ hf_kernels_layer_norm 1.63% 190.425us 19.77% 2.315ms 2.315ms 0.000us 0.00% 12.766ms 12.766ms 1
4031
+ _layer_norm_f8ec252::dropout_add_ln_fwd 0.56% 65.132us 17.99% 2.107ms 702.188us 9.610ms 100.00% 12.766ms 4.255ms 3
4032
+ hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 9.611ms 100.01% 9.611ms 9.611ms 1
4033
+ void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 9.610ms 100.00% 9.610ms 3.203ms 3
4034
+ Activity Buffer Request 14.43% 1.690ms 14.43% 1.690ms 1.690ms 3.156ms 32.84% 3.156ms 3.156ms 1
4035
+ aten::view 0.16% 18.311us 0.16% 18.311us 3.052us 0.000us 0.00% 0.000us 0.000us 6
4036
+ aten::empty 0.27% 31.990us 0.27% 31.990us 3.554us 0.000us 0.00% 0.000us 0.000us 9
4037
+ cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.06% 6.981us 0.06% 6.981us 2.327us 0.000us 0.00% 0.000us 0.000us 3
4038
+ cudaLaunchKernel 2.67% 312.827us 2.67% 312.827us 104.276us 0.000us 0.00% 0.000us 0.000us 3
4039
+ cudaDeviceSynchronize 80.23% 9.393ms 80.23% 9.393ms 9.393ms 0.000us 0.00% 0.000us 0.000us 1
4040
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4041
+ Self CPU time total: 11.708ms
4042
+ Self CUDA time total: 9.610ms
4043
 
4044
 
4045
  impl wl p50(ms) ok
4046
  hf_kernels_layer_norm LN_B16_S2048_D4096 0.83 True
4047
+ hf_kernels_layer_norm LN_B16_S2048_D8192 1.65 True
4048
  hf_kernels_layer_norm LN_B16_S4096_D4096 1.65 True
4049
+ hf_kernels_layer_norm LN_B16_S4096_D8192 3.26 True
4050
  </pre></div>
4051
  <div class="uv-install-logs" id="uv-logs-benchmark">
4052
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4053
  <div class="uv-logs-content" style="display: none;">
4054
+ Downloading hf-xet (3.2MiB)
4055
+ Downloaded hf-xet
4056
  Installed 14 packages in 12ms
4057
  </div>
4058
  </div>
4059
+ <div class="cell-stderr">Fetching 4 files: 0%| | 0/4 [00:00&lt;?, ?it/s]Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.
4060
+
4061
+ Fetching 4 files: 25%|██▌ | 1/4 [00:00&lt;00:00, 8.01it/s]
4062
+ Fetching 4 files: 50%|█████ | 2/4 [00:01&lt;00:01, 1.00it/s]
4063
+ Fetching 4 files: 100%|██████████| 4/4 [00:01&lt;00:00, 2.30it/s]</div>
4064
  <div class="cell-artifacts">
4065
  <h4>Artifacts:</h4>
4066
  <a href="artifacts/benchmark/layer_norm.jsonl" class="artifact" target="_blank">layer_norm.jsonl</a>
layer_norm/impls/torch_layer_norm.html CHANGED
@@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content {
3888
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3890
  </span> |
3891
- Cell: nv | 0.25s
3892
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3894
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3904,7 +3904,7 @@ Cell: nv | 0.25s
3904
  </div>
3905
  </div>
3906
  <div id="output-nv" class="cell-output">
3907
- <div class="cell-stdout"><pre class="stdout-text">Fri Dec 19 18:56:51 2025
3908
  +-----------------------------------------------------------------------------------------+
3909
  | NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 |
3910
  +-----------------------------------------+------------------------+----------------------+
@@ -3913,7 +3913,7 @@ Cell: nv | 0.25s
3913
  | | | MIG M. |
3914
  |=========================================+========================+======================|
3915
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3916
- | N/A 34C P0 107W / 350W | 0MiB / 46068MiB | 53% Default |
3917
  | | | N/A |
3918
  +-----------------------------------------+------------------------+----------------------+
3919
 
@@ -3937,7 +3937,7 @@ Cell: nv | 0.25s
3937
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3938
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3939
  </span> |
3940
- Cell: benchmark | 7.79s
3941
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3942
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3943
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3985,19 +3985,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S2048_D4096
3985
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3986
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3987
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3988
- torch_layer_norm 3.56% 149.102us 49.42% 2.068ms 2.068ms 0.000us 0.00% 3.039ms 3.039ms 1
3989
- aten::layer_norm 0.35% 14.790us 45.86% 1.919ms 639.751us 0.000us 0.00% 3.039ms 1.013ms 3
3990
- aten::native_layer_norm 1.65% 69.001us 45.51% 1.904ms 634.821us 2.327ms 100.00% 3.039ms 1.013ms 3
3991
- torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.329ms 100.06% 2.329ms 2.329ms 1
3992
- void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 2.327ms 100.00% 2.327ms 775.759us 3
3993
- Activity Buffer Request 41.45% 1.735ms 41.45% 1.735ms 1.735ms 711.588us 30.58% 711.588us 711.588us 1
3994
- aten::empty 1.11% 46.511us 1.11% 46.511us 5.168us 0.000us 0.00% 0.000us 0.000us 9
3995
- cudaLaunchKernel 1.13% 47.301us 1.13% 47.301us 15.767us 0.000us 0.00% 0.000us 0.000us 3
3996
- aten::view 0.16% 6.890us 0.16% 6.890us 1.148us 0.000us 0.00% 0.000us 0.000us 6
3997
- cudaDeviceSynchronize 50.58% 2.117ms 50.58% 2.117ms 2.117ms 0.000us 0.00% 0.000us 0.000us 1
3998
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3999
- Self CPU time total: 4.185ms
4000
- Self CUDA time total: 2.327ms
4001
 
4002
 
4003
 
@@ -4007,19 +4007,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S2048_D8192
4007
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4008
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4009
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4010
- torch_layer_norm 1.05% 70.042us 28.68% 1.911ms 1.911ms 0.000us 0.00% 6.475ms 6.475ms 1
4011
- aten::layer_norm 0.13% 8.728us 27.63% 1.841ms 613.810us 0.000us 0.00% 6.475ms 2.158ms 3
4012
- aten::native_layer_norm 0.73% 48.442us 27.50% 1.833ms 610.901us 4.886ms 100.00% 6.475ms 2.158ms 3
4013
- torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.888ms 100.03% 4.888ms 4.888ms 1
4014
- void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 4.886ms 100.00% 4.886ms 1.629ms 3
4015
- Activity Buffer Request 25.85% 1.723ms 25.85% 1.723ms 1.723ms 1.589ms 32.51% 1.589ms 1.589ms 1
4016
- aten::empty 0.43% 28.711us 0.43% 28.711us 3.190us 0.000us 0.00% 0.000us 0.000us 9
4017
- cudaLaunchKernel 0.44% 29.201us 0.44% 29.201us 9.734us 0.000us 0.00% 0.000us 0.000us 3
4018
- aten::view 0.06% 3.829us 0.06% 3.829us 0.638us 0.000us 0.00% 0.000us 0.000us 6
4019
- cudaDeviceSynchronize 71.32% 4.753ms 71.32% 4.753ms 4.753ms 0.000us 0.00% 0.000us 0.000us 1
4020
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4021
- Self CPU time total: 6.665ms
4022
- Self CUDA time total: 4.886ms
4023
 
4024
 
4025
 
@@ -4029,19 +4029,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S4096_D4096
4029
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4030
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4031
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4032
- torch_layer_norm 1.06% 69.120us 29.93% 1.960ms 1.960ms 0.000us 0.00% 6.232ms 6.232ms 1
4033
- aten::layer_norm 0.13% 8.631us 28.88% 1.891ms 630.434us 0.000us 0.00% 6.232ms 2.077ms 3
4034
- aten::native_layer_norm 0.71% 46.790us 28.75% 1.883ms 627.557us 4.719ms 100.00% 6.232ms 2.077ms 3
4035
- torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.721ms 100.03% 4.721ms 4.721ms 1
4036
- void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 4.719ms 100.00% 4.719ms 1.573ms 3
4037
- Activity Buffer Request 27.06% 1.772ms 27.06% 1.772ms 1.772ms 1.513ms 32.05% 1.513ms 1.513ms 1
4038
- aten::empty 0.45% 29.333us 0.45% 29.333us 3.259us 0.000us 0.00% 0.000us 0.000us 9
4039
- cudaLaunchKernel 0.46% 30.200us 0.46% 30.200us 10.067us 0.000us 0.00% 0.000us 0.000us 3
4040
- aten::view 0.06% 3.850us 0.06% 3.850us 0.642us 0.000us 0.00% 0.000us 0.000us 6
4041
- cudaDeviceSynchronize 70.07% 4.589ms 70.07% 4.589ms 4.589ms 0.000us 0.00% 0.000us 0.000us 1
4042
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4043
- Self CPU time total: 6.549ms
4044
- Self CUDA time total: 4.719ms
4045
 
4046
 
4047
 
@@ -4051,31 +4051,77 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S4096_D8192
4051
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4052
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4053
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4054
- torch_layer_norm 0.60% 67.701us 14.52% 1.650ms 1.650ms 0.000us 0.00% 13.091ms 13.091ms 1
4055
- aten::layer_norm 0.08% 8.549us 13.92% 1.582ms 527.445us 0.000us 0.00% 13.091ms 4.364ms 3
4056
- aten::native_layer_norm 0.41% 47.051us 13.85% 1.574ms 524.596us 9.846ms 100.00% 13.091ms 4.364ms 3
4057
- torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 9.848ms 100.02% 9.848ms 9.848ms 1
4058
- void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 9.846ms 100.00% 9.846ms 3.282ms 3
4059
- Activity Buffer Request 11.12% 1.264ms 11.12% 1.264ms 1.264ms 3.245ms 32.96% 3.245ms 3.245ms 1
4060
- aten::empty 0.26% 29.420us 0.26% 29.420us 3.269us 0.000us 0.00% 0.000us 0.000us 9
4061
- cudaLaunchKernel 2.02% 229.604us 2.02% 229.604us 76.535us 0.000us 0.00% 0.000us 0.000us 3
4062
- aten::view 0.04% 3.990us 0.04% 3.990us 0.665us 0.000us 0.00% 0.000us 0.000us 6
4063
- cudaDeviceSynchronize 85.48% 9.715ms 85.48% 9.715ms 9.715ms 0.000us 0.00% 0.000us 0.000us 1
4064
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4065
- Self CPU time total: 11.365ms
4066
- Self CUDA time total: 9.846ms
4067
 
4068
 
4069
  impl wl p50(ms) ok
4070
  torch_layer_norm LN_B16_S2048_D4096 0.81 True
4071
  torch_layer_norm LN_B16_S2048_D8192 1.68 True
4072
  torch_layer_norm LN_B16_S4096_D4096 1.61 True
4073
- torch_layer_norm LN_B16_S4096_D8192 3.33 True
4074
  </pre></div>
4075
  <div class="uv-install-logs" id="uv-logs-benchmark">
4076
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4077
  <div class="uv-logs-content" style="display: none;">
4078
- Installed 37 packages in 298ms
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4079
  </div>
4080
  </div>
4081
  <div class="cell-artifacts">
 
3888
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3890
  </span> |
3891
+ Cell: nv | 0.30s
3892
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3894
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
 
3904
  </div>
3905
  </div>
3906
  <div id="output-nv" class="cell-output">
3907
+ <div class="cell-stdout"><pre class="stdout-text">Fri Dec 19 19:40:36 2025
3908
  +-----------------------------------------------------------------------------------------+
3909
  | NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 |
3910
  +-----------------------------------------+------------------------+----------------------+
 
3913
  | | | MIG M. |
3914
  |=========================================+========================+======================|
3915
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3916
+ | N/A 26C P8 24W / 350W | 0MiB / 46068MiB | 0% Default |
3917
  | | | N/A |
3918
  +-----------------------------------------+------------------------+----------------------+
3919
 
 
3937
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3938
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3939
  </span> |
3940
+ Cell: benchmark | 32.13s
3941
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3942
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3943
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3985
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3986
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3987
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3988
+ torch_layer_norm 2.46% 151.464us 66.01% 4.061ms 4.061ms 0.000us 0.00% 3.020ms 3.020ms 1
3989
+ aten::layer_norm 0.24% 14.681us 63.55% 3.910ms 1.303ms 0.000us 0.00% 3.020ms 1.007ms 3
3990
+ aten::native_layer_norm 20.97% 1.290ms 63.31% 3.895ms 1.298ms 2.310ms 100.00% 3.020ms 1.007ms 3
3991
+ torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.311ms 100.06% 2.311ms 2.311ms 1
3992
+ void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 2.310ms 100.00% 2.310ms 770.057us 3
3993
+ Activity Buffer Request 40.34% 2.482ms 40.34% 2.482ms 2.482ms 709.854us 30.73% 709.854us 709.854us 1
3994
+ aten::empty 1.09% 66.873us 1.09% 66.873us 7.430us 0.000us 0.00% 0.000us 0.000us 9
3995
+ cudaLaunchKernel 0.79% 48.731us 0.79% 48.731us 16.244us 0.000us 0.00% 0.000us 0.000us 3
3996
+ aten::view 0.12% 7.460us 0.12% 7.460us 1.243us 0.000us 0.00% 0.000us 0.000us 6
3997
+ cudaDeviceSynchronize 33.99% 2.091ms 33.99% 2.091ms 2.091ms 0.000us 0.00% 0.000us 0.000us 1
3998
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3999
+ Self CPU time total: 6.152ms
4000
+ Self CUDA time total: 2.310ms
4001
 
4002
 
4003
 
 
4007
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4008
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4009
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4010
+ torch_layer_norm 1.07% 70.812us 28.19% 1.857ms 1.857ms 0.000us 0.00% 6.442ms 6.442ms 1
4011
+ aten::layer_norm 0.14% 9.000us 27.11% 1.786ms 595.403us 0.000us 0.00% 6.442ms 2.147ms 3
4012
+ aten::native_layer_norm 0.75% 49.502us 26.98% 1.777ms 592.403us 4.862ms 100.00% 6.442ms 2.147ms 3
4013
+ torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.864ms 100.03% 4.864ms 4.864ms 1
4014
+ void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 4.862ms 100.00% 4.862ms 1.621ms 3
4015
+ Activity Buffer Request 25.31% 1.667ms 25.31% 1.667ms 1.667ms 1.580ms 32.49% 1.580ms 1.580ms 1
4016
+ aten::empty 0.43% 28.150us 0.43% 28.150us 3.128us 0.000us 0.00% 0.000us 0.000us 9
4017
+ cudaLaunchKernel 0.44% 28.800us 0.44% 28.800us 9.600us 0.000us 0.00% 0.000us 0.000us 3
4018
+ aten::view 0.06% 3.751us 0.06% 3.751us 0.625us 0.000us 0.00% 0.000us 0.000us 6
4019
+ cudaDeviceSynchronize 71.81% 4.731ms 71.81% 4.731ms 4.731ms 0.000us 0.00% 0.000us 0.000us 1
4020
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4021
+ Self CPU time total: 6.588ms
4022
+ Self CUDA time total: 4.862ms
4023
 
4024
 
4025
 
 
4029
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4030
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4031
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4032
+ torch_layer_norm 1.08% 70.451us 29.89% 1.957ms 1.957ms 0.000us 0.00% 6.239ms 6.239ms 1
4033
+ aten::layer_norm 0.13% 8.611us 28.81% 1.886ms 628.738us 0.000us 0.00% 6.239ms 2.080ms 3
4034
+ aten::native_layer_norm 0.76% 49.870us 28.68% 1.878ms 625.867us 4.724ms 100.00% 6.239ms 2.080ms 3
4035
+ torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.726ms 100.03% 4.726ms 4.726ms 1
4036
+ void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 4.724ms 100.00% 4.724ms 1.575ms 3
4037
+ Activity Buffer Request 26.98% 1.766ms 26.98% 1.766ms 1.766ms 1.515ms 32.08% 1.515ms 1.515ms 1
4038
+ aten::empty 0.45% 29.490us 0.45% 29.490us 3.277us 0.000us 0.00% 0.000us 0.000us 9
4039
+ cudaLaunchKernel 0.43% 27.941us 0.43% 27.941us 9.314us 0.000us 0.00% 0.000us 0.000us 3
4040
+ aten::view 0.06% 4.101us 0.06% 4.101us 0.684us 0.000us 0.00% 0.000us 0.000us 6
4041
+ cudaDeviceSynchronize 70.11% 4.590ms 70.11% 4.590ms 4.590ms 0.000us 0.00% 0.000us 0.000us 1
4042
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4043
+ Self CPU time total: 6.547ms
4044
+ Self CUDA time total: 4.724ms
4045
 
4046
 
4047
 
 
4051
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4052
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4053
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4054
+ torch_layer_norm 0.65% 74.391us 15.11% 1.731ms 1.731ms 0.000us 0.00% 13.123ms 13.123ms 1
4055
+ aten::layer_norm 0.08% 9.310us 14.46% 1.656ms 552.093us 0.000us 0.00% 13.123ms 4.374ms 3
4056
+ aten::native_layer_norm 0.45% 52.052us 14.38% 1.647ms 548.989us 9.864ms 100.00% 13.123ms 4.374ms 3
4057
+ torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 9.866ms 100.01% 9.866ms 9.866ms 1
4058
+ void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 9.864ms 100.00% 9.864ms 3.288ms 3
4059
+ Activity Buffer Request 11.61% 1.330ms 11.61% 1.330ms 1.330ms 3.258ms 33.03% 3.258ms 3.258ms 1
4060
+ aten::empty 0.27% 31.120us 0.27% 31.120us 3.458us 0.000us 0.00% 0.000us 0.000us 9
4061
+ cudaLaunchKernel 2.01% 229.635us 2.01% 229.635us 76.545us 0.000us 0.00% 0.000us 0.000us 3
4062
+ aten::view 0.04% 4.651us 0.04% 4.651us 0.775us 0.000us 0.00% 0.000us 0.000us 6
4063
+ cudaDeviceSynchronize 84.89% 9.721ms 84.89% 9.721ms 9.721ms 0.000us 0.00% 0.000us 0.000us 1
4064
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4065
+ Self CPU time total: 11.451ms
4066
+ Self CUDA time total: 9.864ms
4067
 
4068
 
4069
  impl wl p50(ms) ok
4070
  torch_layer_norm LN_B16_S2048_D4096 0.81 True
4071
  torch_layer_norm LN_B16_S2048_D8192 1.68 True
4072
  torch_layer_norm LN_B16_S4096_D4096 1.61 True
4073
+ torch_layer_norm LN_B16_S4096_D8192 3.32 True
4074
  </pre></div>
4075
  <div class="uv-install-logs" id="uv-logs-benchmark">
4076
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4077
  <div class="uv-logs-content" style="display: none;">
4078
+ Building kernels-benchmark-tools @ file:///__w/kernels-benchmarks/kernels-benchmarks/tools
4079
+ Downloading sympy (6.0MiB)
4080
+ Downloading nvidia-cusparse-cu12 (274.9MiB)
4081
+ Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
4082
+ Downloading nvidia-cublas-cu12 (566.8MiB)
4083
+ Downloading nvidia-curand-cu12 (60.7MiB)
4084
+ Downloading nvidia-nccl-cu12 (307.4MiB)
4085
+ Downloading nvidia-cufft-cu12 (184.2MiB)
4086
+ Downloading nvidia-cufile-cu12 (1.1MiB)
4087
+ Downloading nvidia-cusolver-cu12 (255.1MiB)
4088
+ Downloading numpy (16.1MiB)
4089
+ Downloading setuptools (1.1MiB)
4090
+ Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
4091
+ Downloading kiwisolver (1.4MiB)
4092
+ Downloading nvidia-cusparselt-cu12 (273.9MiB)
4093
+ Downloading matplotlib (8.3MiB)
4094
+ Downloading nvidia-nvjitlink-cu12 (37.4MiB)
4095
+ Downloading nvidia-cudnn-cu12 (674.0MiB)
4096
+ Downloading fonttools (4.8MiB)
4097
+ Downloading pillow (6.7MiB)
4098
+ Downloading networkx (2.0MiB)
4099
+ Downloading torch (846.9MiB)
4100
+ Downloading triton (148.3MiB)
4101
+ Downloaded nvidia-cufile-cu12
4102
+ Downloaded kiwisolver
4103
+ Downloaded setuptools
4104
+ Downloaded networkx
4105
+ Downloaded fonttools
4106
+ Downloaded pillow
4107
+ Built kernels-benchmark-tools @ file:///__w/kernels-benchmarks/kernels-benchmarks/tools
4108
+ Downloaded nvidia-cuda-cupti-cu12
4109
+ Downloaded matplotlib
4110
+ Downloaded numpy
4111
+ Downloaded sympy
4112
+ Downloaded nvidia-nvjitlink-cu12
4113
+ Downloaded nvidia-curand-cu12
4114
+ Downloaded nvidia-cuda-nvrtc-cu12
4115
+ Downloaded triton
4116
+ Downloaded nvidia-cufft-cu12
4117
+ Downloaded nvidia-cusolver-cu12
4118
+ Downloaded nvidia-cusparselt-cu12
4119
+ Downloaded nvidia-cusparse-cu12
4120
+ Downloaded nvidia-nccl-cu12
4121
+ Downloaded nvidia-cublas-cu12
4122
+ Downloaded nvidia-cudnn-cu12
4123
+ Downloaded torch
4124
+ Installed 37 packages in 284ms
4125
  </div>
4126
  </div>
4127
  <div class="cell-artifacts">
layer_norm/results/artifacts/combine/latency.svg CHANGED

Git LFS Details

  • SHA256: 7e491e55a24ade71662af81d8d2a6705d52134907b596d3edfe9685af71c4890
  • Pointer size: 130 Bytes
  • Size of remote file: 14.6 kB

Git LFS Details

  • SHA256: fa76da3cc0e8c6ec848648e3fa2d66315df4d6c7779fd0c7e2825d697af78f88
  • Pointer size: 130 Bytes
  • Size of remote file: 14.6 kB
layer_norm/results/combined_results.html CHANGED
@@ -3889,7 +3889,7 @@ body[data-tool="eraser"] .main-content {
3889
  <rdf:RDF>
3890
  <ns2:Work>
3891
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
3892
- <dc:date>2025-12-19T19:09:50.663153</dc:date>
3893
  <dc:format>image/svg+xml</dc:format>
3894
  <dc:creator>
3895
  <ns2:Agent>
@@ -3973,70 +3973,70 @@ body[data-tool="eraser"] .main-content {
3973
  <g id="matplotlib.axis_2">
3974
  <g id="ytick_1">
3975
  <g id="grid-y--2" class="grid grid-y">
3976
- <path d="M 47.72 408.774166 L 840.20233 408.774166 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3977
  </g>
3978
  <g id="line2d_5">
3979
  <defs>
3980
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
3981
  </defs>
3982
  <g>
3983
- <use ns4:href="#m0fca2865ba" x="47.72" y="408.774166" style="stroke: #000000; stroke-width: 0.8" />
3984
  </g>
3985
  </g>
3986
  <g id="text_5">
3987
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="412.573385" transform="rotate(-0 40.72 412.573385)">1.0</text>
3988
  </g>
3989
  </g>
3990
  <g id="ytick_2">
3991
  <g id="grid-y--3" class="grid grid-y">
3992
- <path d="M 47.72 330.886714 L 840.20233 330.886714 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3993
  </g>
3994
  <g id="line2d_6">
3995
  <g>
3996
- <use ns4:href="#m0fca2865ba" x="47.72" y="330.886714" style="stroke: #000000; stroke-width: 0.8" />
3997
  </g>
3998
  </g>
3999
  <g id="text_6">
4000
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="334.685933" transform="rotate(-0 40.72 334.685933)">1.5</text>
4001
  </g>
4002
  </g>
4003
  <g id="ytick_3">
4004
  <g id="grid-y--4" class="grid grid-y">
4005
- <path d="M 47.72 252.999261 L 840.20233 252.999261 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4006
  </g>
4007
  <g id="line2d_7">
4008
  <g>
4009
- <use ns4:href="#m0fca2865ba" x="47.72" y="252.999261" style="stroke: #000000; stroke-width: 0.8" />
4010
  </g>
4011
  </g>
4012
  <g id="text_7">
4013
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="256.79848" transform="rotate(-0 40.72 256.79848)">2.0</text>
4014
  </g>
4015
  </g>
4016
  <g id="ytick_4">
4017
  <g id="grid-y--5" class="grid grid-y">
4018
- <path d="M 47.72 175.111809 L 840.20233 175.111809 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4019
  </g>
4020
  <g id="line2d_8">
4021
  <g>
4022
- <use ns4:href="#m0fca2865ba" x="47.72" y="175.111809" style="stroke: #000000; stroke-width: 0.8" />
4023
  </g>
4024
  </g>
4025
  <g id="text_8">
4026
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="178.911028" transform="rotate(-0 40.72 178.911028)">2.5</text>
4027
  </g>
4028
  </g>
4029
  <g id="ytick_5">
4030
  <g id="grid-y--6" class="grid grid-y">
4031
- <path d="M 47.72 97.224356 L 840.20233 97.224356 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4032
  </g>
4033
  <g id="line2d_9">
4034
  <g>
4035
- <use ns4:href="#m0fca2865ba" x="47.72" y="97.224356" style="stroke: #000000; stroke-width: 0.8" />
4036
  </g>
4037
  </g>
4038
  <g id="text_9">
4039
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="101.023575" transform="rotate(-0 40.72 101.023575)">3.0</text>
4040
  </g>
4041
  </g>
4042
  <g id="label--y" class="ylabel">
@@ -4044,27 +4044,27 @@ body[data-tool="eraser"] .main-content {
4044
  </g>
4045
  </g>
4046
  <g id="series--torch-layer-norm" class="series">
4047
- <path d="M 83.741924 437.689571 L 323.888085 302.950354 L 564.034245 314.128917 L 804.180406 46.442361 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4048
  <defs>
4049
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4050
  </defs>
4051
  <g clip-path="url(#p2214f54723)">
4052
  <use ns4:href="#md7efaf3aec" x="83.741924" y="437.689571" style="fill: #1f77b4; stroke: #1f77b4" />
4053
- <use ns4:href="#md7efaf3aec" x="323.888085" y="302.950354" style="fill: #1f77b4; stroke: #1f77b4" />
4054
- <use ns4:href="#md7efaf3aec" x="564.034245" y="314.128917" style="fill: #1f77b4; stroke: #1f77b4" />
4055
  <use ns4:href="#md7efaf3aec" x="804.180406" y="46.442361" style="fill: #1f77b4; stroke: #1f77b4" />
4056
  </g>
4057
  </g>
4058
  <g id="series--hf-kernels-layer-norm" class="series">
4059
- <path d="M 83.741924 434.70508 L 323.888085 305.990613 L 564.034245 307.267967 L 804.180406 57.889324 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4060
  <defs>
4061
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4062
  </defs>
4063
  <g clip-path="url(#p2214f54723)">
4064
- <use ns4:href="#m9b8c54d372" x="83.741924" y="434.70508" style="fill: #ff7f0e; stroke: #ff7f0e" />
4065
- <use ns4:href="#m9b8c54d372" x="323.888085" y="305.990613" style="fill: #ff7f0e; stroke: #ff7f0e" />
4066
- <use ns4:href="#m9b8c54d372" x="564.034245" y="307.267967" style="fill: #ff7f0e; stroke: #ff7f0e" />
4067
- <use ns4:href="#m9b8c54d372" x="804.180406" y="57.889324" style="fill: #ff7f0e; stroke: #ff7f0e" />
4068
  </g>
4069
  </g>
4070
  <g id="patch_3">
@@ -4122,7 +4122,7 @@ body[data-tool="eraser"] .main-content {
4122
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
4123
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
4124
  </span> |
4125
- Cell: combine | 4.51s
4126
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
4127
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
4128
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
@@ -4210,13 +4210,13 @@ COMBINED BENCHMARK SUMMARY
4210
 
4211
  impl wl p50(ms) ok
4212
  hf_kernels_layer_norm LN_B16_S2048_D4096 0.83 True
4213
- hf_kernels_layer_norm LN_B16_S2048_D8192 1.66 True
4214
  hf_kernels_layer_norm LN_B16_S4096_D4096 1.65 True
4215
- hf_kernels_layer_norm LN_B16_S4096_D8192 3.25 True
4216
  torch_layer_norm LN_B16_S2048_D4096 0.81 True
4217
  torch_layer_norm LN_B16_S2048_D8192 1.68 True
4218
  torch_layer_norm LN_B16_S4096_D4096 1.61 True
4219
- torch_layer_norm LN_B16_S4096_D8192 3.33 True
4220
 
4221
  GENERATING COMBINED VISUALIZATION
4222
 
@@ -4236,7 +4236,7 @@ Implementations included:
4236
  <div class="uv-install-logs" id="uv-logs-combine">
4237
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4238
  <div class="uv-logs-content" style="display: none;">
4239
- Installed 37 packages in 297ms
4240
  </div>
4241
  </div>
4242
  <div class="cell-artifacts">
@@ -4249,7 +4249,7 @@ Installed 37 packages in 297ms
4249
  <rdf:RDF>
4250
  <ns2:Work>
4251
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4252
- <dc:date>2025-12-19T19:09:50.663153</dc:date>
4253
  <dc:format>image/svg+xml</dc:format>
4254
  <dc:creator>
4255
  <ns2:Agent>
@@ -4333,70 +4333,70 @@ Installed 37 packages in 297ms
4333
  <g id="matplotlib.axis_2">
4334
  <g id="ytick_1">
4335
  <g id="grid-y--2" class="grid grid-y">
4336
- <path d="M 47.72 408.774166 L 840.20233 408.774166 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4337
  </g>
4338
  <g id="line2d_5">
4339
  <defs>
4340
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4341
  </defs>
4342
  <g>
4343
- <use ns4:href="#m0fca2865ba" x="47.72" y="408.774166" style="stroke: #000000; stroke-width: 0.8" />
4344
  </g>
4345
  </g>
4346
  <g id="text_5">
4347
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="412.573385" transform="rotate(-0 40.72 412.573385)">1.0</text>
4348
  </g>
4349
  </g>
4350
  <g id="ytick_2">
4351
  <g id="grid-y--3" class="grid grid-y">
4352
- <path d="M 47.72 330.886714 L 840.20233 330.886714 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4353
  </g>
4354
  <g id="line2d_6">
4355
  <g>
4356
- <use ns4:href="#m0fca2865ba" x="47.72" y="330.886714" style="stroke: #000000; stroke-width: 0.8" />
4357
  </g>
4358
  </g>
4359
  <g id="text_6">
4360
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="334.685933" transform="rotate(-0 40.72 334.685933)">1.5</text>
4361
  </g>
4362
  </g>
4363
  <g id="ytick_3">
4364
  <g id="grid-y--4" class="grid grid-y">
4365
- <path d="M 47.72 252.999261 L 840.20233 252.999261 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4366
  </g>
4367
  <g id="line2d_7">
4368
  <g>
4369
- <use ns4:href="#m0fca2865ba" x="47.72" y="252.999261" style="stroke: #000000; stroke-width: 0.8" />
4370
  </g>
4371
  </g>
4372
  <g id="text_7">
4373
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="256.79848" transform="rotate(-0 40.72 256.79848)">2.0</text>
4374
  </g>
4375
  </g>
4376
  <g id="ytick_4">
4377
  <g id="grid-y--5" class="grid grid-y">
4378
- <path d="M 47.72 175.111809 L 840.20233 175.111809 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4379
  </g>
4380
  <g id="line2d_8">
4381
  <g>
4382
- <use ns4:href="#m0fca2865ba" x="47.72" y="175.111809" style="stroke: #000000; stroke-width: 0.8" />
4383
  </g>
4384
  </g>
4385
  <g id="text_8">
4386
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="178.911028" transform="rotate(-0 40.72 178.911028)">2.5</text>
4387
  </g>
4388
  </g>
4389
  <g id="ytick_5">
4390
  <g id="grid-y--6" class="grid grid-y">
4391
- <path d="M 47.72 97.224356 L 840.20233 97.224356 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4392
  </g>
4393
  <g id="line2d_9">
4394
  <g>
4395
- <use ns4:href="#m0fca2865ba" x="47.72" y="97.224356" style="stroke: #000000; stroke-width: 0.8" />
4396
  </g>
4397
  </g>
4398
  <g id="text_9">
4399
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="101.023575" transform="rotate(-0 40.72 101.023575)">3.0</text>
4400
  </g>
4401
  </g>
4402
  <g id="label--y" class="ylabel">
@@ -4404,27 +4404,27 @@ Installed 37 packages in 297ms
4404
  </g>
4405
  </g>
4406
  <g id="series--torch-layer-norm" class="series">
4407
- <path d="M 83.741924 437.689571 L 323.888085 302.950354 L 564.034245 314.128917 L 804.180406 46.442361 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4408
  <defs>
4409
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4410
  </defs>
4411
  <g clip-path="url(#p2214f54723)">
4412
  <use ns4:href="#md7efaf3aec" x="83.741924" y="437.689571" style="fill: #1f77b4; stroke: #1f77b4" />
4413
- <use ns4:href="#md7efaf3aec" x="323.888085" y="302.950354" style="fill: #1f77b4; stroke: #1f77b4" />
4414
- <use ns4:href="#md7efaf3aec" x="564.034245" y="314.128917" style="fill: #1f77b4; stroke: #1f77b4" />
4415
  <use ns4:href="#md7efaf3aec" x="804.180406" y="46.442361" style="fill: #1f77b4; stroke: #1f77b4" />
4416
  </g>
4417
  </g>
4418
  <g id="series--hf-kernels-layer-norm" class="series">
4419
- <path d="M 83.741924 434.70508 L 323.888085 305.990613 L 564.034245 307.267967 L 804.180406 57.889324 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4420
  <defs>
4421
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4422
  </defs>
4423
  <g clip-path="url(#p2214f54723)">
4424
- <use ns4:href="#m9b8c54d372" x="83.741924" y="434.70508" style="fill: #ff7f0e; stroke: #ff7f0e" />
4425
- <use ns4:href="#m9b8c54d372" x="323.888085" y="305.990613" style="fill: #ff7f0e; stroke: #ff7f0e" />
4426
- <use ns4:href="#m9b8c54d372" x="564.034245" y="307.267967" style="fill: #ff7f0e; stroke: #ff7f0e" />
4427
- <use ns4:href="#m9b8c54d372" x="804.180406" y="57.889324" style="fill: #ff7f0e; stroke: #ff7f0e" />
4428
  </g>
4429
  </g>
4430
  <g id="patch_3">
 
3889
  <rdf:RDF>
3890
  <ns2:Work>
3891
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
3892
+ <dc:date>2025-12-19T19:55:25.441156</dc:date>
3893
  <dc:format>image/svg+xml</dc:format>
3894
  <dc:creator>
3895
  <ns2:Agent>
 
3973
  <g id="matplotlib.axis_2">
3974
  <g id="ytick_1">
3975
  <g id="grid-y--2" class="grid grid-y">
3976
+ <path d="M 47.72 408.405291 L 840.20233 408.405291 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3977
  </g>
3978
  <g id="line2d_5">
3979
  <defs>
3980
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
3981
  </defs>
3982
  <g>
3983
+ <use ns4:href="#m0fca2865ba" x="47.72" y="408.405291" style="stroke: #000000; stroke-width: 0.8" />
3984
  </g>
3985
  </g>
3986
  <g id="text_5">
3987
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="412.204509" transform="rotate(-0 40.72 412.204509)">1.0</text>
3988
  </g>
3989
  </g>
3990
  <g id="ytick_2">
3991
  <g id="grid-y--3" class="grid grid-y">
3992
+ <path d="M 47.72 330.385445 L 840.20233 330.385445 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3993
  </g>
3994
  <g id="line2d_6">
3995
  <g>
3996
+ <use ns4:href="#m0fca2865ba" x="47.72" y="330.385445" style="stroke: #000000; stroke-width: 0.8" />
3997
  </g>
3998
  </g>
3999
  <g id="text_6">
4000
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="334.184664" transform="rotate(-0 40.72 334.184664)">1.5</text>
4001
  </g>
4002
  </g>
4003
  <g id="ytick_3">
4004
  <g id="grid-y--4" class="grid grid-y">
4005
+ <path d="M 47.72 252.3656 L 840.20233 252.3656 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4006
  </g>
4007
  <g id="line2d_7">
4008
  <g>
4009
+ <use ns4:href="#m0fca2865ba" x="47.72" y="252.3656" style="stroke: #000000; stroke-width: 0.8" />
4010
  </g>
4011
  </g>
4012
  <g id="text_7">
4013
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="256.164819" transform="rotate(-0 40.72 256.164819)">2.0</text>
4014
  </g>
4015
  </g>
4016
  <g id="ytick_4">
4017
  <g id="grid-y--5" class="grid grid-y">
4018
+ <path d="M 47.72 174.345754 L 840.20233 174.345754 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4019
  </g>
4020
  <g id="line2d_8">
4021
  <g>
4022
+ <use ns4:href="#m0fca2865ba" x="47.72" y="174.345754" style="stroke: #000000; stroke-width: 0.8" />
4023
  </g>
4024
  </g>
4025
  <g id="text_8">
4026
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="178.144973" transform="rotate(-0 40.72 178.144973)">2.5</text>
4027
  </g>
4028
  </g>
4029
  <g id="ytick_5">
4030
  <g id="grid-y--6" class="grid grid-y">
4031
+ <path d="M 47.72 96.325909 L 840.20233 96.325909 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4032
  </g>
4033
  <g id="line2d_9">
4034
  <g>
4035
+ <use ns4:href="#m0fca2865ba" x="47.72" y="96.325909" style="stroke: #000000; stroke-width: 0.8" />
4036
  </g>
4037
  </g>
4038
  <g id="text_9">
4039
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="100.125128" transform="rotate(-0 40.72 100.125128)">3.0</text>
4040
  </g>
4041
  </g>
4042
  <g id="label--y" class="ylabel">
 
4044
  </g>
4045
  </g>
4046
  <g id="series--torch-layer-norm" class="series">
4047
+ <path d="M 83.741924 437.689571 L 323.888085 302.503181 L 564.034245 313.112476 L 804.180406 46.442361 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4048
  <defs>
4049
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4050
  </defs>
4051
  <g clip-path="url(#p2214f54723)">
4052
  <use ns4:href="#md7efaf3aec" x="83.741924" y="437.689571" style="fill: #1f77b4; stroke: #1f77b4" />
4053
+ <use ns4:href="#md7efaf3aec" x="323.888085" y="302.503181" style="fill: #1f77b4; stroke: #1f77b4" />
4054
+ <use ns4:href="#md7efaf3aec" x="564.034245" y="313.112476" style="fill: #1f77b4; stroke: #1f77b4" />
4055
  <use ns4:href="#md7efaf3aec" x="804.180406" y="46.442361" style="fill: #1f77b4; stroke: #1f77b4" />
4056
  </g>
4057
  </g>
4058
  <g id="series--hf-kernels-layer-norm" class="series">
4059
+ <path d="M 83.741924 434.265436 L 323.888085 306.894918 L 564.034245 307.077328 L 804.180406 56.113857 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4060
  <defs>
4061
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4062
  </defs>
4063
  <g clip-path="url(#p2214f54723)">
4064
+ <use ns4:href="#m9b8c54d372" x="83.741924" y="434.265436" style="fill: #ff7f0e; stroke: #ff7f0e" />
4065
+ <use ns4:href="#m9b8c54d372" x="323.888085" y="306.894918" style="fill: #ff7f0e; stroke: #ff7f0e" />
4066
+ <use ns4:href="#m9b8c54d372" x="564.034245" y="307.077328" style="fill: #ff7f0e; stroke: #ff7f0e" />
4067
+ <use ns4:href="#m9b8c54d372" x="804.180406" y="56.113857" style="fill: #ff7f0e; stroke: #ff7f0e" />
4068
  </g>
4069
  </g>
4070
  <g id="patch_3">
 
4122
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
4123
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
4124
  </span> |
4125
+ Cell: combine | 4.63s
4126
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
4127
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
4128
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
 
4210
 
4211
  impl wl p50(ms) ok
4212
  hf_kernels_layer_norm LN_B16_S2048_D4096 0.83 True
4213
+ hf_kernels_layer_norm LN_B16_S2048_D8192 1.65 True
4214
  hf_kernels_layer_norm LN_B16_S4096_D4096 1.65 True
4215
+ hf_kernels_layer_norm LN_B16_S4096_D8192 3.26 True
4216
  torch_layer_norm LN_B16_S2048_D4096 0.81 True
4217
  torch_layer_norm LN_B16_S2048_D8192 1.68 True
4218
  torch_layer_norm LN_B16_S4096_D4096 1.61 True
4219
+ torch_layer_norm LN_B16_S4096_D8192 3.32 True
4220
 
4221
  GENERATING COMBINED VISUALIZATION
4222
 
 
4236
  <div class="uv-install-logs" id="uv-logs-combine">
4237
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4238
  <div class="uv-logs-content" style="display: none;">
4239
+ Installed 37 packages in 299ms
4240
  </div>
4241
  </div>
4242
  <div class="cell-artifacts">
 
4249
  <rdf:RDF>
4250
  <ns2:Work>
4251
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4252
+ <dc:date>2025-12-19T19:55:25.441156</dc:date>
4253
  <dc:format>image/svg+xml</dc:format>
4254
  <dc:creator>
4255
  <ns2:Agent>
 
4333
  <g id="matplotlib.axis_2">
4334
  <g id="ytick_1">
4335
  <g id="grid-y--2" class="grid grid-y">
4336
+ <path d="M 47.72 408.405291 L 840.20233 408.405291 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4337
  </g>
4338
  <g id="line2d_5">
4339
  <defs>
4340
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4341
  </defs>
4342
  <g>
4343
+ <use ns4:href="#m0fca2865ba" x="47.72" y="408.405291" style="stroke: #000000; stroke-width: 0.8" />
4344
  </g>
4345
  </g>
4346
  <g id="text_5">
4347
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="412.204509" transform="rotate(-0 40.72 412.204509)">1.0</text>
4348
  </g>
4349
  </g>
4350
  <g id="ytick_2">
4351
  <g id="grid-y--3" class="grid grid-y">
4352
+ <path d="M 47.72 330.385445 L 840.20233 330.385445 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4353
  </g>
4354
  <g id="line2d_6">
4355
  <g>
4356
+ <use ns4:href="#m0fca2865ba" x="47.72" y="330.385445" style="stroke: #000000; stroke-width: 0.8" />
4357
  </g>
4358
  </g>
4359
  <g id="text_6">
4360
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="334.184664" transform="rotate(-0 40.72 334.184664)">1.5</text>
4361
  </g>
4362
  </g>
4363
  <g id="ytick_3">
4364
  <g id="grid-y--4" class="grid grid-y">
4365
+ <path d="M 47.72 252.3656 L 840.20233 252.3656 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4366
  </g>
4367
  <g id="line2d_7">
4368
  <g>
4369
+ <use ns4:href="#m0fca2865ba" x="47.72" y="252.3656" style="stroke: #000000; stroke-width: 0.8" />
4370
  </g>
4371
  </g>
4372
  <g id="text_7">
4373
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="256.164819" transform="rotate(-0 40.72 256.164819)">2.0</text>
4374
  </g>
4375
  </g>
4376
  <g id="ytick_4">
4377
  <g id="grid-y--5" class="grid grid-y">
4378
+ <path d="M 47.72 174.345754 L 840.20233 174.345754 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4379
  </g>
4380
  <g id="line2d_8">
4381
  <g>
4382
+ <use ns4:href="#m0fca2865ba" x="47.72" y="174.345754" style="stroke: #000000; stroke-width: 0.8" />
4383
  </g>
4384
  </g>
4385
  <g id="text_8">
4386
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="178.144973" transform="rotate(-0 40.72 178.144973)">2.5</text>
4387
  </g>
4388
  </g>
4389
  <g id="ytick_5">
4390
  <g id="grid-y--6" class="grid grid-y">
4391
+ <path d="M 47.72 96.325909 L 840.20233 96.325909 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4392
  </g>
4393
  <g id="line2d_9">
4394
  <g>
4395
+ <use ns4:href="#m0fca2865ba" x="47.72" y="96.325909" style="stroke: #000000; stroke-width: 0.8" />
4396
  </g>
4397
  </g>
4398
  <g id="text_9">
4399
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="100.125128" transform="rotate(-0 40.72 100.125128)">3.0</text>
4400
  </g>
4401
  </g>
4402
  <g id="label--y" class="ylabel">
 
4404
  </g>
4405
  </g>
4406
  <g id="series--torch-layer-norm" class="series">
4407
+ <path d="M 83.741924 437.689571 L 323.888085 302.503181 L 564.034245 313.112476 L 804.180406 46.442361 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4408
  <defs>
4409
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4410
  </defs>
4411
  <g clip-path="url(#p2214f54723)">
4412
  <use ns4:href="#md7efaf3aec" x="83.741924" y="437.689571" style="fill: #1f77b4; stroke: #1f77b4" />
4413
+ <use ns4:href="#md7efaf3aec" x="323.888085" y="302.503181" style="fill: #1f77b4; stroke: #1f77b4" />
4414
+ <use ns4:href="#md7efaf3aec" x="564.034245" y="313.112476" style="fill: #1f77b4; stroke: #1f77b4" />
4415
  <use ns4:href="#md7efaf3aec" x="804.180406" y="46.442361" style="fill: #1f77b4; stroke: #1f77b4" />
4416
  </g>
4417
  </g>
4418
  <g id="series--hf-kernels-layer-norm" class="series">
4419
+ <path d="M 83.741924 434.265436 L 323.888085 306.894918 L 564.034245 307.077328 L 804.180406 56.113857 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4420
  <defs>
4421
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4422
  </defs>
4423
  <g clip-path="url(#p2214f54723)">
4424
+ <use ns4:href="#m9b8c54d372" x="83.741924" y="434.265436" style="fill: #ff7f0e; stroke: #ff7f0e" />
4425
+ <use ns4:href="#m9b8c54d372" x="323.888085" y="306.894918" style="fill: #ff7f0e; stroke: #ff7f0e" />
4426
+ <use ns4:href="#m9b8c54d372" x="564.034245" y="307.077328" style="fill: #ff7f0e; stroke: #ff7f0e" />
4427
+ <use ns4:href="#m9b8c54d372" x="804.180406" y="56.113857" style="fill: #ff7f0e; stroke: #ff7f0e" />
4428
  </g>
4429
  </g>
4430
  <g id="patch_3">
openai_moe/impls/artifacts/benchmark/openai_moe.jsonl CHANGED
@@ -1,8 +1,8 @@
1
- {"ts": "2025-12-19T18:57:39Z", "run": "aec86efe9ed1483a979a8427ac940bd6", "impl": "binned_torch", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_E2", "batch": 1, "seq_len": 512, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 2, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 155.7981640000321, "p50": 157.7297640000097, "p90": 159.48504900001126, "mean": 158.39911260001145, "iqr": 2.223896000032255, "raw_times": [161.72143300002517, 157.261152999979, 155.7981640000321, 159.48504900001126, 157.7297640000097], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 159.10347999999885, "peak_bytes": 416866816, "ok": true, "absmax": 2.765655517578125e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 2.765655517578125e-05, "mae": 2.0696452338597737e-06, "mse": 7.332408985538663e-12, "ref": "naive_moe"}, "err": null}
2
- {"ts": "2025-12-19T18:58:03Z", "run": "aec86efe9ed1483a979a8427ac940bd6", "impl": "binned_torch", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_E4", "batch": 1, "seq_len": 512, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 4, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 199.79041199997027, "p50": 204.82147100000248, "p90": 205.0451750000093, "mean": 203.32668460000605, "iqr": 3.4747309999829668, "raw_times": [205.40592100002186, 199.79041199997027, 201.57044400002633, 205.0451750000093, 204.82147100000248], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 200.72428899999295, "peak_bytes": 632035840, "ok": true, "absmax": 1.621246337890625e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 1.621246337890625e-05, "mae": 9.61917862696282e-07, "mse": 1.59423277530657e-12, "ref": "naive_moe"}, "err": null}
3
- {"ts": "2025-12-19T18:58:47Z", "run": "aec86efe9ed1483a979a8427ac940bd6", "impl": "binned_torch", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S1024_E2", "batch": 1, "seq_len": 1024, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 2, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 372.8170420000083, "p50": 383.31174900002907, "p90": 392.9121939999618, "mean": 385.07766660000016, "iqr": 10.251173999961338, "raw_times": [393.68632800000114, 392.9121939999618, 382.66102000000046, 383.31174900002907, 372.8170420000083], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 393.062126000018, "peak_bytes": 643844608, "ok": true, "absmax": 2.6226043701171875e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 2.6226043701171875e-05, "mae": 2.0501920516835526e-06, "mse": 7.1848811622476916e-12, "ref": "naive_moe"}, "err": null}
4
- {"ts": "2025-12-19T18:59:36Z", "run": "aec86efe9ed1483a979a8427ac940bd6", "impl": "binned_torch", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S1024_E4", "batch": 1, "seq_len": 1024, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 4, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 418.8624209999716, "p50": 421.41534400002456, "p90": 422.4395519999007, "mean": 421.30189059998884, "iqr": 1.8283119999296105, "raw_times": [423.18089600007625, 421.41534400002456, 418.8624209999716, 420.6112399999711, 422.4395519999007], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 421.8970150000132, "peak_bytes": 823386112, "ok": true, "absmax": 1.3589859008789062e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 1.3589859008789062e-05, "mae": 9.400179123986163e-07, "mse": 1.5130355735665235e-12, "ref": "naive_moe"}, "err": null}
5
- {"ts": "2025-12-19T19:01:05Z", "run": "aec86efe9ed1483a979a8427ac940bd6", "impl": "binned_torch", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_S512_E2", "batch": 4, "seq_len": 512, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 2, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 766.098573000022, "p50": 773.6994500000947, "p90": 774.865274999911, "mean": 772.9573942000115, "iqr": 8.746404999897095, "raw_times": [766.1188700000139, 773.6994500000947, 766.098573000022, 774.865274999911, 784.0048030000162], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 782.9079639999463, "peak_bytes": 1036112384, "ok": true, "absmax": 3.2901763916015625e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 3.2901763916015625e-05, "mae": 2.0572656467265915e-06, "mse": 7.247809123700488e-12, "ref": "naive_moe"}, "err": null}
6
- {"ts": "2025-12-19T19:02:49Z", "run": "aec86efe9ed1483a979a8427ac940bd6", "impl": "binned_torch", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_S512_E4", "batch": 4, "seq_len": 512, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 4, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 828.9166780000414, "p50": 840.0145479999992, "p90": 848.174653000001, "mean": 841.7884347999916, "iqr": 11.353517000088686, "raw_times": [855.0151590000041, 828.9166780000414, 848.174653000001, 836.8211359999123, 840.0145479999992], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 857.4785790000305, "peak_bytes": 1235263488, "ok": true, "absmax": 1.430511474609375e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 1.430511474609375e-05, "mae": 9.400343401466671e-07, "mse": 1.5107844445957919e-12, "ref": "naive_moe"}, "err": null}
7
- {"ts": "2025-12-19T19:05:50Z", "run": "aec86efe9ed1483a979a8427ac940bd6", "impl": "binned_torch", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_S1024_E2", "batch": 4, "seq_len": 1024, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 2, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1492.7651169999763, "p50": 1513.7102520000099, "p90": 1522.1755649999977, "mean": 1513.4781133999923, "iqr": 10.99431700004061, "raw_times": [1492.7651169999763, 1511.1812479999571, 1522.1755649999977, 1527.5583850000203, 1513.7102520000099], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1532.0516410000664, "peak_bytes": 1861947904, "ok": true, "absmax": 2.6226043701171875e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 2.6226043701171875e-05, "mae": 2.060702854578267e-06, "mse": 7.262949790198814e-12, "ref": "naive_moe"}, "err": null}
8
- {"ts": "2025-12-19T19:09:07Z", "run": "aec86efe9ed1483a979a8427ac940bd6", "impl": "binned_torch", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_S1024_E4", "batch": 4, "seq_len": 1024, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 4, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1654.5569229999728, "p50": 1658.7427389999903, "p90": 1665.0588319999997, "mean": 1660.4780848000016, "iqr": 7.11779099992782, "raw_times": [1658.7427389999903, 1665.0588319999997, 1666.0908889999746, 1657.941041000072, 1654.5569229999728], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1670.381679000002, "peak_bytes": 2062163968, "ok": true, "absmax": 1.5974044799804688e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 1.5974044799804688e-05, "mae": 9.529014732834185e-07, "mse": 1.5621694476192216e-12, "ref": "naive_moe"}, "err": null}
 
1
+ {"ts": "2025-12-19T19:54:31Z", "run": "ca1c99ac13bb4217b0fb0c501a249580", "impl": "gpt_oss_experts", "tags": {"family": "reference", "backend": "pytorch"}, "wl": {"name": "cuda_B1_S512_E2", "batch": 1, "seq_len": 512, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 2, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 2.591275999975551, "p50": 2.6265569999850413, "p90": 2.6390279999759514, "mean": 2.626043199961714, "iqr": 0.02082100013467425, "raw_times": [2.591275999975551, 2.6390279999759514, 2.6265569999850413, 2.618206999841277, 2.6551480000307492], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 2.6624880001691054, "peak_bytes": 311252992, "ok": true, "absmax": 1.0818243026733398e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 1.0818243026733398e-05, "mae": 1.0733322142186807e-06, "mse": 1.9560496885423495e-12, "ref": "naive_moe"}, "err": null}
2
+ {"ts": "2025-12-19T19:54:31Z", "run": "ca1c99ac13bb4217b0fb0c501a249580", "impl": "gpt_oss_experts", "tags": {"family": "reference", "backend": "pytorch"}, "wl": {"name": "cuda_B1_S512_E4", "batch": 1, "seq_len": 512, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 4, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 3.913345000000845, "p50": 3.932325000050696, "p90": 3.941766000025382, "mean": 3.9370316000258754, "iqr": 0.02511100001356681, "raw_times": [3.941766000025382, 3.913345000000845, 3.916655000011815, 3.981067000040639, 3.932325000050696], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.937866000114809, "peak_bytes": 632822272, "ok": true, "absmax": 7.82310962677002e-06, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 7.82310962677002e-06, "mae": 5.576844728238939e-07, "mse": 5.436189692842319e-13, "ref": "naive_moe"}, "err": null}
3
+ {"ts": "2025-12-19T19:54:32Z", "run": "ca1c99ac13bb4217b0fb0c501a249580", "impl": "gpt_oss_experts", "tags": {"family": "reference", "backend": "pytorch"}, "wl": {"name": "cuda_B1_S1024_E2", "batch": 1, "seq_len": 1024, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 2, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 3.80903300015234, "p50": 3.849652999861064, "p90": 3.853734000131226, "mean": 3.837069200062615, "iqr": 0.039670999967711396, "raw_times": [3.8140630001635145, 3.8588630000049307, 3.80903300015234, 3.853734000131226, 3.849652999861064], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.788761999885537, "peak_bytes": 645417472, "ok": true, "absmax": 1.5497207641601562e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 1.5497207641601562e-05, "mae": 1.1454358173068613e-06, "mse": 2.2412421311207575e-12, "ref": "naive_moe"}, "err": null}
4
+ {"ts": "2025-12-19T19:54:34Z", "run": "ca1c99ac13bb4217b0fb0c501a249580", "impl": "gpt_oss_experts", "tags": {"family": "reference", "backend": "pytorch"}, "wl": {"name": "cuda_B1_S1024_E4", "batch": 1, "seq_len": 1024, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 4, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 5.2778859999307315, "p50": 5.308016000071802, "p90": 5.336937000038233, "mean": 5.31205640004373, "iqr": 0.038680999978168984, "raw_times": [5.339187000117818, 5.336937000038233, 5.298256000060064, 5.308016000071802, 5.2778859999307315], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 5.26179400003457, "peak_bytes": 657099264, "ok": true, "absmax": 6.556510925292969e-06, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 6.556510925292969e-06, "mae": 4.852234951613354e-07, "mse": 4.015021550906467e-13, "ref": "naive_moe"}, "err": null}
5
+ {"ts": "2025-12-19T19:54:36Z", "run": "ca1c99ac13bb4217b0fb0c501a249580", "impl": "gpt_oss_experts", "tags": {"family": "reference", "backend": "pytorch"}, "wl": {"name": "cuda_B4_S512_E2", "batch": 4, "seq_len": 512, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 2, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 6.679864000034286, "p50": 6.717303999948854, "p90": 6.729205000056027, "mean": 6.711754200023279, "iqr": 0.028612000050998176, "raw_times": [6.679864000034286, 6.717303999948854, 6.700593000005028, 6.7318050000722, 6.729205000056027], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 6.593322000071566, "peak_bytes": 678357504, "ok": true, "absmax": 1.3589859008789062e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 1.3589859008789062e-05, "mae": 1.1745952406272409e-06, "mse": 2.316181968442521e-12, "ref": "naive_moe"}, "err": null}
6
+ {"ts": "2025-12-19T19:54:38Z", "run": "ca1c99ac13bb4217b0fb0c501a249580", "impl": "gpt_oss_experts", "tags": {"family": "reference", "backend": "pytorch"}, "wl": {"name": "cuda_B4_S512_E4", "batch": 4, "seq_len": 512, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 4, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 7.423924000022453, "p50": 7.518165999954363, "p90": 7.529216999955679, "mean": 7.5042842000129895, "iqr": 0.02257999994981219, "raw_times": [7.543477000126586, 7.529216999955679, 7.518165999954363, 7.506637000005867, 7.423924000022453], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 7.323180999946999, "peak_bytes": 701983232, "ok": true, "absmax": 8.58306884765625e-06, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 8.58306884765625e-06, "mae": 5.268635732136318e-07, "mse": 4.753664909623589e-13, "ref": "naive_moe"}, "err": null}
7
+ {"ts": "2025-12-19T19:54:42Z", "run": "ca1c99ac13bb4217b0fb0c501a249580", "impl": "gpt_oss_experts", "tags": {"family": "reference", "backend": "pytorch"}, "wl": {"name": "cuda_B4_S1024_E2", "batch": 4, "seq_len": 1024, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 2, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 13.163481999981741, "p50": 13.23755299995355, "p90": 13.251324000066234, "mean": 13.23588719997133, "iqr": 0.04864200013798836, "raw_times": [13.163481999981741, 13.202681999928245, 13.23755299995355, 13.32439499992688, 13.251324000066234], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 13.090128999920125, "peak_bytes": 1012207616, "ok": true, "absmax": 1.71661376953125e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 1.71661376953125e-05, "mae": 1.797086838450923e-06, "mse": 5.3811247992252564e-12, "ref": "naive_moe"}, "err": null}
8
+ {"ts": "2025-12-19T19:54:46Z", "run": "ca1c99ac13bb4217b0fb0c501a249580", "impl": "gpt_oss_experts", "tags": {"family": "reference", "backend": "pytorch"}, "wl": {"name": "cuda_B4_S1024_E4", "batch": 4, "seq_len": 1024, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 4, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 13.26829100003124, "p50": 13.362623000148233, "p90": 13.40691399991556, "mean": 13.346813000043767, "iqr": 0.1288519999889104, "raw_times": [13.40691399991556, 13.418175000197152, 13.26829100003124, 13.27806199992665, 13.362623000148233], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 12.873562000095262, "peak_bytes": 910968320, "ok": true, "absmax": 8.344650268554688e-06, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 8.344650268554688e-06, "mae": 5.471991357808292e-07, "mse": 5.06310813587485e-13, "ref": "naive_moe"}, "err": null}
openai_moe/impls/binned_torch.html CHANGED
@@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content {
3888
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3890
  </span> |
3891
- Cell: nv | 0.25s
3892
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3894
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3904,7 +3904,7 @@ Cell: nv | 0.25s
3904
  </div>
3905
  </div>
3906
  <div id="output-nv" class="cell-output">
3907
- <div class="cell-stdout"><pre class="stdout-text">Fri Dec 19 18:56:28 2025
3908
  +-----------------------------------------------------------------------------------------+
3909
  | NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 |
3910
  +-----------------------------------------+------------------------+----------------------+
@@ -3913,7 +3913,7 @@ Cell: nv | 0.25s
3913
  | | | MIG M. |
3914
  |=========================================+========================+======================|
3915
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3916
- | N/A 34C P0 80W / 350W | 0MiB / 46068MiB | 41% Default |
3917
  | | | N/A |
3918
  +-----------------------------------------+------------------------+----------------------+
3919
 
@@ -3935,9 +3935,9 @@ Cell: nv | 0.25s
3935
  <span class="collapse-indicators">
3936
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3937
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3938
- <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3939
  </span> |
3940
- Cell: benchmark | 730.34s
3941
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3942
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3943
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -4095,29 +4095,29 @@ PROFILE TRACE: binned_torch | cuda_B1_S512_E2
4095
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4096
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4097
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4098
- binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 919.007ms 1814.55% 919.007ms 919.007ms 1
4099
- binned_torch 24.74% 227.809ms 100.00% 920.989ms 920.989ms 0.000us 0.00% 50.650ms 50.650ms 1
4100
- aten::item 1.86% 17.169ms 26.20% 241.261ms 15.722us 0.000us 0.00% 15.873ms 1.034us 15345
4101
- aten::_local_scalar_dense 5.94% 54.669ms 24.33% 224.092ms 14.604us 15.872ms 31.34% 15.873ms 1.034us 15345
4102
- Memcpy DtoH (Device -&gt; Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 15.872ms 31.34% 15.872ms 1.034us 15345
4103
- aten::floor_divide 5.47% 50.387ms 13.12% 120.822ms 19.665us 7.812ms 15.43% 7.812ms 1.272us 6144
4104
- aten::bmm 0.02% 191.383us 0.03% 231.124us 38.521us 7.592ms 14.99% 7.592ms 1.265ms 6
4105
- ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 7.592ms 14.99% 7.592ms 1.265ms 6
4106
- aten::copy_ 3.61% 33.260ms 9.01% 82.984ms 13.480us 6.583ms 13.00% 6.585ms 1.070us 6156
4107
- Memcpy DtoD (Device -&gt; Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.579ms 12.99% 6.579ms 1.069us 6153
4108
- aten::mul 3.25% 29.933ms 5.69% 52.377ms 17.000us 4.706ms 9.29% 4.706ms 1.527us 3081
4109
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 4.478ms 8.84% 4.478ms 1.458us 3072
4110
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 4.159ms 8.21% 4.159ms 1.354us 3072
4111
- aten::remainder 3.14% 28.956ms 4.78% 44.045ms 14.337us 3.839ms 7.58% 3.839ms 1.250us 3072
4112
- aten::add 2.87% 26.444ms 4.82% 44.437ms 14.651us 3.761ms 7.43% 3.761ms 1.240us 3033
4113
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 3.655ms 7.22% 3.655ms 1.190us 3072
4114
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 3.365ms 6.64% 3.365ms 1.110us 3030
4115
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.023ms 3.99% 2.023ms 1.317us 1536
4116
  void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 1.816ms 3.58% 1.816ms 1.182us 1536
4117
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 287.650us 0.57% 287.650us 47.942us 6
4118
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4119
- Self CPU time total: 920.998ms
4120
- Self CUDA time total: 50.647ms
4121
 
4122
 
4123
 
@@ -4127,29 +4127,29 @@ PROFILE TRACE: binned_torch | cuda_B1_S512_E4
4127
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4128
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4129
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4130
- binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 934.694ms 1714.22% 934.694ms 934.694ms 1
4131
- binned_torch 24.25% 226.767ms 100.00% 935.247ms 935.247ms 0.000us 0.00% 54.534ms 54.534ms 1
4132
- aten::item 1.76% 16.424ms 27.79% 259.914ms 15.348us 0.000us 0.00% 17.987ms 1.062us 16935
4133
- aten::_local_scalar_dense 6.05% 56.595ms 26.03% 243.490ms 14.378us 17.985ms 32.98% 17.987ms 1.062us 16935
4134
- Memcpy DtoH (Device -&gt; Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 17.985ms 32.98% 17.985ms 1.062us 16935
4135
- aten::floor_divide 5.13% 47.972ms 12.39% 115.852ms 18.856us 7.812ms 14.33% 7.813ms 1.272us 6144
4136
- aten::bmm 0.02% 166.771us 0.02% 207.402us 34.567us 7.794ms 14.29% 7.794ms 1.299ms 6
4137
- ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 7.794ms 14.29% 7.794ms 1.299ms 6
4138
- aten::copy_ 3.47% 32.488ms 8.51% 79.554ms 12.923us 6.633ms 12.17% 6.635ms 1.078us 6156
4139
- Memcpy DtoD (Device -&gt; Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.630ms 12.16% 6.630ms 1.078us 6153
4140
- aten::add 4.14% 38.686ms 7.06% 65.992ms 14.368us 5.259ms 9.64% 5.259ms 1.145us 4593
4141
- aten::mul 3.02% 28.215ms 5.35% 50.047ms 16.244us 4.701ms 8.62% 4.701ms 1.526us 3081
4142
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 4.474ms 8.21% 4.474ms 1.457us 3072
4143
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 4.157ms 7.62% 4.157ms 1.353us 3072
4144
- aten::remainder 2.81% 26.265ms 4.43% 41.468ms 13.499us 3.852ms 7.06% 3.852ms 1.254us 3072
4145
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 3.655ms 6.70% 3.655ms 1.190us 3072
4146
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 3.270ms 6.00% 3.270ms 1.079us 3030
4147
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.030ms 3.72% 2.030ms 1.322us 1536
4148
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 1.822ms 3.34% 1.822ms 1.186us 1536
4149
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 1.584ms 2.91% 1.584ms 1.015us 1560
4150
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4151
- Self CPU time total: 935.255ms
4152
- Self CUDA time total: 54.526ms
4153
 
4154
 
4155
 
@@ -4159,29 +4159,29 @@ PROFILE TRACE: binned_torch | cuda_B1_S1024_E2
4159
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4160
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4161
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4162
- binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 1.775s 1705.66% 1.775s 1.775s 1
4163
- binned_torch 24.39% 432.670ms 100.00% 1.774s 1.774s 0.000us 0.00% 104.087ms 104.087ms 1
4164
- aten::item 1.67% 29.627ms 26.26% 465.825ms 15.266us 0.000us 0.00% 31.856ms 1.044us 30513
4165
- aten::_local_scalar_dense 5.88% 104.231ms 24.59% 436.198ms 14.295us 31.854ms 30.61% 31.856ms 1.044us 30513
4166
- Memcpy DtoH (Device -&gt; Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 31.854ms 30.61% 31.854ms 1.044us 30513
4167
- aten::floor_divide 5.49% 97.404ms 13.46% 238.769ms 19.431us 15.611ms 15.00% 15.612ms 1.270us 12288
4168
- aten::bmm 0.01% 215.332us 0.01% 258.864us 43.144us 15.009ms 14.42% 15.009ms 2.502ms 6
4169
- ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 15.009ms 14.42% 15.009ms 2.502ms 6
4170
- aten::copy_ 3.73% 66.187ms 9.04% 160.371ms 13.038us 13.330ms 12.81% 13.331ms 1.084us 12300
4171
- Memcpy DtoD (Device -&gt; Device) 0.00% 0.000us 0.00% 0.000us 0.000us 13.326ms 12.80% 13.326ms 1.084us 12294
4172
- aten::mul 3.16% 56.128ms 5.72% 101.496ms 16.495us 11.275ms 10.83% 11.277ms 1.833us 6153
4173
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 9.921ms 9.53% 9.921ms 1.615us 6144
4174
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 8.311ms 7.99% 8.311ms 1.353us 6144
4175
- aten::remainder 3.23% 57.334ms 5.09% 90.371ms 14.709us 7.676ms 7.38% 7.678ms 1.250us 6144
4176
- aten::add 2.88% 51.067ms 5.02% 88.987ms 15.049us 7.641ms 7.34% 7.642ms 1.292us 5913
4177
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 7.300ms 7.01% 7.300ms 1.188us 6144
4178
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 6.359ms 6.11% 6.359ms 1.076us 5910
4179
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 4.045ms 3.89% 4.045ms 1.317us 3072
4180
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 3.632ms 3.49% 3.632ms 1.182us 3072
4181
- aten::clamp 0.00% 74.963us 0.01% 122.824us 20.471us 1.191ms 1.14% 1.191ms 198.444us 6
4182
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4183
- Self CPU time total: 1.774s
4184
- Self CUDA time total: 104.078ms
4185
 
4186
 
4187
 
@@ -4191,29 +4191,29 @@ PROFILE TRACE: binned_torch | cuda_B1_S1024_E4
4191
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4192
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4193
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4194
- binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 1.943s 1756.79% 1.943s 1.943s 1
4195
- binned_torch 24.29% 471.728ms 100.00% 1.942s 1.942s 0.000us 0.00% 110.592ms 110.592ms 1
4196
- aten::item 1.62% 31.476ms 26.94% 523.166ms 15.511us 0.000us 0.00% 35.330ms 1.047us 33729
4197
- aten::_local_scalar_dense 6.11% 118.659ms 25.32% 491.691ms 14.578us 35.327ms 31.95% 35.330ms 1.047us 33729
4198
- Memcpy DtoH (Device -&gt; Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 35.327ms 31.95% 35.327ms 1.047us 33728
4199
- aten::floor_divide 5.19% 100.816ms 12.43% 241.273ms 19.635us 15.609ms 14.12% 15.611ms 1.270us 12288
4200
- aten::bmm 0.01% 222.165us 0.01% 267.105us 44.517us 15.085ms 13.64% 15.085ms 2.514ms 6
4201
- ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 15.085ms 13.64% 15.085ms 2.514ms 6
4202
- aten::copy_ 3.60% 69.833ms 8.76% 170.090ms 13.828us 13.355ms 12.08% 13.357ms 1.086us 12300
4203
- Memcpy DtoD (Device -&gt; Device) 0.00% 0.000us 0.00% 0.000us 0.000us 13.353ms 12.07% 13.353ms 1.086us 12294
4204
- aten::mul 2.94% 57.042ms 5.32% 103.331ms 16.794us 10.942ms 9.89% 10.942ms 1.778us 6153
4205
- aten::add 3.88% 75.326ms 6.94% 134.721ms 14.806us 10.866ms 9.83% 10.866ms 1.194us 9099
4206
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 9.591ms 8.67% 9.591ms 1.561us 6144
4207
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 8.314ms 7.52% 8.314ms 1.353us 6144
4208
- aten::remainder 2.77% 53.827ms 4.45% 86.321ms 14.050us 7.697ms 6.96% 7.697ms 1.253us 6144
4209
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 7.295ms 6.60% 7.295ms 1.187us 6144
4210
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 6.370ms 5.76% 6.370ms 1.078us 5910
4211
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 4.058ms 3.67% 4.058ms 1.321us 3072
4212
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 3.639ms 3.29% 3.639ms 1.185us 3072
4213
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 3.234ms 2.92% 3.234ms 1.015us 3186
4214
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4215
- Self CPU time total: 1.942s
4216
- Self CUDA time total: 110.585ms
4217
 
4218
 
4219
 
@@ -4223,29 +4223,29 @@ PROFILE TRACE: binned_torch | cuda_B4_S512_E2
4223
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4224
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4225
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4226
- binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 3.554s 1668.92% 3.554s 3.554s 1
4227
- binned_torch 24.03% 852.954ms 100.00% 3.549s 3.549s 0.000us 0.00% 212.979ms 212.979ms 1
4228
- Memcpy DtoH (Device -&gt; Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 63.933ms 30.02% 63.933ms 1.038us 61586
4229
- aten::item 1.68% 59.518ms 26.66% 946.248ms 15.364us 0.000us 0.00% 63.933ms 1.038us 61587
4230
- aten::_local_scalar_dense 6.15% 218.157ms 24.98% 886.634ms 14.396us 63.932ms 30.02% 63.933ms 1.038us 61587
4231
- aten::floor_divide 5.36% 190.145ms 13.28% 471.339ms 19.179us 31.621ms 14.85% 31.623ms 1.287us 24576
4232
- aten::bmm 0.01% 230.233us 0.01% 275.904us 45.984us 28.855ms 13.55% 28.855ms 4.809ms 6
4233
- ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 28.855ms 13.55% 28.855ms 4.809ms 6
4234
- aten::copy_ 3.84% 136.428ms 9.38% 333.073ms 13.546us 26.747ms 12.56% 26.749ms 1.088us 24588
4235
- Memcpy DtoD (Device -&gt; Device) 0.00% 0.000us 0.00% 0.000us 0.000us 26.744ms 12.56% 26.744ms 1.088us 24582
4236
- aten::mul 3.20% 113.415ms 5.79% 205.629ms 16.722us 25.614ms 12.03% 25.614ms 2.083us 12297
4237
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.161ms 10.41% 22.161ms 1.803us 12288
4238
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 17.018ms 7.99% 17.018ms 1.385us 12288
4239
- aten::add 2.93% 103.833ms 5.19% 184.217ms 14.843us 16.665ms 7.83% 16.666ms 1.343us 12411
4240
- aten::remainder 3.13% 110.979ms 5.01% 177.878ms 14.476us 15.442ms 7.25% 15.444ms 1.257us 12288
4241
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 14.607ms 6.86% 14.607ms 1.189us 12288
4242
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.543ms 6.36% 13.543ms 1.091us 12408
4243
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 8.136ms 3.82% 8.136ms 1.324us 6144
4244
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 7.305ms 3.43% 7.305ms 1.189us 6144
4245
- aten::clamp 0.00% 80.604us 0.00% 131.123us 21.854us 2.608ms 1.22% 2.608ms 434.678us 6
4246
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4247
- Self CPU time total: 3.549s
4248
- Self CUDA time total: 212.971ms
4249
 
4250
 
4251
 
@@ -4255,29 +4255,29 @@ PROFILE TRACE: binned_torch | cuda_B4_S512_E4
4255
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4256
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4257
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4258
- binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 3.834s 1701.16% 3.834s 3.834s 1
4259
- binned_torch 23.91% 917.039ms 100.00% 3.836s 3.836s 0.000us 0.00% 225.394ms 225.394ms 1
4260
- aten::item 1.70% 65.086ms 27.21% 1.044s 15.386us 0.000us 0.00% 70.210ms 1.035us 67845
4261
- aten::_local_scalar_dense 6.32% 242.356ms 25.52% 978.758ms 14.426us 70.207ms 31.15% 70.210ms 1.035us 67845
4262
- Memcpy DtoH (Device -&gt; Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 70.207ms 31.15% 70.207ms 1.035us 67840
4263
- aten::floor_divide 5.09% 195.347ms 12.48% 478.676ms 19.477us 31.474ms 13.97% 31.481ms 1.281us 24576
4264
- ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 28.832ms 12.79% 28.832ms 4.805ms 6
4265
- aten::bmm 0.01% 227.473us 0.01% 274.364us 45.727us 28.832ms 12.79% 28.832ms 4.805ms 6
4266
- aten::copy_ 3.61% 138.479ms 8.82% 338.314ms 13.759us 26.687ms 11.84% 26.689ms 1.085us 24588
4267
- Memcpy DtoD (Device -&gt; Device) 0.00% 0.000us 0.00% 0.000us 0.000us 26.685ms 11.84% 26.685ms 1.086us 24581
4268
- aten::mul 2.97% 113.735ms 5.38% 206.436ms 16.787us 25.537ms 11.33% 25.539ms 2.077us 12297
4269
- aten::add 4.18% 160.247ms 7.41% 284.235ms 15.249us 23.217ms 10.30% 23.217ms 1.246us 18639
4270
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.084ms 9.80% 22.084ms 1.797us 12288
4271
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.963ms 7.53% 16.963ms 1.381us 12287
4272
- aten::remainder 2.89% 110.779ms 4.66% 178.579ms 14.533us 15.327ms 6.80% 15.329ms 1.247us 12288
4273
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 14.512ms 6.44% 14.512ms 1.181us 12287
4274
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.655ms 6.06% 13.655ms 1.101us 12407
4275
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 8.083ms 3.59% 8.083ms 1.316us 6144
4276
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 7.244ms 3.21% 7.244ms 1.179us 6144
4277
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.461ms 2.87% 6.461ms 1.037us 6228
4278
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4279
- Self CPU time total: 3.836s
4280
- Self CUDA time total: 225.376ms
4281
 
4282
 
4283
 
@@ -4287,29 +4287,29 @@ PROFILE TRACE: binned_torch | cuda_B4_S1024_E2
4287
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4288
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4289
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4290
- binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 7.307s 1714.16% 7.307s 7.307s 1
4291
- binned_torch 24.10% 1.762s 100.00% 7.313s 7.313s 0.000us 0.00% 426.284ms 426.284ms 1
4292
- aten::item 1.74% 126.959ms 26.39% 1.930s 15.721us 0.000us 0.00% 128.245ms 1.045us 122763
4293
- aten::_local_scalar_dense 6.22% 454.984ms 24.65% 1.803s 14.685us 128.239ms 30.08% 128.245ms 1.045us 122763
4294
- Memcpy DtoH (Device -&gt; Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 128.241ms 30.08% 128.241ms 1.045us 122762
4295
- aten::floor_divide 5.53% 404.463ms 13.23% 967.808ms 19.690us 63.393ms 14.87% 63.393ms 1.290us 49152
4296
- aten::bmm 0.00% 234.623us 0.00% 278.223us 46.371us 56.525ms 13.26% 56.525ms 9.421ms 6
4297
- ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 56.525ms 13.26% 56.525ms 9.421ms 6
4298
- aten::copy_ 4.05% 295.852ms 9.44% 690.402ms 14.045us 53.639ms 12.58% 53.640ms 1.091us 49158
4299
- Memcpy DtoD (Device -&gt; Device) 0.00% 0.000us 0.00% 0.000us 0.000us 53.636ms 12.58% 53.636ms 1.091us 49154
4300
- aten::mul 3.24% 237.068ms 5.73% 419.319ms 17.056us 51.499ms 12.08% 51.504ms 2.095us 24585
4301
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 44.577ms 10.46% 44.577ms 1.814us 24576
4302
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 34.181ms 8.02% 34.181ms 1.391us 24576
4303
- aten::add 2.92% 213.232ms 5.07% 370.760ms 15.173us 33.603ms 7.88% 33.606ms 1.375us 24435
4304
- aten::remainder 3.14% 229.281ms 5.03% 367.714ms 14.962us 30.916ms 7.25% 30.921ms 1.258us 24576
4305
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 29.214ms 6.85% 29.214ms 1.189us 24576
4306
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 26.954ms 6.32% 26.954ms 1.103us 24431
4307
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.285ms 3.82% 16.285ms 1.325us 12288
4308
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 14.630ms 3.43% 14.630ms 1.191us 12288
4309
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 5.208ms 1.22% 5.208ms 868.029us 6
4310
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4311
- Self CPU time total: 7.313s
4312
- Self CUDA time total: 426.263ms
4313
 
4314
 
4315
 
@@ -4319,41 +4319,47 @@ PROFILE TRACE: binned_torch | cuda_B4_S1024_E4
4319
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4320
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4321
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4322
- binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 7.520s 1665.26% 7.520s 7.520s 1
4323
- binned_torch 23.83% 1.792s 100.00% 7.522s 7.522s 0.000us 0.00% 451.603ms 451.603ms 1
4324
- aten::item 1.82% 136.877ms 27.31% 2.054s 15.246us 0.000us 0.00% 140.837ms 1.045us 134715
4325
- aten::_local_scalar_dense 6.26% 471.062ms 25.49% 1.917s 14.230us 140.825ms 31.19% 140.837ms 1.045us 134715
4326
- Memcpy DtoH (Device -&gt; Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 140.826ms 31.19% 140.826ms 1.045us 134706
4327
- aten::floor_divide 5.15% 387.087ms 12.45% 936.766ms 19.059us 63.494ms 14.06% 63.499ms 1.292us 49152
4328
- aten::bmm 0.00% 222.563us 0.00% 265.513us 44.252us 56.696ms 12.56% 56.696ms 9.449ms 6
4329
- ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 56.696ms 12.56% 56.696ms 9.449ms 6
4330
- aten::copy_ 3.71% 279.306ms 8.85% 665.315ms 13.534us 53.897ms 11.94% 53.900ms 1.096us 49158
4331
- Memcpy DtoD (Device -&gt; Device) 0.00% 0.000us 0.00% 0.000us 0.000us 53.894ms 11.94% 53.894ms 1.097us 49149
4332
- aten::mul 3.04% 228.311ms 5.39% 405.691ms 16.502us 51.688ms 11.45% 51.695ms 2.103us 24585
4333
- aten::add 4.00% 300.523ms 6.98% 525.049ms 14.441us 45.565ms 10.09% 45.568ms 1.253us 36357
4334
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 44.621ms 9.88% 44.621ms 1.816us 24576
4335
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 34.193ms 7.57% 34.193ms 1.391us 24573
4336
- aten::remainder 2.86% 215.282ms 4.58% 344.226ms 14.007us 30.855ms 6.83% 30.857ms 1.256us 24576
4337
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 29.302ms 6.49% 29.302ms 1.192us 24573
4338
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 26.656ms 5.90% 26.656ms 1.091us 24431
4339
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.266ms 3.60% 16.266ms 1.324us 12288
4340
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 14.588ms 3.23% 14.588ms 1.187us 12288
4341
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 12.278ms 2.72% 12.278ms 1.030us 11922
4342
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4343
- Self CPU time total: 7.522s
4344
- Self CUDA time total: 451.562ms
4345
 
4346
 
4347
  impl wl p50(ms) ok
4348
- binned_torch cuda_B1_S1024_E2 383.31 True
4349
- binned_torch cuda_B1_S1024_E4 421.42 True
4350
- binned_torch cuda_B1_S512_E2 157.73 True
4351
- binned_torch cuda_B1_S512_E4 204.82 True
4352
- binned_torch cuda_B4_S1024_E2 1513.71 True
4353
- binned_torch cuda_B4_S1024_E4 1658.74 True
4354
- binned_torch cuda_B4_S512_E2 773.70 True
4355
- binned_torch cuda_B4_S512_E4 840.01 True
4356
  </pre></div>
 
 
 
 
 
 
4357
  <div class="cell-artifacts">
4358
  <h4>Artifacts:</h4>
4359
  <a href="artifacts/benchmark/openai_moe.jsonl" class="artifact" target="_blank">openai_moe.jsonl</a>
 
3888
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3890
  </span> |
3891
+ Cell: nv | 0.28s
3892
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3894
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
 
3904
  </div>
3905
  </div>
3906
  <div id="output-nv" class="cell-output">
3907
+ <div class="cell-stdout"><pre class="stdout-text">Fri Dec 19 19:41:48 2025
3908
  +-----------------------------------------------------------------------------------------+
3909
  | NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 |
3910
  +-----------------------------------------+------------------------+----------------------+
 
3913
  | | | MIG M. |
3914
  |=========================================+========================+======================|
3915
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3916
+ | N/A 33C P0 126W / 350W | 0MiB / 46068MiB | 100% Default |
3917
  | | | N/A |
3918
  +-----------------------------------------+------------------------+----------------------+
3919
 
 
3935
  <span class="collapse-indicators">
3936
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3937
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3938
+ <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3939
  </span> |
3940
+ Cell: benchmark | 733.46s
3941
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3942
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3943
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
4095
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4096
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4097
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4098
+ binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 935.516ms 1843.92% 935.516ms 935.516ms 1
4099
+ binned_torch 24.73% 231.815ms 100.00% 937.553ms 937.553ms 0.000us 0.00% 50.740ms 50.740ms 1
4100
+ aten::item 1.92% 17.997ms 26.19% 245.573ms 16.003us 0.000us 0.00% 15.756ms 1.027us 15345
4101
+ aten::_local_scalar_dense 6.46% 60.533ms 24.27% 227.576ms 14.831us 15.755ms 31.05% 15.756ms 1.027us 15345
4102
+ Memcpy DtoH (Device -&gt; Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 15.755ms 31.05% 15.755ms 1.027us 15345
4103
+ aten::floor_divide 5.33% 49.954ms 13.00% 121.926ms 19.845us 7.813ms 15.40% 7.813ms 1.272us 6144
4104
+ aten::bmm 0.02% 192.684us 0.02% 232.345us 38.724us 7.792ms 15.36% 7.792ms 1.299ms 6
4105
+ ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 7.792ms 15.36% 7.792ms 1.299ms 6
4106
+ aten::copy_ 3.73% 34.970ms 9.17% 86.008ms 13.971us 6.589ms 12.99% 6.590ms 1.071us 6156
4107
+ Memcpy DtoD (Device -&gt; Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.585ms 12.98% 6.585ms 1.070us 6153
4108
+ aten::mul 3.28% 30.750ms 5.69% 53.382ms 17.326us 4.708ms 9.28% 4.708ms 1.528us 3081
4109
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 4.480ms 8.83% 4.480ms 1.458us 3072
4110
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 4.159ms 8.20% 4.159ms 1.354us 3072
4111
+ aten::remainder 3.15% 29.490ms 4.77% 44.737ms 14.563us 3.838ms 7.56% 3.838ms 1.249us 3072
4112
+ aten::add 2.76% 25.910ms 4.76% 44.643ms 14.719us 3.755ms 7.40% 3.755ms 1.238us 3033
4113
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 3.655ms 7.20% 3.655ms 1.190us 3072
4114
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 3.364ms 6.63% 3.364ms 1.110us 3030
4115
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.022ms 3.99% 2.022ms 1.316us 1536
4116
  void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 1.816ms 3.58% 1.816ms 1.182us 1536
4117
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 284.802us 0.56% 284.802us 47.467us 6
4118
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4119
+ Self CPU time total: 937.562ms
4120
+ Self CUDA time total: 50.735ms
4121
 
4122
 
4123
 
 
4127
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4128
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4129
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4130
+ binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 958.363ms 1758.28% 958.363ms 958.363ms 1
4131
+ binned_torch 24.25% 232.525ms 100.00% 958.754ms 958.754ms 0.000us 0.00% 54.510ms 54.510ms 1
4132
+ aten::item 1.77% 17.002ms 27.44% 263.071ms 15.534us 0.000us 0.00% 17.862ms 1.055us 16935
4133
+ aten::_local_scalar_dense 6.54% 62.707ms 25.67% 246.070ms 14.530us 17.860ms 32.77% 17.862ms 1.055us 16935
4134
+ Memcpy DtoH (Device -&gt; Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 17.860ms 32.77% 17.860ms 1.055us 16935
4135
+ aten::bmm 0.02% 170.065us 0.02% 212.615us 35.436us 7.895ms 14.48% 7.895ms 1.316ms 6
4136
+ ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 7.895ms 14.48% 7.895ms 1.316ms 6
4137
+ aten::floor_divide 4.96% 47.565ms 12.31% 117.977ms 19.202us 7.812ms 14.33% 7.813ms 1.272us 6144
4138
+ aten::copy_ 3.61% 34.645ms 8.68% 83.187ms 13.513us 6.631ms 12.17% 6.631ms 1.077us 6156
4139
+ Memcpy DtoD (Device -&gt; Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.628ms 12.16% 6.628ms 1.077us 6152
4140
+ aten::add 3.91% 37.531ms 7.22% 69.217ms 15.070us 5.262ms 9.65% 5.262ms 1.146us 4593
4141
+ aten::mul 3.03% 29.029ms 5.30% 50.820ms 16.495us 4.703ms 8.63% 4.703ms 1.526us 3081
4142
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 4.476ms 8.21% 4.476ms 1.457us 3072
4143
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 4.156ms 7.62% 4.156ms 1.353us 3072
4144
+ aten::remainder 2.84% 27.273ms 4.45% 42.673ms 13.891us 3.854ms 7.07% 3.854ms 1.255us 3072
4145
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 3.656ms 6.71% 3.656ms 1.190us 3072
4146
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 3.271ms 6.00% 3.271ms 1.080us 3030
4147
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.031ms 3.73% 2.031ms 1.323us 1536
4148
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 1.822ms 3.34% 1.822ms 1.187us 1536
4149
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 1.585ms 2.91% 1.585ms 1.016us 1560
4150
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4151
+ Self CPU time total: 958.762ms
4152
+ Self CUDA time total: 54.506ms
4153
 
4154
 
4155
 
 
4159
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4160
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4161
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4162
+ binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 1.754s 1688.21% 1.754s 1.754s 1
4163
+ binned_torch 24.13% 423.200ms 100.00% 1.754s 1.754s 0.000us 0.00% 103.889ms 103.889ms 1
4164
+ aten::item 1.68% 29.485ms 26.54% 465.492ms 15.256us 0.000us 0.00% 31.587ms 1.035us 30513
4165
+ aten::_local_scalar_dense 6.17% 108.158ms 24.86% 436.007ms 14.289us 31.585ms 30.40% 31.587ms 1.035us 30513
4166
+ Memcpy DtoH (Device -&gt; Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 31.585ms 30.40% 31.585ms 1.035us 30513
4167
+ aten::floor_divide 5.33% 93.524ms 13.33% 233.711ms 19.019us 15.605ms 15.02% 15.605ms 1.270us 12288
4168
+ aten::bmm 0.01% 221.157us 0.02% 267.387us 44.564us 15.098ms 14.53% 15.098ms 2.516ms 6
4169
+ ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 15.098ms 14.53% 15.098ms 2.516ms 6
4170
+ aten::copy_ 3.90% 68.459ms 9.45% 165.766ms 13.477us 13.325ms 12.83% 13.325ms 1.083us 12300
4171
+ Memcpy DtoD (Device -&gt; Device) 0.00% 0.000us 0.00% 0.000us 0.000us 13.322ms 12.82% 13.322ms 1.084us 12294
4172
+ aten::mul 3.29% 57.635ms 5.89% 103.357ms 16.798us 11.271ms 10.85% 11.273ms 1.832us 6153
4173
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 9.920ms 9.55% 9.920ms 1.615us 6144
4174
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 8.308ms 8.00% 8.308ms 1.352us 6144
4175
+ aten::remainder 3.09% 54.193ms 4.85% 85.026ms 13.839us 7.675ms 7.39% 7.675ms 1.249us 6144
4176
+ aten::add 2.79% 48.989ms 4.92% 86.297ms 14.595us 7.638ms 7.35% 7.639ms 1.292us 5913
4177
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 7.297ms 7.02% 7.297ms 1.188us 6144
4178
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 6.357ms 6.12% 6.357ms 1.076us 5910
4179
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 4.044ms 3.89% 4.044ms 1.317us 3072
4180
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 3.632ms 3.50% 3.632ms 1.182us 3072
4181
+ aten::clamp 0.00% 73.899us 0.01% 123.411us 20.569us 1.193ms 1.15% 1.193ms 198.833us 6
4182
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4183
+ Self CPU time total: 1.754s
4184
+ Self CUDA time total: 103.882ms
4185
 
4186
 
4187
 
 
4191
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4192
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4193
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4194
+ binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 1.874s 1695.99% 1.874s 1.874s 1
4195
+ binned_torch 24.25% 455.076ms 100.00% 1.876s 1.876s 0.000us 0.00% 110.516ms 110.516ms 1
4196
+ aten::item 1.77% 33.154ms 27.43% 514.675ms 15.259us 0.000us 0.00% 34.979ms 1.037us 33729
4197
+ aten::_local_scalar_dense 6.27% 117.583ms 25.66% 481.520ms 14.276us 34.976ms 31.65% 34.979ms 1.037us 33729
4198
+ Memcpy DtoH (Device -&gt; Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 34.976ms 31.65% 34.976ms 1.037us 33728
4199
+ aten::floor_divide 4.89% 91.819ms 12.09% 226.952ms 18.469us 15.582ms 14.10% 15.582ms 1.268us 12288
4200
+ aten::bmm 0.01% 222.715us 0.01% 267.616us 44.603us 15.462ms 13.99% 15.462ms 2.577ms 6
4201
+ ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 15.462ms 13.99% 15.462ms 2.577ms 6
4202
+ aten::copy_ 3.58% 67.106ms 8.62% 161.781ms 13.153us 13.339ms 12.07% 13.341ms 1.085us 12300
4203
+ Memcpy DtoD (Device -&gt; Device) 0.00% 0.000us 0.00% 0.000us 0.000us 13.337ms 12.07% 13.337ms 1.085us 12294
4204
+ aten::mul 3.09% 57.893ms 5.35% 100.363ms 16.311us 10.926ms 9.89% 10.927ms 1.776us 6153
4205
+ aten::add 4.06% 76.225ms 6.94% 130.290ms 14.319us 10.845ms 9.81% 10.845ms 1.192us 9099
4206
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 9.572ms 8.66% 9.572ms 1.558us 6144
4207
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 8.302ms 7.51% 8.302ms 1.351us 6144
4208
+ aten::remainder 2.99% 56.031ms 4.55% 85.473ms 13.912us 7.682ms 6.95% 7.682ms 1.250us 6144
4209
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 7.280ms 6.59% 7.280ms 1.185us 6144
4210
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 6.358ms 5.75% 6.358ms 1.076us 5910
4211
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 4.050ms 3.67% 4.050ms 1.318us 3072
4212
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 3.631ms 3.29% 3.631ms 1.182us 3072
4213
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 3.228ms 2.92% 3.228ms 1.013us 3186
4214
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4215
+ Self CPU time total: 1.876s
4216
+ Self CUDA time total: 110.507ms
4217
 
4218
 
4219
 
 
4223
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4224
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4225
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4226
+ binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 3.610s 1697.16% 3.610s 3.610s 1
4227
+ binned_torch 23.68% 855.222ms 100.00% 3.611s 3.611s 0.000us 0.00% 212.735ms 212.735ms 1
4228
+ Memcpy DtoH (Device -&gt; Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 63.569ms 29.88% 63.569ms 1.032us 61587
4229
+ aten::item 1.81% 65.197ms 27.34% 987.119ms 16.028us 0.000us 0.00% 63.568ms 1.032us 61587
4230
+ aten::_local_scalar_dense 6.48% 233.826ms 25.53% 921.922ms 14.969us 63.567ms 29.88% 63.568ms 1.032us 61587
4231
+ aten::floor_divide 5.24% 189.036ms 13.02% 470.235ms 19.134us 31.579ms 14.85% 31.582ms 1.285us 24576
4232
+ aten::bmm 0.01% 232.455us 0.01% 281.845us 46.974us 29.001ms 13.63% 29.001ms 4.833ms 6
4233
+ ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 29.001ms 13.63% 29.001ms 4.833ms 6
4234
+ aten::copy_ 3.67% 132.477ms 9.25% 334.079ms 13.587us 26.719ms 12.56% 26.722ms 1.087us 24588
4235
+ Memcpy DtoD (Device -&gt; Device) 0.00% 0.000us 0.00% 0.000us 0.000us 26.715ms 12.56% 26.715ms 1.087us 24585
4236
+ aten::mul 3.15% 113.903ms 5.68% 205.201ms 16.687us 25.580ms 12.03% 25.582ms 2.080us 12297
4237
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.132ms 10.40% 22.132ms 1.801us 12288
4238
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.992ms 7.99% 16.992ms 1.383us 12288
4239
+ aten::add 2.81% 101.355ms 4.98% 179.658ms 14.476us 16.634ms 7.82% 16.635ms 1.340us 12411
4240
+ aten::remainder 3.15% 113.609ms 4.99% 180.020ms 14.650us 15.413ms 7.25% 15.415ms 1.255us 12288
4241
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 14.588ms 6.86% 14.588ms 1.187us 12288
4242
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.512ms 6.35% 13.512ms 1.089us 12408
4243
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 8.121ms 3.82% 8.121ms 1.322us 6144
4244
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 7.292ms 3.43% 7.292ms 1.187us 6144
4245
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.612ms 1.23% 2.612ms 435.298us 6
4246
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4247
+ Self CPU time total: 3.611s
4248
+ Self CUDA time total: 212.720ms
4249
 
4250
 
4251
 
 
4255
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4256
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4257
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4258
+ binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 3.762s 1666.70% 3.762s 3.762s 1
4259
+ binned_torch 23.91% 899.748ms 100.00% 3.764s 3.764s 0.000us 0.00% 225.734ms 225.734ms 1
4260
+ aten::item 1.82% 68.620ms 27.46% 1.034s 15.235us 0.000us 0.00% 69.795ms 1.029us 67845
4261
+ aten::_local_scalar_dense 6.31% 237.441ms 25.64% 964.994ms 14.224us 69.792ms 30.92% 69.795ms 1.029us 67845
4262
+ Memcpy DtoH (Device -&gt; Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 69.793ms 30.92% 69.793ms 1.029us 67840
4263
+ aten::floor_divide 4.95% 186.290ms 12.17% 458.105ms 18.640us 31.553ms 13.98% 31.560ms 1.284us 24576
4264
+ aten::bmm 0.01% 226.315us 0.01% 272.505us 45.418us 29.269ms 12.97% 29.269ms 4.878ms 6
4265
+ ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 29.269ms 12.97% 29.269ms 4.878ms 6
4266
+ aten::copy_ 3.56% 134.013ms 8.54% 321.380ms 13.071us 26.742ms 11.85% 26.743ms 1.088us 24588
4267
+ Memcpy DtoD (Device -&gt; Device) 0.00% 0.000us 0.00% 0.000us 0.000us 26.740ms 11.85% 26.740ms 1.088us 24581
4268
+ aten::mul 3.06% 115.077ms 5.31% 199.757ms 16.244us 25.618ms 11.35% 25.618ms 2.083us 12297
4269
+ aten::add 4.14% 155.825ms 7.08% 266.365ms 14.291us 23.275ms 10.31% 23.276ms 1.249us 18639
4270
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.160ms 9.82% 22.160ms 1.803us 12288
4271
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 17.005ms 7.53% 17.005ms 1.384us 12287
4272
+ aten::remainder 2.93% 110.282ms 4.49% 168.952ms 13.749us 15.362ms 6.81% 15.364ms 1.250us 12288
4273
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 14.548ms 6.45% 14.548ms 1.184us 12287
4274
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.690ms 6.07% 13.690ms 1.103us 12407
4275
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 8.098ms 3.59% 8.098ms 1.318us 6144
4276
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 7.264ms 3.22% 7.264ms 1.182us 6144
4277
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.476ms 2.87% 6.476ms 1.040us 6228
4278
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4279
+ Self CPU time total: 3.764s
4280
+ Self CUDA time total: 225.722ms
4281
 
4282
 
4283
 
 
4287
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4288
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4289
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4290
+ binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 7.172s 1685.34% 7.172s 7.172s 1
4291
+ binned_torch 23.83% 1.712s 100.00% 7.184s 7.184s 0.000us 0.00% 425.602ms 425.602ms 1
4292
+ aten::item 1.77% 127.233ms 27.17% 1.952s 15.898us 0.000us 0.00% 127.069ms 1.035us 122763
4293
+ aten::_local_scalar_dense 6.22% 446.668ms 25.40% 1.825s 14.862us 127.060ms 29.86% 127.069ms 1.035us 122763
4294
+ Memcpy DtoH (Device -&gt; Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 127.060ms 29.86% 127.060ms 1.035us 122762
4295
+ aten::floor_divide 5.22% 375.373ms 13.07% 938.750ms 19.099us 63.372ms 14.89% 63.374ms 1.289us 49152
4296
+ ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 57.057ms 13.41% 57.057ms 9.509ms 6
4297
+ aten::bmm 0.00% 232.954us 0.00% 280.556us 46.759us 57.057ms 13.41% 57.057ms 9.509ms 6
4298
+ aten::copy_ 3.67% 263.382ms 9.14% 656.814ms 13.361us 53.605ms 12.60% 53.606ms 1.090us 49158
4299
+ Memcpy DtoD (Device -&gt; Device) 0.00% 0.000us 0.00% 0.000us 0.000us 53.603ms 12.60% 53.603ms 1.091us 49154
4300
+ aten::mul 3.19% 229.239ms 5.71% 410.065ms 16.679us 51.561ms 12.12% 51.568ms 2.098us 24585
4301
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 44.597ms 10.48% 44.597ms 1.815us 24576
4302
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 34.170ms 8.03% 34.170ms 1.390us 24576
4303
+ aten::add 2.78% 199.917ms 4.97% 356.982ms 14.609us 33.583ms 7.89% 33.584ms 1.374us 24435
4304
+ aten::remainder 3.17% 227.943ms 4.97% 356.780ms 14.517us 30.902ms 7.26% 30.903ms 1.257us 24576
4305
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 29.202ms 6.86% 29.202ms 1.188us 24576
4306
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 26.924ms 6.33% 26.924ms 1.102us 24431
4307
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.278ms 3.82% 16.278ms 1.325us 12288
4308
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 14.628ms 3.44% 14.628ms 1.190us 12288
4309
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 5.242ms 1.23% 5.242ms 873.601us 6
4310
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4311
+ Self CPU time total: 7.184s
4312
+ Self CUDA time total: 425.579ms
4313
 
4314
 
4315
 
 
4319
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4320
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4321
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4322
+ binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 7.590s 1687.04% 7.590s 7.590s 1
4323
+ binned_torch 23.93% 1.817s 100.00% 7.592s 7.592s 0.000us 0.00% 449.935ms 449.935ms 1
4324
+ aten::item 1.74% 131.929ms 27.26% 2.070s 15.365us 0.000us 0.00% 139.467ms 1.035us 134715
4325
+ aten::_local_scalar_dense 6.36% 483.083ms 25.53% 1.938s 14.386us 139.456ms 31.00% 139.467ms 1.035us 134715
4326
+ Memcpy DtoH (Device -&gt; Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 139.456ms 31.00% 139.456ms 1.035us 134706
4327
+ aten::floor_divide 4.94% 375.293ms 12.19% 925.665ms 18.833us 63.455ms 14.10% 63.460ms 1.291us 49152
4328
+ aten::bmm 0.00% 234.075us 0.00% 282.947us 47.158us 56.663ms 12.59% 56.663ms 9.444ms 6
4329
+ ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 56.663ms 12.59% 56.663ms 9.444ms 6
4330
+ aten::copy_ 3.75% 285.044ms 8.75% 664.131ms 13.510us 53.858ms 11.97% 53.860ms 1.096us 49158
4331
+ Memcpy DtoD (Device -&gt; Device) 0.00% 0.000us 0.00% 0.000us 0.000us 53.855ms 11.97% 53.855ms 1.096us 49149
4332
+ aten::mul 3.08% 233.920ms 5.34% 405.684ms 16.501us 51.582ms 11.47% 51.587ms 2.098us 24585
4333
+ aten::add 3.87% 294.168ms 6.87% 521.854ms 14.354us 45.530ms 10.12% 45.534ms 1.252us 36357
4334
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 44.640ms 9.92% 44.640ms 1.816us 24576
4335
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 34.166ms 7.59% 34.166ms 1.390us 24573
4336
+ aten::remainder 2.91% 220.707ms 4.59% 348.339ms 14.174us 30.841ms 6.86% 30.843ms 1.255us 24576
4337
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 29.291ms 6.51% 29.291ms 1.192us 24573
4338
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 26.632ms 5.92% 26.632ms 1.090us 24431
4339
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.258ms 3.61% 16.258ms 1.323us 12288
4340
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 14.582ms 3.24% 14.582ms 1.187us 12288
4341
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 12.272ms 2.73% 12.272ms 1.029us 11922
4342
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4343
+ Self CPU time total: 7.592s
4344
+ Self CUDA time total: 449.893ms
4345
 
4346
 
4347
  impl wl p50(ms) ok
4348
+ binned_torch cuda_B1_S1024_E2 377.89 True
4349
+ binned_torch cuda_B1_S1024_E4 408.91 True
4350
+ binned_torch cuda_B1_S512_E2 158.27 True
4351
+ binned_torch cuda_B1_S512_E4 209.01 True
4352
+ binned_torch cuda_B4_S1024_E2 1516.51 True
4353
+ binned_torch cuda_B4_S1024_E4 1643.14 True
4354
+ binned_torch cuda_B4_S512_E2 769.64 True
4355
+ binned_torch cuda_B4_S512_E4 816.95 True
4356
  </pre></div>
4357
+ <div class="uv-install-logs" id="uv-logs-benchmark">
4358
+ <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4359
+ <div class="uv-logs-content" style="display: none;">
4360
+ Installed 37 packages in 284ms
4361
+ </div>
4362
+ </div>
4363
  <div class="cell-artifacts">
4364
  <h4>Artifacts:</h4>
4365
  <a href="artifacts/benchmark/openai_moe.jsonl" class="artifact" target="_blank">openai_moe.jsonl</a>
openai_moe/impls/cells/benchmark.py CHANGED
@@ -4,59 +4,24 @@
4
  # "numpy",
5
  # "torch==2.8.0",
6
  # "kernels-benchmark-tools",
 
7
  # ]
8
  #
9
  # [tool.uv.sources]
10
  # kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
 
11
  # ///
12
  import torch
13
  import sys
14
  from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
 
15
 
 
 
 
16
 
17
- def binned_gather(x, indices, bins, expert_capacity, top_k):
18
- E, H = bins.shape[0], x.shape[1]
19
- out = torch.zeros((E, expert_capacity, H), device=x.device, dtype=x.dtype)
20
- for e in range(E):
21
- start = 0 if e == 0 else bins[e - 1]
22
- end = bins[e]
23
- n = min(end - start, expert_capacity)
24
- for i in range(n):
25
- flat_pos = indices[start + i]
26
- tok = flat_pos // top_k
27
- out[e, i] = x[tok]
28
- return out
29
 
30
-
31
- def binned_scatter(x, indices, weights, bins, expert_capacity, top_k):
32
- E, C, H = x.shape
33
- N = indices.shape[0] // top_k
34
- out = torch.zeros((N, top_k, H), dtype=x.dtype, device=x.device)
35
- for e in range(E):
36
- start = 0 if e == 0 else bins[e - 1]
37
- end = bins[e]
38
- n = end - start
39
- if n == 0:
40
- continue
41
- take = min(n, expert_capacity)
42
- for i in range(take):
43
- flat_pos = indices[start + i] # flattened (token, slot)
44
- tok = flat_pos // top_k
45
- slot = flat_pos % top_k
46
- scale = weights[flat_pos] if weights is not None else 1.0
47
- out[tok, slot] = x[e, i] * scale
48
- return out.sum(dim=1)
49
-
50
-
51
- def sort_tokens_by_expert(router_indices, num_experts):
52
- flat_indices = router_indices.flatten()
53
- sorted_values, sorted_indices = torch.sort(flat_indices)
54
- tokens_per_expert = torch.bincount(sorted_values, minlength=num_experts)
55
- bins = torch.cumsum(tokens_per_expert, dim=0)
56
- return sorted_indices, sorted_values, bins, tokens_per_expert
57
-
58
-
59
- def binned_experts_ref(
60
  hidden_states,
61
  router_indices,
62
  routing_weights,
@@ -64,73 +29,53 @@ def binned_experts_ref(
64
  gate_up_proj_bias,
65
  down_proj,
66
  down_proj_bias,
67
- expert_capacity,
68
  ):
 
 
 
 
69
  B, S, H = hidden_states.shape
70
- E, K = routing_weights.shape[2], router_indices.shape[1]
71
-
72
- indices, _, bins, _ = sort_tokens_by_expert(router_indices, E)
73
- x = binned_gather(hidden_states.view(-1, H), indices, bins, expert_capacity, K)
74
 
75
- gate_up = torch.bmm(x, gate_up_proj) + gate_up_proj_bias[..., None, :]
76
- gate, up = gate_up[..., ::2], gate_up[..., 1::2]
 
 
 
77
 
78
- # clamp to limit
79
- limit = 7.0
80
- gate = gate.clamp(min=None, max=limit)
81
- up = up.clamp(min=-limit, max=limit)
82
 
83
- glu = gate * torch.sigmoid(gate * 1.702)
84
- x = (up + 1) * glu
85
- x = torch.bmm(x, down_proj) + down_proj_bias[..., None, :]
 
 
86
 
87
- # build routing weights aligned to (token, slot)
88
- flat_dense = routing_weights.view(-1, E) # [B*S, E]
89
- flat_router = router_indices.view(-1, K) # [B*S, K]
90
- selected = torch.gather(flat_dense, 1, flat_router).reshape(-1) # [B*S*K]
91
 
92
- # scatter back
93
- y = binned_scatter(x, indices, selected, bins, expert_capacity, K) # [B*S, H]
 
 
94
 
95
- return y.view(B, S, H)
 
96
 
 
 
 
97
 
98
- def binned_torch_openai_moe(
99
- hidden_states,
100
- router_indices,
101
- routing_weights,
102
- gate_up_proj,
103
- gate_up_proj_bias,
104
- down_proj,
105
- down_proj_bias,
106
- ):
107
- """
108
- Binned PyTorch implementation of OpenAI-style MoE.
109
- Sorts tokens by expert assignment for more efficient batched processing.
110
- """
111
- B, S = hidden_states.shape[0], hidden_states.shape[1]
112
- K = router_indices.shape[1]
113
-
114
- # Set expert_capacity to a reasonable value (max tokens per expert)
115
- # Use 2x the average to handle imbalance
116
- expert_capacity = (B * S * K * 2) // routing_weights.shape[2]
117
 
118
- return binned_experts_ref(
119
- hidden_states,
120
- router_indices,
121
- routing_weights,
122
- gate_up_proj,
123
- gate_up_proj_bias,
124
- down_proj,
125
- down_proj_bias,
126
- expert_capacity,
127
- )
128
 
129
 
130
  run_benchmark(
131
  kernel_type=KernelTypeEnum.OPENAI_MOE,
132
- impl_name="binned_torch",
133
- impl_tags={"family": "pytorch", "backend": "eager"},
134
- impl_func=binned_torch_openai_moe,
135
  dtype="float32",
136
  )
 
4
  # "numpy",
5
  # "torch==2.8.0",
6
  # "kernels-benchmark-tools",
7
+ # "kernels",
8
  # ]
9
  #
10
  # [tool.uv.sources]
11
  # kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
12
+ # kernels = { git = "https://github.com/huggingface/kernels.git" }
13
  # ///
14
  import torch
15
  import sys
16
  from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
17
+ from kernels import get_kernel
18
 
19
+ # Load yamoe to get GptOssExperts reference
20
+ yamoe = get_kernel("drbh/yamoe", revision="v0.2.0")
21
+ GptOssExperts = yamoe.vendored.gpt_oss_mlp.GptOssExperts
22
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
+ def gpt_oss_openai_moe(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  hidden_states,
26
  router_indices,
27
  routing_weights,
 
29
  gate_up_proj_bias,
30
  down_proj,
31
  down_proj_bias,
 
32
  ):
33
+ """
34
+ GptOssExperts reference implementation of OpenAI-style MoE.
35
+ This is the reference model implementation from the original GPT OSS codebase.
36
+ """
37
  B, S, H = hidden_states.shape
38
+ E = routing_weights.shape[2]
 
 
 
39
 
40
+ # Create a config object for GptOssExperts
41
+ config = type("Config", (), {})()
42
+ config.hidden_size = H
43
+ config.intermediate_size = gate_up_proj.shape[2] // 2 # expert_dim / 2 = H
44
+ config.num_local_experts = E
45
 
46
+ # Initialize model
47
+ model = GptOssExperts(config)
 
 
48
 
49
+ # Set weights from benchmark inputs
50
+ model.gate_up_proj.data = gate_up_proj
51
+ model.gate_up_proj_bias.data = gate_up_proj_bias
52
+ model.down_proj.data = down_proj
53
+ model.down_proj_bias.data = down_proj_bias
54
 
55
+ model = model.to(hidden_states.device)
56
+ model.eval()
 
 
57
 
58
+ # Force GptOssExperts to use CPU path for correctness (matches naive_moe_ref behavior)
59
+ # The GPU path processes all experts which can lead to numerical differences
60
+ # CPU path explicitly uses router_indices like the reference implementation
61
+ model.train() # Force CPU path
62
 
63
+ # Flatten routing_weights to [batch_seq, num_experts]
64
+ routing_weights_flat = routing_weights.view(-1, E)
65
 
66
+ # Run forward pass
67
+ with torch.no_grad():
68
+ output = model(hidden_states, router_indices, routing_weights_flat)
69
 
70
+ model.eval() # Reset to eval mode
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
 
72
+ return output
 
 
 
 
 
 
 
 
 
73
 
74
 
75
  run_benchmark(
76
  kernel_type=KernelTypeEnum.OPENAI_MOE,
77
+ impl_name="gpt_oss_experts",
78
+ impl_tags={"family": "reference", "backend": "pytorch"},
79
+ impl_func=gpt_oss_openai_moe,
80
  dtype="float32",
81
  )
openai_moe/impls/gpt_oss_moe.html CHANGED
@@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content {
3888
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3890
  </span> |
3891
- Cell: nv | 0.25s
3892
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3894
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3905,7 +3905,7 @@ Cell: nv | 0.25s
3905
  </div>
3906
  </div>
3907
  <div id="output-nv" class="cell-output">
3908
- <div class="cell-stdout"><pre class="stdout-text">Fri Dec 19 18:56:28 2025
3909
  +-----------------------------------------------------------------------------------------+
3910
  | NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 |
3911
  +-----------------------------------------+------------------------+----------------------+
@@ -3914,7 +3914,7 @@ Cell: nv | 0.25s
3914
  | | | MIG M. |
3915
  |=========================================+========================+======================|
3916
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3917
- | N/A 34C P0 80W / 350W | 0MiB / 46068MiB | 41% Default |
3918
  | | | N/A |
3919
  +-----------------------------------------+------------------------+----------------------+
3920
 
@@ -3938,7 +3938,7 @@ Cell: nv | 0.25s
3938
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3939
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3940
  </span> |
3941
- Cell: benchmark | 24.78s
3942
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3943
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3944
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -4042,29 +4042,29 @@ PROFILE TRACE: gpt_oss_experts | cuda_B1_S512_E2
4042
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4043
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4044
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4045
- gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 10.216ms 197.55% 10.216ms 10.216ms 1
4046
- gpt_oss_experts 15.91% 1.991ms 99.94% 12.506ms 12.506ms 0.000us 0.00% 5.174ms 5.174ms 1
4047
- aten::matmul 0.20% 25.600us 3.83% 479.475us 39.956us 0.000us 0.00% 4.551ms 379.252us 12
4048
- aten::mm 2.39% 299.076us 3.63% 453.875us 37.823us 4.551ms 88.01% 4.551ms 379.252us 12
4049
- ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 3.080ms 59.56% 3.080ms 342.220us 9
4050
- void cutlass::Kernel2&lt;cutlass_80_simt_sgemm_128x64_8... 0.00% 0.000us 0.00% 0.000us 0.000us 1.465ms 28.33% 1.465ms 488.237us 3
4051
- aten::mul 1.26% 158.145us 2.18% 272.217us 11.342us 108.227us 2.09% 108.227us 4.509us 24
4052
- aten::add 1.55% 194.211us 3.70% 462.764us 25.709us 102.178us 1.98% 102.178us 5.677us 18
4053
- aten::index 1.59% 198.973us 2.67% 334.663us 27.889us 88.354us 1.71% 88.354us 7.363us 12
4054
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 79.810us 1.54% 79.810us 6.651us 12
4055
- aten::index_add_ 0.45% 56.680us 0.73% 90.740us 15.123us 79.552us 1.54% 79.552us 13.259us 6
4056
- void at::native::indexFuncLargeIndex&lt;float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 79.552us 1.54% 79.552us 13.259us 6
4057
- aten::nonzero 2.18% 273.387us 6.63% 829.392us 92.155us 65.344us 1.26% 76.032us 8.448us 9
4058
- aten::clamp 1.03% 129.422us 1.66% 207.823us 17.319us 62.817us 1.21% 62.817us 5.235us 12
4059
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 62.817us 1.21% 62.817us 5.235us 12
4060
- aten::where 0.06% 7.719us 5.20% 651.098us 108.516us 0.000us 0.00% 61.377us 10.230us 6
4061
- aten::nonzero_numpy 0.10% 11.990us 5.14% 643.379us 107.230us 0.000us 0.00% 61.377us 10.230us 6
4062
- void at::native::vectorized_gather_kernel&lt;16, long&gt;(... 0.00% 0.000us 0.00% 0.000us 0.000us 60.705us 1.17% 60.705us 10.117us 6
4063
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 56.224us 1.09% 56.224us 4.685us 12
4064
- Memcpy DtoH (Device -&gt; Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 52.097us 1.01% 52.097us 1.158us 45
4065
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4066
- Self CPU time total: 12.513ms
4067
- Self CUDA time total: 5.171ms
4068
 
4069
 
4070
 
@@ -4074,29 +4074,29 @@ PROFILE TRACE: gpt_oss_experts | cuda_B1_S512_E4
4074
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4075
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4076
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4077
- gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 13.651ms 223.52% 13.651ms 13.651ms 1
4078
- gpt_oss_experts 16.12% 2.545ms 99.96% 15.780ms 15.780ms 0.000us 0.00% 6.110ms 6.110ms 1
4079
- aten::matmul 0.27% 42.481us 4.88% 770.802us 32.117us 0.000us 0.00% 5.294ms 220.572us 24
4080
- aten::mm 2.84% 449.097us 4.61% 728.321us 30.347us 5.294ms 86.68% 5.294ms 220.572us 24
4081
- ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 5.237ms 85.76% 5.237ms 218.225us 24
4082
- aten::nonzero 2.39% 377.468us 7.75% 1.223ms 81.521us 114.980us 1.88% 137.541us 9.169us 15
4083
- aten::mul 1.82% 287.750us 3.13% 494.205us 10.296us 131.291us 2.15% 131.291us 2.735us 48
4084
- aten::add 2.12% 335.279us 3.54% 558.312us 15.509us 126.947us 2.08% 126.947us 3.526us 36
4085
- aten::where 0.06% 10.192us 7.29% 1.151ms 95.886us 0.000us 0.00% 123.269us 10.272us 12
4086
- aten::nonzero_numpy 0.13% 20.434us 7.22% 1.140ms 95.037us 0.000us 0.00% 123.269us 10.272us 12
4087
- aten::index 2.26% 356.611us 3.79% 598.637us 24.943us 111.201us 1.82% 111.201us 4.633us 24
4088
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 100.995us 1.65% 100.995us 4.208us 24
4089
- Memcpy DtoH (Device -&gt; Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 92.827us 1.52% 92.827us 1.067us 87
4090
- aten::clamp 1.32% 208.364us 2.23% 352.254us 14.677us 87.969us 1.44% 87.969us 3.665us 24
4091
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 87.969us 1.44% 87.969us 3.665us 24
4092
- aten::item 0.49% 76.878us 38.49% 6.076ms 84.392us 0.000us 0.00% 76.474us 1.062us 72
4093
- aten::_local_scalar_dense 1.91% 301.114us 38.00% 5.999ms 83.325us 76.474us 1.25% 76.474us 1.062us 72
4094
- aten::index_add_ 0.59% 93.433us 0.97% 153.683us 12.807us 71.618us 1.17% 71.618us 5.968us 12
4095
- void at::native::indexFuncLargeIndex&lt;float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 71.618us 1.17% 71.618us 5.968us 12
4096
- void at::native::vectorized_gather_kernel&lt;16, long&gt;(... 0.00% 0.000us 0.00% 0.000us 0.000us 66.305us 1.09% 66.305us 5.525us 12
4097
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4098
- Self CPU time total: 15.786ms
4099
- Self CUDA time total: 6.107ms
4100
 
4101
 
4102
 
@@ -4106,29 +4106,29 @@ PROFILE TRACE: gpt_oss_experts | cuda_B1_S1024_E2
4106
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4107
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4108
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4109
- gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 12.389ms 148.13% 12.389ms 12.389ms 1
4110
- gpt_oss_experts 11.41% 1.669ms 99.96% 14.621ms 14.621ms 0.000us 0.00% 8.369ms 8.369ms 1
4111
- aten::matmul 0.15% 21.391us 2.94% 430.078us 35.840us 0.000us 0.00% 7.346ms 612.203us 12
4112
- aten::mm 1.75% 256.389us 2.79% 408.687us 34.057us 7.346ms 87.84% 7.346ms 612.203us 12
4113
- void cutlass::Kernel2&lt;cutlass_80_simt_sgemm_256x128_... 0.00% 0.000us 0.00% 0.000us 0.000us 4.488ms 53.66% 4.488ms 748.004us 6
4114
- ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 1.464ms 17.50% 1.464ms 487.982us 3
4115
- void cutlass::Kernel2&lt;cutlass_80_simt_sgemm_128x64_8... 0.00% 0.000us 0.00% 0.000us 0.000us 1.388ms 16.59% 1.388ms 462.616us 3
4116
- aten::mul 1.00% 145.604us 1.75% 255.696us 10.654us 194.273us 2.32% 194.273us 8.095us 24
4117
- aten::add 1.43% 208.704us 2.27% 331.465us 18.415us 186.050us 2.22% 186.050us 10.336us 18
4118
- aten::index_add_ 0.32% 46.701us 0.54% 78.582us 13.097us 164.160us 1.96% 164.160us 27.360us 6
4119
- void at::native::indexFuncLargeIndex&lt;float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 164.160us 1.96% 164.160us 27.360us 6
4120
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 147.425us 1.76% 147.425us 12.285us 12
4121
- aten::index 1.21% 177.253us 2.08% 304.936us 25.411us 145.886us 1.74% 145.886us 12.157us 12
4122
- void at::native::vectorized_gather_kernel&lt;16, long&gt;(... 0.00% 0.000us 0.00% 0.000us 0.000us 115.777us 1.38% 115.777us 19.296us 6
4123
- aten::clamp 0.73% 106.215us 1.25% 183.083us 15.257us 109.858us 1.31% 109.858us 9.155us 12
4124
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 109.858us 1.31% 109.858us 9.155us 12
4125
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 103.393us 1.24% 103.393us 8.616us 12
4126
- aten::nonzero 1.57% 229.936us 5.04% 737.613us 81.957us 69.954us 0.84% 81.378us 9.042us 9
4127
- aten::where 0.04% 5.651us 4.11% 600.652us 100.109us 0.000us 0.00% 66.625us 11.104us 6
4128
- aten::nonzero_numpy 0.07% 10.392us 4.07% 595.001us 99.167us 0.000us 0.00% 66.625us 11.104us 6
4129
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4130
- Self CPU time total: 14.627ms
4131
- Self CUDA time total: 8.364ms
4132
 
4133
 
4134
 
@@ -4138,29 +4138,29 @@ PROFILE TRACE: gpt_oss_experts | cuda_B1_S1024_E4
4138
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4139
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4140
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4141
- gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 18.030ms 173.64% 18.030ms 18.030ms 1
4142
- gpt_oss_experts 13.01% 2.655ms 99.97% 20.395ms 20.395ms 0.000us 0.00% 10.389ms 10.389ms 1
4143
- aten::matmul 0.22% 44.301us 3.96% 808.849us 33.702us 0.000us 0.00% 9.112ms 379.676us 24
4144
- aten::mm 2.30% 469.031us 3.75% 764.548us 31.856us 9.112ms 87.75% 9.112ms 379.676us 24
4145
- ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 6.210ms 59.81% 6.210ms 345.012us 18
4146
- void cutlass::Kernel2&lt;cutlass_80_simt_sgemm_128x64_8... 0.00% 0.000us 0.00% 0.000us 0.000us 2.889ms 27.82% 2.889ms 481.470us 6
4147
- aten::mul 1.42% 289.963us 2.49% 508.435us 10.592us 229.763us 2.21% 229.763us 4.787us 48
4148
- aten::add 1.72% 350.925us 2.89% 589.949us 16.387us 210.624us 2.03% 210.624us 5.851us 36
4149
- aten::index 1.71% 348.756us 3.02% 616.348us 25.681us 206.625us 1.99% 206.625us 8.609us 24
4150
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 164.800us 1.59% 164.800us 6.867us 24
4151
- aten::index_add_ 0.46% 94.741us 0.78% 158.583us 13.215us 154.948us 1.49% 154.948us 12.912us 12
4152
- void at::native::indexFuncLargeIndex&lt;float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 154.948us 1.49% 154.948us 12.912us 12
4153
- aten::nonzero 1.87% 380.973us 6.27% 1.279ms 85.299us 123.616us 1.19% 148.097us 9.873us 15
4154
- void at::native::vectorized_gather_kernel&lt;16, long&gt;(... 0.00% 0.000us 0.00% 0.000us 0.000us 147.008us 1.42% 147.008us 12.251us 12
4155
- aten::where 0.05% 10.520us 5.90% 1.205ms 100.384us 0.000us 0.00% 133.153us 11.096us 12
4156
- aten::nonzero_numpy 0.10% 20.862us 5.85% 1.194ms 99.507us 0.000us 0.00% 133.153us 11.096us 12
4157
- aten::clamp 1.12% 227.601us 1.88% 383.872us 15.995us 131.553us 1.27% 131.553us 5.481us 24
4158
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 131.553us 1.27% 131.553us 5.481us 24
4159
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 117.823us 1.13% 117.823us 4.909us 24
4160
- Memcpy DtoH (Device -&gt; Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 108.771us 1.05% 108.771us 1.250us 87
4161
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4162
- Self CPU time total: 20.401ms
4163
- Self CUDA time total: 10.384ms
4164
 
4165
 
4166
 
@@ -4170,29 +4170,29 @@ PROFILE TRACE: gpt_oss_experts | cuda_B4_S512_E2
4170
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4171
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4172
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4173
- gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 20.818ms 119.74% 20.818ms 20.818ms 1
4174
- gpt_oss_experts 7.44% 1.725ms 99.97% 23.178ms 23.178ms 0.000us 0.00% 17.396ms 17.396ms 1
4175
- aten::matmul 0.10% 22.710us 1.92% 444.608us 37.051us 0.000us 0.00% 14.530ms 1.211ms 12
4176
- aten::mm 1.15% 265.607us 1.82% 421.898us 35.158us 14.530ms 83.57% 14.530ms 1.211ms 12
4177
- void cutlass::Kernel2&lt;cutlass_80_simt_sgemm_256x128_... 0.00% 0.000us 0.00% 0.000us 0.000us 8.913ms 51.26% 8.913ms 1.485ms 6
4178
- ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 5.608ms 32.26% 5.608ms 934.678us 6
4179
- aten::add 0.78% 180.710us 1.31% 303.585us 16.866us 773.156us 4.45% 773.156us 42.953us 18
4180
- aten::mul 0.65% 149.642us 1.11% 257.853us 10.744us 660.963us 3.80% 660.963us 27.540us 24
4181
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 496.548us 2.86% 496.548us 41.379us 12
4182
- aten::index_add_ 0.21% 47.690us 0.35% 80.102us 13.350us 447.875us 2.58% 447.875us 74.646us 6
4183
- void at::native::indexFuncLargeIndex&lt;float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 447.875us 2.58% 447.875us 74.646us 6
4184
- aten::clamp 0.46% 106.452us 0.78% 180.843us 15.070us 330.692us 1.90% 330.692us 27.558us 12
4185
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 330.692us 1.90% 330.692us 27.558us 12
4186
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 303.202us 1.74% 303.202us 50.534us 6
4187
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 276.608us 1.59% 276.608us 46.101us 6
4188
- aten::index 0.79% 182.360us 1.33% 307.754us 25.646us 264.037us 1.52% 264.037us 22.003us 12
4189
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 255.650us 1.47% 255.650us 21.304us 12
4190
- void at::native::vectorized_gather_kernel&lt;16, long&gt;(... 0.00% 0.000us 0.00% 0.000us 0.000us 230.532us 1.33% 230.532us 38.422us 6
4191
- aten::sigmoid 0.15% 34.019us 0.26% 59.750us 9.958us 176.897us 1.02% 176.897us 29.483us 6
4192
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 176.897us 1.02% 176.897us 29.483us 6
4193
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4194
- Self CPU time total: 23.184ms
4195
- Self CUDA time total: 17.386ms
4196
 
4197
 
4198
 
@@ -4202,29 +4202,29 @@ PROFILE TRACE: gpt_oss_experts | cuda_B4_S512_E4
4202
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4203
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4204
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4205
- gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 24.095ms 139.71% 24.095ms 24.095ms 1
4206
- gpt_oss_experts 10.26% 2.566ms 99.98% 25.007ms 25.007ms 0.000us 0.00% 17.256ms 17.256ms 1
4207
- aten::matmul 0.18% 46.022us 3.50% 875.333us 36.472us 0.000us 0.00% 15.047ms 626.957us 24
4208
- aten::mm 2.10% 524.786us 3.32% 829.311us 34.555us 15.047ms 87.25% 15.047ms 626.957us 24
4209
- void cutlass::Kernel2&lt;cutlass_80_simt_sgemm_256x128_... 0.00% 0.000us 0.00% 0.000us 0.000us 9.083ms 52.67% 9.083ms 756.906us 12
4210
- ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 3.100ms 17.97% 3.100ms 516.616us 6
4211
- void cutlass::Kernel2&lt;cutlass_80_simt_sgemm_128x64_8... 0.00% 0.000us 0.00% 0.000us 0.000us 2.851ms 16.53% 2.851ms 475.118us 6
4212
- aten::add 1.39% 348.094us 2.36% 591.130us 16.420us 420.966us 2.44% 420.966us 11.694us 36
4213
- aten::mul 1.18% 295.904us 2.08% 520.297us 10.840us 412.933us 2.39% 412.933us 8.603us 48
4214
- aten::index_add_ 0.37% 93.743us 0.64% 158.984us 13.249us 378.655us 2.20% 378.655us 31.555us 12
4215
- void at::native::indexFuncLargeIndex&lt;float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 378.655us 2.20% 378.655us 31.555us 12
4216
- aten::index 1.44% 360.181us 2.46% 616.468us 25.686us 341.602us 1.98% 341.602us 14.233us 24
4217
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 337.478us 1.96% 337.478us 14.062us 24
4218
- void at::native::vectorized_gather_kernel&lt;16, long&gt;(... 0.00% 0.000us 0.00% 0.000us 0.000us 277.186us 1.61% 277.186us 23.099us 12
4219
- aten::clamp 0.86% 215.346us 1.46% 365.788us 15.241us 227.201us 1.32% 227.201us 9.467us 24
4220
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 227.201us 1.32% 227.201us 9.467us 24
4221
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 218.148us 1.26% 218.148us 9.090us 24
4222
- aten::nonzero 1.58% 395.427us 5.08% 1.271ms 84.763us 129.407us 0.75% 155.998us 10.400us 15
4223
- aten::where 0.04% 10.161us 4.81% 1.203ms 100.233us 0.000us 0.00% 140.318us 11.693us 12
4224
- aten::nonzero_numpy 0.09% 22.657us 4.77% 1.193ms 99.386us 0.000us 0.00% 140.318us 11.693us 12
4225
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4226
- Self CPU time total: 25.012ms
4227
- Self CUDA time total: 17.246ms
4228
 
4229
 
4230
 
@@ -4234,29 +4234,29 @@ PROFILE TRACE: gpt_oss_experts | cuda_B4_S1024_E2
4234
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4235
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4236
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4237
- gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 40.042ms 109.28% 40.042ms 40.042ms 1
4238
- gpt_oss_experts 4.23% 1.729ms 99.82% 40.817ms 40.817ms 0.000us 0.00% 36.674ms 36.674ms 1
4239
- aten::matmul 0.05% 21.410us 1.03% 421.330us 35.111us 0.000us 0.00% 26.675ms 2.223ms 12
4240
- aten::mm 0.68% 276.698us 0.98% 399.920us 33.327us 26.675ms 72.80% 26.675ms 2.223ms 12
4241
- void cutlass::Kernel2&lt;cutlass_80_simt_sgemm_256x128_... 0.00% 0.000us 0.00% 0.000us 0.000us 26.671ms 72.79% 26.671ms 2.223ms 12
4242
- aten::mul 0.37% 150.524us 0.64% 261.025us 10.876us 2.978ms 8.13% 2.978ms 124.096us 24
4243
- aten::add 0.45% 185.051us 1.06% 431.657us 23.981us 2.397ms 6.54% 2.397ms 133.144us 18
4244
- aten::clamp 0.27% 109.540us 0.45% 185.742us 15.479us 2.388ms 6.52% 2.388ms 199.031us 12
4245
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 2.388ms 6.52% 2.388ms 199.031us 12
4246
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 1.988ms 5.43% 1.988ms 165.705us 12
4247
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 1.624ms 4.43% 1.624ms 135.337us 12
4248
- aten::index_add_ 0.12% 48.010us 0.20% 82.940us 13.823us 919.238us 2.51% 919.238us 153.206us 6
4249
- void at::native::indexFuncLargeIndex&lt;float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 919.238us 2.51% 919.238us 153.206us 6
4250
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 772.550us 2.11% 772.550us 128.758us 6
4251
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 739.366us 2.02% 739.366us 123.228us 6
4252
- aten::index 0.45% 182.853us 0.76% 309.646us 25.804us 710.532us 1.94% 710.532us 59.211us 12
4253
- void at::native::vectorized_gather_kernel&lt;16, long&gt;(... 0.00% 0.000us 0.00% 0.000us 0.000us 676.741us 1.85% 676.741us 112.790us 6
4254
- aten::sigmoid 0.10% 42.329us 0.17% 69.270us 11.545us 319.457us 0.87% 319.457us 53.243us 6
4255
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 319.457us 0.87% 319.457us 53.243us 6
4256
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 250.467us 0.68% 250.467us 41.744us 6
4257
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4258
- Self CPU time total: 40.890ms
4259
- Self CUDA time total: 36.640ms
4260
 
4261
 
4262
 
@@ -4266,40 +4266,40 @@ PROFILE TRACE: gpt_oss_experts | cuda_B4_S1024_E4
4266
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4267
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4268
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4269
- gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 40.661ms 117.09% 40.661ms 40.661ms 1
4270
- gpt_oss_experts 6.16% 2.556ms 99.99% 41.476ms 41.476ms 0.000us 0.00% 34.747ms 34.747ms 1
4271
- aten::matmul 0.11% 44.399us 2.11% 876.925us 36.539us 0.000us 0.00% 28.768ms 1.199ms 24
4272
- aten::mm 1.26% 521.881us 2.01% 832.526us 34.689us 28.768ms 82.84% 28.768ms 1.199ms 24
4273
- void cutlass::Kernel2&lt;cutlass_80_simt_sgemm_256x128_... 0.00% 0.000us 0.00% 0.000us 0.000us 20.394ms 58.72% 20.394ms 1.360ms 15
4274
- ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 8.357ms 24.06% 8.357ms 928.569us 9
4275
- aten::add 0.86% 357.079us 1.47% 609.793us 16.939us 1.481ms 4.26% 1.481ms 41.126us 36
4276
- aten::mul 0.72% 298.967us 1.26% 524.144us 10.920us 1.380ms 3.97% 1.380ms 28.743us 48
4277
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 929.416us 2.68% 929.416us 38.726us 24
4278
- aten::index_add_ 0.23% 94.554us 0.39% 161.804us 13.484us 921.702us 2.65% 921.702us 76.809us 12
4279
- void at::native::indexFuncLargeIndex&lt;float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 921.702us 2.65% 921.702us 76.809us 12
4280
- aten::clamp 0.53% 218.042us 0.91% 375.616us 15.651us 772.487us 2.22% 772.487us 32.187us 24
4281
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 772.487us 2.22% 772.487us 32.187us 24
4282
- aten::index 0.86% 357.217us 1.47% 607.740us 25.323us 652.838us 1.88% 652.838us 27.202us 24
4283
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 648.162us 1.87% 648.162us 54.013us 12
4284
- void at::native::vectorized_gather_kernel&lt;16, long&gt;(... 0.00% 0.000us 0.00% 0.000us 0.000us 580.997us 1.67% 580.997us 48.416us 12
4285
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 551.108us 1.59% 551.108us 45.926us 12
4286
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 524.097us 1.51% 524.097us 21.837us 24
4287
- aten::sigmoid 0.17% 69.444us 0.30% 123.064us 10.255us 357.924us 1.03% 357.924us 29.827us 12
4288
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 357.924us 1.03% 357.924us 29.827us 12
4289
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4290
- Self CPU time total: 41.482ms
4291
- Self CUDA time total: 34.727ms
4292
 
4293
 
4294
  impl wl p50(ms) ok
4295
- gpt_oss_experts cuda_B1_S1024_E2 3.77 True
4296
- gpt_oss_experts cuda_B1_S1024_E4 5.20 True
4297
- gpt_oss_experts cuda_B1_S512_E2 2.61 True
4298
- gpt_oss_experts cuda_B1_S512_E4 3.85 True
4299
- gpt_oss_experts cuda_B4_S1024_E2 13.12 True
4300
- gpt_oss_experts cuda_B4_S1024_E4 13.22 True
4301
- gpt_oss_experts cuda_B4_S512_E2 6.64 True
4302
- gpt_oss_experts cuda_B4_S512_E4 7.30 True
4303
  </pre></div>
4304
  <div class="uv-install-logs" id="uv-logs-benchmark">
4305
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
@@ -4308,14 +4308,12 @@ gpt_oss_experts cuda_B4_S512_E4 7.30 True
4308
  Updated https://github.com/huggingface/kernels.git (55b7c980e96bf5f747f0e4136be61c0b089ab76c)
4309
  Building kernels @ git+https://github.com/huggingface/kernels.git@55b7c980e96bf5f747f0e4136be61c0b089ab76c
4310
  Built kernels @ git+https://github.com/huggingface/kernels.git@55b7c980e96bf5f747f0e4136be61c0b089ab76c
4311
- Installed 51 packages in 279ms
4312
  </div>
4313
  </div>
4314
- <div class="cell-stderr">Fetching 6 files: 0%| | 0/6 [00:00&lt;?, ?it/s]Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.
4315
-
4316
- Fetching 6 files: 17%|█▋ | 1/6 [00:00&lt;00:00, 5.68it/s]
4317
- Fetching 6 files: 50%|█████ | 3/6 [00:00&lt;00:00, 5.21it/s]
4318
- Fetching 6 files: 100%|██████████| 6/6 [00:00&lt;00:00, 10.50it/s]</div>
4319
  <div class="cell-artifacts">
4320
  <h4>Artifacts:</h4>
4321
  <a href="artifacts/benchmark/openai_moe.jsonl" class="artifact" target="_blank">openai_moe.jsonl</a>
 
3888
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3890
  </span> |
3891
+ Cell: nv | 0.28s
3892
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3894
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
 
3905
  </div>
3906
  </div>
3907
  <div id="output-nv" class="cell-output">
3908
+ <div class="cell-stdout"><pre class="stdout-text">Fri Dec 19 19:41:48 2025
3909
  +-----------------------------------------------------------------------------------------+
3910
  | NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 |
3911
  +-----------------------------------------+------------------------+----------------------+
 
3914
  | | | MIG M. |
3915
  |=========================================+========================+======================|
3916
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3917
+ | N/A 33C P0 126W / 350W | 0MiB / 46068MiB | 100% Default |
3918
  | | | N/A |
3919
  +-----------------------------------------+------------------------+----------------------+
3920
 
 
3938
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3939
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3940
  </span> |
3941
+ Cell: benchmark | 21.43s
3942
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3943
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3944
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
4042
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4043
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4044
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4045
+ gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 10.220ms 197.88% 10.220ms 10.220ms 1
4046
+ gpt_oss_experts 16.01% 2.006ms 99.94% 12.523ms 12.523ms 0.000us 0.00% 5.168ms 5.168ms 1
4047
+ aten::matmul 0.20% 24.744us 3.78% 473.582us 39.465us 0.000us 0.00% 4.543ms 378.565us 12
4048
+ aten::mm 2.31% 289.874us 3.58% 448.838us 37.403us 4.543ms 87.96% 4.543ms 378.565us 12
4049
+ ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 3.093ms 59.88% 3.093ms 343.626us 9
4050
+ void cutlass::Kernel2&lt;cutlass_80_simt_sgemm_128x64_8... 0.00% 0.000us 0.00% 0.000us 0.000us 1.444ms 27.95% 1.444ms 481.227us 3
4051
+ aten::mul 1.34% 167.604us 2.25% 281.908us 11.746us 108.865us 2.11% 108.865us 4.536us 24
4052
+ aten::add 1.61% 201.238us 3.79% 474.483us 26.360us 102.656us 1.99% 102.656us 5.703us 18
4053
+ aten::index 1.69% 212.259us 2.75% 345.042us 28.753us 88.512us 1.71% 88.512us 7.376us 12
4054
+ aten::index_add_ 0.46% 58.122us 0.75% 94.202us 15.700us 80.160us 1.55% 80.160us 13.360us 6
4055
+ void at::native::indexFuncLargeIndex&lt;float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 80.160us 1.55% 80.160us 13.360us 6
4056
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 80.000us 1.55% 80.000us 6.667us 12
4057
+ aten::nonzero 2.08% 261.099us 6.37% 797.848us 88.650us 65.246us 1.26% 76.095us 8.455us 9
4058
+ aten::clamp 0.95% 119.641us 1.55% 194.514us 16.209us 63.010us 1.22% 63.010us 5.251us 12
4059
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 63.010us 1.22% 63.010us 5.251us 12
4060
+ aten::where 0.06% 7.130us 5.02% 629.533us 104.922us 0.000us 0.00% 61.472us 10.245us 6
4061
+ aten::nonzero_numpy 0.09% 11.550us 4.97% 622.403us 103.734us 0.000us 0.00% 61.472us 10.245us 6
4062
+ void at::native::vectorized_gather_kernel&lt;16, long&gt;(... 0.00% 0.000us 0.00% 0.000us 0.000us 60.800us 1.18% 60.800us 10.133us 6
4063
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 56.608us 1.10% 56.608us 4.717us 12
4064
+ Memcpy DtoH (Device -&gt; Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 50.776us 0.98% 50.776us 1.128us 45
4065
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4066
+ Self CPU time total: 12.530ms
4067
+ Self CUDA time total: 5.165ms
4068
 
4069
 
4070
 
 
4074
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4075
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4076
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4077
+ gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 14.281ms 232.73% 14.281ms 14.281ms 1
4078
+ gpt_oss_experts 16.85% 2.763ms 99.97% 16.396ms 16.396ms 0.000us 0.00% 6.139ms 6.139ms 1
4079
+ aten::matmul 0.27% 44.470us 4.93% 808.156us 33.673us 0.000us 0.00% 5.322ms 221.756us 24
4080
+ aten::mm 2.81% 461.070us 4.66% 763.686us 31.820us 5.322ms 86.73% 5.322ms 221.756us 24
4081
+ ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 5.267ms 85.83% 5.267ms 219.440us 24
4082
+ aten::nonzero 2.44% 399.465us 7.84% 1.285ms 85.683us 115.131us 1.88% 137.882us 9.192us 15
4083
+ aten::mul 1.86% 305.625us 3.19% 523.892us 10.914us 131.841us 2.15% 131.841us 2.747us 48
4084
+ aten::add 2.10% 345.215us 3.57% 585.271us 16.258us 127.810us 2.08% 127.810us 3.550us 36
4085
+ aten::where 0.07% 10.792us 7.40% 1.214ms 101.132us 0.000us 0.00% 123.674us 10.306us 12
4086
+ aten::nonzero_numpy 0.13% 21.688us 7.33% 1.203ms 100.233us 0.000us 0.00% 123.674us 10.306us 12
4087
+ aten::index 2.22% 363.289us 3.85% 631.035us 26.293us 111.423us 1.82% 111.423us 4.643us 24
4088
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 101.762us 1.66% 101.762us 4.240us 24
4089
+ Memcpy DtoH (Device -&gt; Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 91.773us 1.50% 91.773us 1.055us 87
4090
+ aten::clamp 1.29% 211.324us 2.19% 359.818us 14.992us 88.222us 1.44% 88.222us 3.676us 24
4091
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 88.222us 1.44% 88.222us 3.676us 24
4092
+ aten::item 0.47% 77.138us 37.50% 6.150ms 85.417us 0.000us 0.00% 75.678us 1.051us 72
4093
+ aten::_local_scalar_dense 1.90% 311.363us 37.03% 6.073ms 84.345us 75.678us 1.23% 75.678us 1.051us 72
4094
+ aten::index_add_ 0.59% 96.073us 0.99% 162.304us 13.525us 70.526us 1.15% 70.526us 5.877us 12
4095
+ void at::native::indexFuncLargeIndex&lt;float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 70.526us 1.15% 70.526us 5.877us 12
4096
+ void at::native::vectorized_gather_kernel&lt;16, long&gt;(... 0.00% 0.000us 0.00% 0.000us 0.000us 66.368us 1.08% 66.368us 5.531us 12
4097
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4098
+ Self CPU time total: 16.401ms
4099
+ Self CUDA time total: 6.136ms
4100
 
4101
 
4102
 
 
4106
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4107
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4108
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4109
+ gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 12.623ms 150.27% 12.623ms 12.623ms 1
4110
+ gpt_oss_experts 13.47% 1.791ms 99.96% 13.283ms 13.283ms 0.000us 0.00% 8.405ms 8.405ms 1
4111
+ aten::matmul 0.18% 23.339us 3.36% 446.659us 37.222us 0.000us 0.00% 7.382ms 615.173us 12
4112
+ aten::mm 1.99% 264.803us 3.19% 423.320us 35.277us 7.382ms 87.88% 7.382ms 615.173us 12
4113
+ void cutlass::Kernel2&lt;cutlass_80_simt_sgemm_256x128_... 0.00% 0.000us 0.00% 0.000us 0.000us 4.494ms 53.50% 4.494ms 748.960us 6
4114
+ ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 1.479ms 17.61% 1.479ms 493.131us 3
4115
+ void cutlass::Kernel2&lt;cutlass_80_simt_sgemm_128x64_8... 0.00% 0.000us 0.00% 0.000us 0.000us 1.402ms 16.69% 1.402ms 467.413us 3
4116
+ aten::mul 1.17% 155.791us 2.03% 269.215us 11.217us 193.439us 2.30% 193.439us 8.060us 24
4117
+ aten::add 1.34% 178.665us 2.34% 311.318us 17.295us 184.286us 2.19% 184.286us 10.238us 18
4118
+ aten::index_add_ 0.37% 48.760us 0.64% 85.661us 14.277us 167.358us 1.99% 167.358us 27.893us 6
4119
+ void at::native::indexFuncLargeIndex&lt;float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 167.358us 1.99% 167.358us 27.893us 6
4120
+ aten::index 1.43% 189.705us 2.42% 321.187us 26.766us 146.945us 1.75% 146.945us 12.245us 12
4121
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 145.824us 1.74% 145.824us 12.152us 12
4122
+ void at::native::vectorized_gather_kernel&lt;16, long&gt;(... 0.00% 0.000us 0.00% 0.000us 0.000us 116.832us 1.39% 116.832us 19.472us 6
4123
+ aten::clamp 0.82% 108.995us 1.40% 185.495us 15.458us 109.284us 1.30% 109.284us 9.107us 12
4124
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 109.284us 1.30% 109.284us 9.107us 12
4125
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 103.135us 1.23% 103.135us 8.595us 12
4126
+ aten::nonzero 1.83% 243.374us 5.76% 765.236us 85.026us 70.402us 0.84% 81.794us 9.088us 9
4127
+ aten::where 0.04% 5.651us 4.63% 615.153us 102.525us 0.000us 0.00% 66.851us 11.142us 6
4128
+ aten::nonzero_numpy 0.08% 11.009us 4.59% 609.502us 101.584us 0.000us 0.00% 66.851us 11.142us 6
4129
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4130
+ Self CPU time total: 13.289ms
4131
+ Self CUDA time total: 8.400ms
4132
 
4133
 
4134
 
 
4138
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4139
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4140
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4141
+ gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 18.138ms 172.84% 18.138ms 18.138ms 1
4142
+ gpt_oss_experts 12.76% 2.622ms 99.97% 20.540ms 20.540ms 0.000us 0.00% 10.500ms 10.500ms 1
4143
+ aten::matmul 0.22% 44.749us 4.11% 844.232us 35.176us 0.000us 0.00% 9.224ms 384.346us 24
4144
+ aten::mm 2.32% 476.088us 3.89% 799.483us 33.312us 9.224ms 87.90% 9.224ms 384.346us 24
4145
+ ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 6.287ms 59.90% 6.287ms 349.259us 18
4146
+ void cutlass::Kernel2&lt;cutlass_80_simt_sgemm_128x64_8... 0.00% 0.000us 0.00% 0.000us 0.000us 2.925ms 27.87% 2.925ms 487.438us 6
4147
+ aten::mul 1.51% 311.093us 2.62% 538.833us 11.226us 229.793us 2.19% 229.793us 4.787us 48
4148
+ aten::add 1.68% 344.530us 2.88% 592.257us 16.452us 211.009us 2.01% 211.009us 5.861us 36
4149
+ aten::index 1.75% 359.041us 3.02% 619.685us 25.820us 205.054us 1.95% 205.054us 8.544us 24
4150
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 164.639us 1.57% 164.639us 6.860us 24
4151
+ aten::index_add_ 0.48% 97.780us 0.85% 174.953us 14.579us 157.631us 1.50% 157.631us 13.136us 12
4152
+ void at::native::indexFuncLargeIndex&lt;float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 157.631us 1.50% 157.631us 13.136us 12
4153
+ aten::nonzero 1.89% 388.553us 6.17% 1.268ms 84.506us 122.654us 1.17% 146.847us 9.790us 15
4154
+ void at::native::vectorized_gather_kernel&lt;16, long&gt;(... 0.00% 0.000us 0.00% 0.000us 0.000us 145.663us 1.39% 145.663us 12.139us 12
4155
+ aten::where 0.05% 10.471us 5.79% 1.190ms 99.134us 0.000us 0.00% 132.128us 11.011us 12
4156
+ aten::nonzero_numpy 0.10% 21.340us 5.74% 1.179ms 98.262us 0.000us 0.00% 132.128us 11.011us 12
4157
+ aten::clamp 1.02% 209.010us 1.74% 358.311us 14.930us 131.327us 1.25% 131.327us 5.472us 24
4158
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 131.327us 1.25% 131.327us 5.472us 24
4159
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 117.601us 1.12% 117.601us 4.900us 24
4160
+ Memcpy DtoH (Device -&gt; Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 108.253us 1.03% 108.253us 1.244us 87
4161
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4162
+ Self CPU time total: 20.546ms
4163
+ Self CUDA time total: 10.495ms
4164
 
4165
 
4166
 
 
4170
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4171
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4172
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4173
+ gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 20.935ms 121.00% 20.935ms 20.935ms 1
4174
+ gpt_oss_experts 7.61% 1.780ms 99.98% 23.376ms 23.376ms 0.000us 0.00% 17.312ms 17.312ms 1
4175
+ aten::matmul 0.10% 23.122us 1.96% 458.772us 38.231us 0.000us 0.00% 14.468ms 1.206ms 12
4176
+ aten::mm 1.15% 269.268us 1.86% 435.650us 36.304us 14.468ms 83.62% 14.468ms 1.206ms 12
4177
+ void cutlass::Kernel2&lt;cutlass_80_simt_sgemm_256x128_... 0.00% 0.000us 0.00% 0.000us 0.000us 8.827ms 51.02% 8.827ms 1.471ms 6
4178
+ ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 5.632ms 32.55% 5.632ms 938.689us 6
4179
+ aten::add 0.79% 184.599us 1.36% 318.590us 17.699us 771.593us 4.46% 771.593us 42.866us 18
4180
+ aten::mul 0.68% 158.205us 1.17% 272.787us 11.366us 648.706us 3.75% 648.706us 27.029us 24
4181
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 492.134us 2.84% 492.134us 41.011us 12
4182
+ aten::index_add_ 0.22% 51.621us 0.39% 91.292us 15.215us 449.187us 2.60% 449.187us 74.864us 6
4183
+ void at::native::indexFuncLargeIndex&lt;float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 449.187us 2.60% 449.187us 74.864us 6
4184
+ aten::clamp 0.47% 109.062us 0.80% 186.384us 15.532us 328.069us 1.90% 328.069us 27.339us 12
4185
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 328.069us 1.90% 328.069us 27.339us 12
4186
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 298.432us 1.72% 298.432us 49.739us 6
4187
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 279.459us 1.62% 279.459us 46.576us 6
4188
+ aten::index 0.79% 185.644us 1.37% 320.365us 26.697us 259.362us 1.50% 259.362us 21.614us 12
4189
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 252.002us 1.46% 252.002us 21.000us 12
4190
+ void at::native::vectorized_gather_kernel&lt;16, long&gt;(... 0.00% 0.000us 0.00% 0.000us 0.000us 226.817us 1.31% 226.817us 37.803us 6
4191
+ aten::sigmoid 0.16% 37.651us 0.31% 72.093us 12.016us 177.249us 1.02% 177.249us 29.542us 6
4192
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 177.249us 1.02% 177.249us 29.542us 6
4193
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4194
+ Self CPU time total: 23.381ms
4195
+ Self CUDA time total: 17.302ms
4196
 
4197
 
4198
 
 
4202
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4203
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4204
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4205
+ gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 24.710ms 141.76% 24.710ms 24.710ms 1
4206
+ gpt_oss_experts 10.14% 2.749ms 99.98% 27.106ms 27.106ms 0.000us 0.00% 17.441ms 17.441ms 1
4207
+ aten::matmul 0.17% 45.968us 3.40% 922.464us 38.436us 0.000us 0.00% 15.230ms 634.586us 24
4208
+ aten::mm 2.05% 556.479us 3.23% 876.496us 36.521us 15.230ms 87.37% 15.230ms 634.586us 24
4209
+ void cutlass::Kernel2&lt;cutlass_80_simt_sgemm_256x128_... 0.00% 0.000us 0.00% 0.000us 0.000us 9.172ms 52.62% 9.172ms 764.334us 12
4210
+ ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 3.147ms 18.05% 3.147ms 524.452us 6
4211
+ void cutlass::Kernel2&lt;cutlass_80_simt_sgemm_128x64_8... 0.00% 0.000us 0.00% 0.000us 0.000us 2.898ms 16.62% 2.898ms 482.943us 6
4212
+ aten::add 1.29% 350.116us 2.26% 613.465us 17.041us 420.321us 2.41% 420.321us 11.676us 36
4213
+ aten::mul 1.13% 307.419us 1.97% 533.015us 11.104us 413.571us 2.37% 413.571us 8.616us 48
4214
+ aten::index_add_ 0.36% 98.853us 0.63% 169.455us 14.121us 380.323us 2.18% 380.323us 31.694us 12
4215
+ void at::native::indexFuncLargeIndex&lt;float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 380.323us 2.18% 380.323us 31.694us 12
4216
+ aten::index 1.34% 364.187us 2.36% 638.760us 26.615us 342.626us 1.97% 342.626us 14.276us 24
4217
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 337.185us 1.93% 337.185us 14.049us 24
4218
+ void at::native::vectorized_gather_kernel&lt;16, long&gt;(... 0.00% 0.000us 0.00% 0.000us 0.000us 278.754us 1.60% 278.754us 23.230us 12
4219
+ aten::clamp 0.81% 219.710us 1.37% 372.721us 15.530us 226.367us 1.30% 226.367us 9.432us 24
4220
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 226.367us 1.30% 226.367us 9.432us 24
4221
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 219.298us 1.26% 219.298us 9.137us 24
4222
+ aten::nonzero 1.48% 402.204us 4.91% 1.331ms 88.732us 129.571us 0.74% 155.747us 10.383us 15
4223
+ aten::where 0.04% 10.572us 4.67% 1.267ms 105.600us 0.000us 0.00% 139.970us 11.664us 12
4224
+ aten::nonzero_numpy 0.08% 21.969us 4.64% 1.257ms 104.719us 0.000us 0.00% 139.970us 11.664us 12
4225
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4226
+ Self CPU time total: 27.112ms
4227
+ Self CUDA time total: 17.431ms
4228
 
4229
 
4230
 
 
4234
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4235
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4236
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4237
+ gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 40.438ms 109.96% 40.438ms 40.438ms 1
4238
+ gpt_oss_experts 4.40% 1.882ms 99.82% 42.728ms 42.728ms 0.000us 0.00% 36.808ms 36.808ms 1
4239
+ aten::matmul 0.05% 22.249us 1.02% 438.421us 36.535us 0.000us 0.00% 26.813ms 2.234ms 12
4240
+ aten::mm 0.66% 281.965us 0.97% 416.172us 34.681us 26.813ms 72.91% 26.813ms 2.234ms 12
4241
+ void cutlass::Kernel2&lt;cutlass_80_simt_sgemm_256x128_... 0.00% 0.000us 0.00% 0.000us 0.000us 26.809ms 72.90% 26.809ms 2.234ms 12
4242
+ aten::mul 0.40% 169.436us 0.68% 291.368us 12.140us 2.973ms 8.09% 2.973ms 123.894us 24
4243
+ aten::add 0.45% 194.095us 1.09% 466.694us 25.927us 2.399ms 6.52% 2.399ms 133.270us 18
4244
+ aten::clamp 0.28% 118.373us 0.48% 205.484us 17.124us 2.385ms 6.49% 2.385ms 198.780us 12
4245
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 2.385ms 6.49% 2.385ms 198.780us 12
4246
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 1.983ms 5.39% 1.983ms 165.284us 12
4247
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 1.623ms 4.41% 1.623ms 135.241us 12
4248
+ aten::index_add_ 0.12% 50.121us 0.21% 88.453us 14.742us 929.513us 2.53% 929.513us 154.919us 6
4249
+ void at::native::indexFuncLargeIndex&lt;float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 929.513us 2.53% 929.513us 154.919us 6
4250
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 775.973us 2.11% 775.973us 129.329us 6
4251
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 743.622us 2.02% 743.622us 123.937us 6
4252
+ aten::index 0.44% 190.163us 0.78% 332.417us 27.701us 705.798us 1.92% 705.798us 58.816us 12
4253
+ void at::native::vectorized_gather_kernel&lt;16, long&gt;(... 0.00% 0.000us 0.00% 0.000us 0.000us 672.133us 1.83% 672.133us 112.022us 6
4254
+ aten::sigmoid 0.10% 42.342us 0.17% 71.992us 11.999us 317.635us 0.86% 317.635us 52.939us 6
4255
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 317.635us 0.86% 317.635us 52.939us 6
4256
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 246.434us 0.67% 246.434us 41.072us 6
4257
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4258
+ Self CPU time total: 42.805ms
4259
+ Self CUDA time total: 36.776ms
4260
 
4261
 
4262
 
 
4266
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4267
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4268
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4269
+ gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 40.917ms 118.34% 40.917ms 40.917ms 1
4270
+ gpt_oss_experts 6.54% 2.832ms 99.99% 43.320ms 43.320ms 0.000us 0.00% 34.594ms 34.594ms 1
4271
+ aten::matmul 0.11% 46.003us 2.16% 933.683us 38.903us 0.000us 0.00% 28.640ms 1.193ms 24
4272
+ aten::mm 1.27% 551.595us 2.05% 887.680us 36.987us 28.640ms 82.83% 28.640ms 1.193ms 24
4273
+ void cutlass::Kernel2&lt;cutlass_80_simt_sgemm_256x128_... 0.00% 0.000us 0.00% 0.000us 0.000us 20.238ms 58.53% 20.238ms 1.349ms 15
4274
+ ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 8.385ms 24.25% 8.385ms 931.701us 9
4275
+ aten::add 0.85% 367.713us 1.47% 637.625us 17.712us 1.485ms 4.30% 1.485ms 41.254us 36
4276
+ aten::mul 0.73% 317.651us 1.28% 554.606us 11.554us 1.368ms 3.96% 1.368ms 28.495us 48
4277
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 932.164us 2.70% 932.164us 38.840us 24
4278
+ aten::index_add_ 0.23% 99.030us 0.39% 170.492us 14.208us 912.225us 2.64% 912.225us 76.019us 12
4279
+ void at::native::indexFuncLargeIndex&lt;float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 912.225us 2.64% 912.225us 76.019us 12
4280
+ aten::clamp 0.52% 223.402us 0.90% 389.994us 16.250us 772.775us 2.24% 772.775us 32.199us 24
4281
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 772.775us 2.24% 772.775us 32.199us 24
4282
+ aten::index 0.84% 365.911us 1.48% 641.837us 26.743us 652.128us 1.89% 652.128us 27.172us 24
4283
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 646.273us 1.87% 646.273us 53.856us 12
4284
+ void at::native::vectorized_gather_kernel&lt;16, long&gt;(... 0.00% 0.000us 0.00% 0.000us 0.000us 582.113us 1.68% 582.113us 48.509us 12
4285
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 552.993us 1.60% 552.993us 46.083us 12
4286
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 519.810us 1.50% 519.810us 21.659us 24
4287
+ aten::sigmoid 0.18% 79.593us 0.31% 135.883us 11.324us 361.471us 1.05% 361.471us 30.123us 12
4288
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 361.471us 1.05% 361.471us 30.123us 12
4289
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4290
+ Self CPU time total: 43.326ms
4291
+ Self CUDA time total: 34.575ms
4292
 
4293
 
4294
  impl wl p50(ms) ok
4295
+ gpt_oss_experts cuda_B1_S1024_E2 3.85 True
4296
+ gpt_oss_experts cuda_B1_S1024_E4 5.31 True
4297
+ gpt_oss_experts cuda_B1_S512_E2 2.63 True
4298
+ gpt_oss_experts cuda_B1_S512_E4 3.93 True
4299
+ gpt_oss_experts cuda_B4_S1024_E2 13.24 True
4300
+ gpt_oss_experts cuda_B4_S1024_E4 13.36 True
4301
+ gpt_oss_experts cuda_B4_S512_E2 6.72 True
4302
+ gpt_oss_experts cuda_B4_S512_E4 7.52 True
4303
  </pre></div>
4304
  <div class="uv-install-logs" id="uv-logs-benchmark">
4305
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 
4308
  Updated https://github.com/huggingface/kernels.git (55b7c980e96bf5f747f0e4136be61c0b089ab76c)
4309
  Building kernels @ git+https://github.com/huggingface/kernels.git@55b7c980e96bf5f747f0e4136be61c0b089ab76c
4310
  Built kernels @ git+https://github.com/huggingface/kernels.git@55b7c980e96bf5f747f0e4136be61c0b089ab76c
4311
+ Installed 14 packages in 3ms
4312
  </div>
4313
  </div>
4314
+ <div class="cell-stderr">Fetching 6 files: 0%| | 0/6 [00:00&lt;?, ?it/s]
4315
+ Fetching 6 files: 50%|█████ | 3/6 [00:00&lt;00:00, 6.07it/s]
4316
+ Fetching 6 files: 100%|██████████| 6/6 [00:00&lt;00:00, 12.14it/s]</div>
 
 
4317
  <div class="cell-artifacts">
4318
  <h4>Artifacts:</h4>
4319
  <a href="artifacts/benchmark/openai_moe.jsonl" class="artifact" target="_blank">openai_moe.jsonl</a>
openai_moe/results/artifacts/combine/latency.svg CHANGED

Git LFS Details

  • SHA256: 7db8d527515e46144ea7b8f5f5738602c070e5f806c682f79f8cd000058b9bc5
  • Pointer size: 130 Bytes
  • Size of remote file: 20.3 kB

Git LFS Details

  • SHA256: 3090485b23d0740dc54ec975ab4d53494c6243ac5b87df898966ffdc9bc67256
  • Pointer size: 130 Bytes
  • Size of remote file: 20.3 kB
openai_moe/results/combined_results.html CHANGED
@@ -3889,7 +3889,7 @@ body[data-tool="eraser"] .main-content {
3889
  <rdf:RDF>
3890
  <ns2:Work>
3891
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
3892
- <dc:date>2025-12-19T19:10:00.094905</dc:date>
3893
  <dc:format>image/svg+xml</dc:format>
3894
  <dc:creator>
3895
  <ns2:Agent>
@@ -4025,96 +4025,96 @@ body[data-tool="eraser"] .main-content {
4025
  <g id="matplotlib.axis_2">
4026
  <g id="ytick_1">
4027
  <g id="grid-y--2" class="grid grid-y">
4028
- <path d="M 57.17 448.88374 L 845.766818 448.88374 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4029
  </g>
4030
  <g id="line2d_9">
4031
  <defs>
4032
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4033
  </defs>
4034
  <g>
4035
- <use ns4:href="#m0fca2865ba" x="57.17" y="448.88374" style="stroke: #000000; stroke-width: 0.8" />
4036
  </g>
4037
  </g>
4038
  <g id="text_9">
4039
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="452.682959" transform="rotate(-0 50.17 452.682959)">0</text>
4040
  </g>
4041
  </g>
4042
  <g id="ytick_2">
4043
  <g id="grid-y--3" class="grid grid-y">
4044
- <path d="M 57.17 388.304965 L 845.766818 388.304965 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4045
  </g>
4046
  <g id="line2d_10">
4047
  <g>
4048
- <use ns4:href="#m0fca2865ba" x="57.17" y="388.304965" style="stroke: #000000; stroke-width: 0.8" />
4049
  </g>
4050
  </g>
4051
  <g id="text_10">
4052
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="392.104184" transform="rotate(-0 50.17 392.104184)">250</text>
4053
  </g>
4054
  </g>
4055
  <g id="ytick_3">
4056
  <g id="grid-y--4" class="grid grid-y">
4057
- <path d="M 57.17 327.726191 L 845.766818 327.726191 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4058
  </g>
4059
  <g id="line2d_11">
4060
  <g>
4061
- <use ns4:href="#m0fca2865ba" x="57.17" y="327.726191" style="stroke: #000000; stroke-width: 0.8" />
4062
  </g>
4063
  </g>
4064
  <g id="text_11">
4065
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="331.52541" transform="rotate(-0 50.17 331.52541)">500</text>
4066
  </g>
4067
  </g>
4068
  <g id="ytick_4">
4069
  <g id="grid-y--5" class="grid grid-y">
4070
- <path d="M 57.17 267.147416 L 845.766818 267.147416 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4071
  </g>
4072
  <g id="line2d_12">
4073
  <g>
4074
- <use ns4:href="#m0fca2865ba" x="57.17" y="267.147416" style="stroke: #000000; stroke-width: 0.8" />
4075
  </g>
4076
  </g>
4077
  <g id="text_12">
4078
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="270.946635" transform="rotate(-0 50.17 270.946635)">750</text>
4079
  </g>
4080
  </g>
4081
  <g id="ytick_5">
4082
  <g id="grid-y--6" class="grid grid-y">
4083
- <path d="M 57.17 206.568642 L 845.766818 206.568642 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4084
  </g>
4085
  <g id="line2d_13">
4086
  <g>
4087
- <use ns4:href="#m0fca2865ba" x="57.17" y="206.568642" style="stroke: #000000; stroke-width: 0.8" />
4088
  </g>
4089
  </g>
4090
  <g id="text_13">
4091
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="210.367861" transform="rotate(-0 50.17 210.367861)">1000</text>
4092
  </g>
4093
  </g>
4094
  <g id="ytick_6">
4095
  <g id="grid-y--7" class="grid grid-y">
4096
- <path d="M 57.17 145.989867 L 845.766818 145.989867 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4097
  </g>
4098
  <g id="line2d_14">
4099
  <g>
4100
- <use ns4:href="#m0fca2865ba" x="57.17" y="145.989867" style="stroke: #000000; stroke-width: 0.8" />
4101
  </g>
4102
  </g>
4103
  <g id="text_14">
4104
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="149.789086" transform="rotate(-0 50.17 149.789086)">1250</text>
4105
  </g>
4106
  </g>
4107
  <g id="ytick_7">
4108
  <g id="grid-y--8" class="grid grid-y">
4109
- <path d="M 57.17 85.411093 L 845.766818 85.411093 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4110
  </g>
4111
  <g id="line2d_15">
4112
  <g>
4113
- <use ns4:href="#m0fca2865ba" x="57.17" y="85.411093" style="stroke: #000000; stroke-width: 0.8" />
4114
  </g>
4115
  </g>
4116
  <g id="text_15">
4117
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="89.210312" transform="rotate(-0 50.17 89.210312)">1500</text>
4118
  </g>
4119
  </g>
4120
  <g id="label--y" class="ylabel">
@@ -4122,35 +4122,35 @@ body[data-tool="eraser"] .main-content {
4122
  </g>
4123
  </g>
4124
  <g id="series--binned-torch" class="series">
4125
- <path d="M 93.01531 410.663437 L 195.430481 399.252405 L 297.845652 356.001516 L 400.260823 346.76844 L 502.675995 261.404682 L 605.091166 245.335532 L 707.506337 82.088892 L 809.921508 46.94533 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4126
  <defs>
4127
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4128
  </defs>
4129
  <g clip-path="url(#p5307ca50d8)">
4130
- <use ns4:href="#md7efaf3aec" x="93.01531" y="410.663437" style="fill: #1f77b4; stroke: #1f77b4" />
4131
- <use ns4:href="#md7efaf3aec" x="195.430481" y="399.252405" style="fill: #1f77b4; stroke: #1f77b4" />
4132
- <use ns4:href="#md7efaf3aec" x="297.845652" y="356.001516" style="fill: #1f77b4; stroke: #1f77b4" />
4133
- <use ns4:href="#md7efaf3aec" x="400.260823" y="346.76844" style="fill: #1f77b4; stroke: #1f77b4" />
4134
- <use ns4:href="#md7efaf3aec" x="502.675995" y="261.404682" style="fill: #1f77b4; stroke: #1f77b4" />
4135
- <use ns4:href="#md7efaf3aec" x="605.091166" y="245.335532" style="fill: #1f77b4; stroke: #1f77b4" />
4136
- <use ns4:href="#md7efaf3aec" x="707.506337" y="82.088892" style="fill: #1f77b4; stroke: #1f77b4" />
4137
  <use ns4:href="#md7efaf3aec" x="809.921508" y="46.94533" style="fill: #1f77b4; stroke: #1f77b4" />
4138
  </g>
4139
  </g>
4140
  <g id="series--gpt-oss-experts" class="series">
4141
- <path d="M 93.01531 448.251939 L 195.430481 447.950554 L 297.845652 447.969402 L 400.260823 447.623067 L 502.675995 447.274263 L 605.091166 447.115423 L 707.506337 445.70441 L 809.921508 445.680583 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4142
  <defs>
4143
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4144
  </defs>
4145
  <g clip-path="url(#p5307ca50d8)">
4146
  <use ns4:href="#m9b8c54d372" x="93.01531" y="448.251939" style="fill: #ff7f0e; stroke: #ff7f0e" />
4147
- <use ns4:href="#m9b8c54d372" x="195.430481" y="447.950554" style="fill: #ff7f0e; stroke: #ff7f0e" />
4148
- <use ns4:href="#m9b8c54d372" x="297.845652" y="447.969402" style="fill: #ff7f0e; stroke: #ff7f0e" />
4149
- <use ns4:href="#m9b8c54d372" x="400.260823" y="447.623067" style="fill: #ff7f0e; stroke: #ff7f0e" />
4150
- <use ns4:href="#m9b8c54d372" x="502.675995" y="447.274263" style="fill: #ff7f0e; stroke: #ff7f0e" />
4151
- <use ns4:href="#m9b8c54d372" x="605.091166" y="447.115423" style="fill: #ff7f0e; stroke: #ff7f0e" />
4152
- <use ns4:href="#m9b8c54d372" x="707.506337" y="445.70441" style="fill: #ff7f0e; stroke: #ff7f0e" />
4153
- <use ns4:href="#m9b8c54d372" x="809.921508" y="445.680583" style="fill: #ff7f0e; stroke: #ff7f0e" />
4154
  </g>
4155
  </g>
4156
  <g id="patch_3">
@@ -4208,7 +4208,7 @@ body[data-tool="eraser"] .main-content {
4208
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
4209
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
4210
  </span> |
4211
- Cell: combine | 4.53s
4212
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
4213
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
4214
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
@@ -4297,22 +4297,22 @@ Summary: 2 found, 0 skipped, 0 missing
4297
  COMBINED BENCHMARK SUMMARY
4298
 
4299
  impl wl p50(ms) ok
4300
- binned_torch cuda_B1_S1024_E2 383.31 True
4301
- binned_torch cuda_B1_S1024_E4 421.42 True
4302
- binned_torch cuda_B1_S512_E2 157.73 True
4303
- binned_torch cuda_B1_S512_E4 204.82 True
4304
- binned_torch cuda_B4_S1024_E2 1513.71 True
4305
- binned_torch cuda_B4_S1024_E4 1658.74 True
4306
- binned_torch cuda_B4_S512_E2 773.70 True
4307
- binned_torch cuda_B4_S512_E4 840.01 True
4308
- gpt_oss_experts cuda_B1_S1024_E2 3.77 True
4309
- gpt_oss_experts cuda_B1_S1024_E4 5.20 True
4310
- gpt_oss_experts cuda_B1_S512_E2 2.61 True
4311
- gpt_oss_experts cuda_B1_S512_E4 3.85 True
4312
- gpt_oss_experts cuda_B4_S1024_E2 13.12 True
4313
- gpt_oss_experts cuda_B4_S1024_E4 13.22 True
4314
- gpt_oss_experts cuda_B4_S512_E2 6.64 True
4315
- gpt_oss_experts cuda_B4_S512_E4 7.30 True
4316
 
4317
  GENERATING COMBINED VISUALIZATION
4318
 
@@ -4332,7 +4332,7 @@ Implementations included:
4332
  <div class="uv-install-logs" id="uv-logs-combine">
4333
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4334
  <div class="uv-logs-content" style="display: none;">
4335
- Installed 37 packages in 270ms
4336
  </div>
4337
  </div>
4338
  <div class="cell-artifacts">
@@ -4345,7 +4345,7 @@ Installed 37 packages in 270ms
4345
  <rdf:RDF>
4346
  <ns2:Work>
4347
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4348
- <dc:date>2025-12-19T19:10:00.094905</dc:date>
4349
  <dc:format>image/svg+xml</dc:format>
4350
  <dc:creator>
4351
  <ns2:Agent>
@@ -4481,96 +4481,96 @@ Installed 37 packages in 270ms
4481
  <g id="matplotlib.axis_2">
4482
  <g id="ytick_1">
4483
  <g id="grid-y--2" class="grid grid-y">
4484
- <path d="M 57.17 448.88374 L 845.766818 448.88374 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4485
  </g>
4486
  <g id="line2d_9">
4487
  <defs>
4488
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4489
  </defs>
4490
  <g>
4491
- <use ns4:href="#m0fca2865ba" x="57.17" y="448.88374" style="stroke: #000000; stroke-width: 0.8" />
4492
  </g>
4493
  </g>
4494
  <g id="text_9">
4495
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="452.682959" transform="rotate(-0 50.17 452.682959)">0</text>
4496
  </g>
4497
  </g>
4498
  <g id="ytick_2">
4499
  <g id="grid-y--3" class="grid grid-y">
4500
- <path d="M 57.17 388.304965 L 845.766818 388.304965 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4501
  </g>
4502
  <g id="line2d_10">
4503
  <g>
4504
- <use ns4:href="#m0fca2865ba" x="57.17" y="388.304965" style="stroke: #000000; stroke-width: 0.8" />
4505
  </g>
4506
  </g>
4507
  <g id="text_10">
4508
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="392.104184" transform="rotate(-0 50.17 392.104184)">250</text>
4509
  </g>
4510
  </g>
4511
  <g id="ytick_3">
4512
  <g id="grid-y--4" class="grid grid-y">
4513
- <path d="M 57.17 327.726191 L 845.766818 327.726191 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4514
  </g>
4515
  <g id="line2d_11">
4516
  <g>
4517
- <use ns4:href="#m0fca2865ba" x="57.17" y="327.726191" style="stroke: #000000; stroke-width: 0.8" />
4518
  </g>
4519
  </g>
4520
  <g id="text_11">
4521
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="331.52541" transform="rotate(-0 50.17 331.52541)">500</text>
4522
  </g>
4523
  </g>
4524
  <g id="ytick_4">
4525
  <g id="grid-y--5" class="grid grid-y">
4526
- <path d="M 57.17 267.147416 L 845.766818 267.147416 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4527
  </g>
4528
  <g id="line2d_12">
4529
  <g>
4530
- <use ns4:href="#m0fca2865ba" x="57.17" y="267.147416" style="stroke: #000000; stroke-width: 0.8" />
4531
  </g>
4532
  </g>
4533
  <g id="text_12">
4534
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="270.946635" transform="rotate(-0 50.17 270.946635)">750</text>
4535
  </g>
4536
  </g>
4537
  <g id="ytick_5">
4538
  <g id="grid-y--6" class="grid grid-y">
4539
- <path d="M 57.17 206.568642 L 845.766818 206.568642 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4540
  </g>
4541
  <g id="line2d_13">
4542
  <g>
4543
- <use ns4:href="#m0fca2865ba" x="57.17" y="206.568642" style="stroke: #000000; stroke-width: 0.8" />
4544
  </g>
4545
  </g>
4546
  <g id="text_13">
4547
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="210.367861" transform="rotate(-0 50.17 210.367861)">1000</text>
4548
  </g>
4549
  </g>
4550
  <g id="ytick_6">
4551
  <g id="grid-y--7" class="grid grid-y">
4552
- <path d="M 57.17 145.989867 L 845.766818 145.989867 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4553
  </g>
4554
  <g id="line2d_14">
4555
  <g>
4556
- <use ns4:href="#m0fca2865ba" x="57.17" y="145.989867" style="stroke: #000000; stroke-width: 0.8" />
4557
  </g>
4558
  </g>
4559
  <g id="text_14">
4560
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="149.789086" transform="rotate(-0 50.17 149.789086)">1250</text>
4561
  </g>
4562
  </g>
4563
  <g id="ytick_7">
4564
  <g id="grid-y--8" class="grid grid-y">
4565
- <path d="M 57.17 85.411093 L 845.766818 85.411093 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4566
  </g>
4567
  <g id="line2d_15">
4568
  <g>
4569
- <use ns4:href="#m0fca2865ba" x="57.17" y="85.411093" style="stroke: #000000; stroke-width: 0.8" />
4570
  </g>
4571
  </g>
4572
  <g id="text_15">
4573
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="89.210312" transform="rotate(-0 50.17 89.210312)">1500</text>
4574
  </g>
4575
  </g>
4576
  <g id="label--y" class="ylabel">
@@ -4578,35 +4578,35 @@ Installed 37 packages in 270ms
4578
  </g>
4579
  </g>
4580
  <g id="series--binned-torch" class="series">
4581
- <path d="M 93.01531 410.663437 L 195.430481 399.252405 L 297.845652 356.001516 L 400.260823 346.76844 L 502.675995 261.404682 L 605.091166 245.335532 L 707.506337 82.088892 L 809.921508 46.94533 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4582
  <defs>
4583
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4584
  </defs>
4585
  <g clip-path="url(#p5307ca50d8)">
4586
- <use ns4:href="#md7efaf3aec" x="93.01531" y="410.663437" style="fill: #1f77b4; stroke: #1f77b4" />
4587
- <use ns4:href="#md7efaf3aec" x="195.430481" y="399.252405" style="fill: #1f77b4; stroke: #1f77b4" />
4588
- <use ns4:href="#md7efaf3aec" x="297.845652" y="356.001516" style="fill: #1f77b4; stroke: #1f77b4" />
4589
- <use ns4:href="#md7efaf3aec" x="400.260823" y="346.76844" style="fill: #1f77b4; stroke: #1f77b4" />
4590
- <use ns4:href="#md7efaf3aec" x="502.675995" y="261.404682" style="fill: #1f77b4; stroke: #1f77b4" />
4591
- <use ns4:href="#md7efaf3aec" x="605.091166" y="245.335532" style="fill: #1f77b4; stroke: #1f77b4" />
4592
- <use ns4:href="#md7efaf3aec" x="707.506337" y="82.088892" style="fill: #1f77b4; stroke: #1f77b4" />
4593
  <use ns4:href="#md7efaf3aec" x="809.921508" y="46.94533" style="fill: #1f77b4; stroke: #1f77b4" />
4594
  </g>
4595
  </g>
4596
  <g id="series--gpt-oss-experts" class="series">
4597
- <path d="M 93.01531 448.251939 L 195.430481 447.950554 L 297.845652 447.969402 L 400.260823 447.623067 L 502.675995 447.274263 L 605.091166 447.115423 L 707.506337 445.70441 L 809.921508 445.680583 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4598
  <defs>
4599
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4600
  </defs>
4601
  <g clip-path="url(#p5307ca50d8)">
4602
  <use ns4:href="#m9b8c54d372" x="93.01531" y="448.251939" style="fill: #ff7f0e; stroke: #ff7f0e" />
4603
- <use ns4:href="#m9b8c54d372" x="195.430481" y="447.950554" style="fill: #ff7f0e; stroke: #ff7f0e" />
4604
- <use ns4:href="#m9b8c54d372" x="297.845652" y="447.969402" style="fill: #ff7f0e; stroke: #ff7f0e" />
4605
- <use ns4:href="#m9b8c54d372" x="400.260823" y="447.623067" style="fill: #ff7f0e; stroke: #ff7f0e" />
4606
- <use ns4:href="#m9b8c54d372" x="502.675995" y="447.274263" style="fill: #ff7f0e; stroke: #ff7f0e" />
4607
- <use ns4:href="#m9b8c54d372" x="605.091166" y="447.115423" style="fill: #ff7f0e; stroke: #ff7f0e" />
4608
- <use ns4:href="#m9b8c54d372" x="707.506337" y="445.70441" style="fill: #ff7f0e; stroke: #ff7f0e" />
4609
- <use ns4:href="#m9b8c54d372" x="809.921508" y="445.680583" style="fill: #ff7f0e; stroke: #ff7f0e" />
4610
  </g>
4611
  </g>
4612
  <g id="patch_3">
 
3889
  <rdf:RDF>
3890
  <ns2:Work>
3891
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
3892
+ <dc:date>2025-12-19T19:55:39.293722</dc:date>
3893
  <dc:format>image/svg+xml</dc:format>
3894
  <dc:creator>
3895
  <ns2:Agent>
 
4025
  <g id="matplotlib.axis_2">
4026
  <g id="ytick_1">
4027
  <g id="grid-y--2" class="grid grid-y">
4028
+ <path d="M 57.17 448.894453 L 845.766818 448.894453 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4029
  </g>
4030
  <g id="line2d_9">
4031
  <defs>
4032
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4033
  </defs>
4034
  <g>
4035
+ <use ns4:href="#m0fca2865ba" x="57.17" y="448.894453" style="stroke: #000000; stroke-width: 0.8" />
4036
  </g>
4037
  </g>
4038
  <g id="text_9">
4039
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="452.693672" transform="rotate(-0 50.17 452.693672)">0</text>
4040
  </g>
4041
  </g>
4042
  <g id="ytick_2">
4043
  <g id="grid-y--3" class="grid grid-y">
4044
+ <path d="M 57.17 387.738866 L 845.766818 387.738866 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4045
  </g>
4046
  <g id="line2d_10">
4047
  <g>
4048
+ <use ns4:href="#m0fca2865ba" x="57.17" y="387.738866" style="stroke: #000000; stroke-width: 0.8" />
4049
  </g>
4050
  </g>
4051
  <g id="text_10">
4052
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="391.538085" transform="rotate(-0 50.17 391.538085)">250</text>
4053
  </g>
4054
  </g>
4055
  <g id="ytick_3">
4056
  <g id="grid-y--4" class="grid grid-y">
4057
+ <path d="M 57.17 326.583279 L 845.766818 326.583279 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4058
  </g>
4059
  <g id="line2d_11">
4060
  <g>
4061
+ <use ns4:href="#m0fca2865ba" x="57.17" y="326.583279" style="stroke: #000000; stroke-width: 0.8" />
4062
  </g>
4063
  </g>
4064
  <g id="text_11">
4065
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="330.382498" transform="rotate(-0 50.17 330.382498)">500</text>
4066
  </g>
4067
  </g>
4068
  <g id="ytick_4">
4069
  <g id="grid-y--5" class="grid grid-y">
4070
+ <path d="M 57.17 265.427692 L 845.766818 265.427692 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4071
  </g>
4072
  <g id="line2d_12">
4073
  <g>
4074
+ <use ns4:href="#m0fca2865ba" x="57.17" y="265.427692" style="stroke: #000000; stroke-width: 0.8" />
4075
  </g>
4076
  </g>
4077
  <g id="text_12">
4078
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="269.226911" transform="rotate(-0 50.17 269.226911)">750</text>
4079
  </g>
4080
  </g>
4081
  <g id="ytick_5">
4082
  <g id="grid-y--6" class="grid grid-y">
4083
+ <path d="M 57.17 204.272105 L 845.766818 204.272105 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4084
  </g>
4085
  <g id="line2d_13">
4086
  <g>
4087
+ <use ns4:href="#m0fca2865ba" x="57.17" y="204.272105" style="stroke: #000000; stroke-width: 0.8" />
4088
  </g>
4089
  </g>
4090
  <g id="text_13">
4091
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="208.071324" transform="rotate(-0 50.17 208.071324)">1000</text>
4092
  </g>
4093
  </g>
4094
  <g id="ytick_6">
4095
  <g id="grid-y--7" class="grid grid-y">
4096
+ <path d="M 57.17 143.116518 L 845.766818 143.116518 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4097
  </g>
4098
  <g id="line2d_14">
4099
  <g>
4100
+ <use ns4:href="#m0fca2865ba" x="57.17" y="143.116518" style="stroke: #000000; stroke-width: 0.8" />
4101
  </g>
4102
  </g>
4103
  <g id="text_14">
4104
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="146.915736" transform="rotate(-0 50.17 146.915736)">1250</text>
4105
  </g>
4106
  </g>
4107
  <g id="ytick_7">
4108
  <g id="grid-y--8" class="grid grid-y">
4109
+ <path d="M 57.17 81.960931 L 845.766818 81.960931 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4110
  </g>
4111
  <g id="line2d_15">
4112
  <g>
4113
+ <use ns4:href="#m0fca2865ba" x="57.17" y="81.960931" style="stroke: #000000; stroke-width: 0.8" />
4114
  </g>
4115
  </g>
4116
  <g id="text_15">
4117
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="85.760149" transform="rotate(-0 50.17 85.760149)">1500</text>
4118
  </g>
4119
  </g>
4120
  <g id="label--y" class="ylabel">
 
4122
  </g>
4123
  </g>
4124
  <g id="series--binned-torch" class="series">
4125
+ <path d="M 93.01531 410.178286 L 195.430481 397.766778 L 297.845652 356.453725 L 400.260823 348.866756 L 502.675995 260.623561 L 605.091166 249.051311 L 707.506337 77.923227 L 809.921508 46.94533 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4126
  <defs>
4127
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4128
  </defs>
4129
  <g clip-path="url(#p5307ca50d8)">
4130
+ <use ns4:href="#md7efaf3aec" x="93.01531" y="410.178286" style="fill: #1f77b4; stroke: #1f77b4" />
4131
+ <use ns4:href="#md7efaf3aec" x="195.430481" y="397.766778" style="fill: #1f77b4; stroke: #1f77b4" />
4132
+ <use ns4:href="#md7efaf3aec" x="297.845652" y="356.453725" style="fill: #1f77b4; stroke: #1f77b4" />
4133
+ <use ns4:href="#md7efaf3aec" x="400.260823" y="348.866756" style="fill: #1f77b4; stroke: #1f77b4" />
4134
+ <use ns4:href="#md7efaf3aec" x="502.675995" y="260.623561" style="fill: #1f77b4; stroke: #1f77b4" />
4135
+ <use ns4:href="#md7efaf3aec" x="605.091166" y="249.051311" style="fill: #1f77b4; stroke: #1f77b4" />
4136
+ <use ns4:href="#md7efaf3aec" x="707.506337" y="77.923227" style="fill: #1f77b4; stroke: #1f77b4" />
4137
  <use ns4:href="#md7efaf3aec" x="809.921508" y="46.94533" style="fill: #1f77b4; stroke: #1f77b4" />
4138
  </g>
4139
  </g>
4140
  <g id="series--gpt-oss-experts" class="series">
4141
+ <path d="M 93.01531 448.251939 L 195.430481 447.932519 L 297.845652 447.952742 L 400.260823 447.595994 L 502.675995 447.251251 L 605.091166 447.055342 L 707.506337 445.656252 L 809.921508 445.625657 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4142
  <defs>
4143
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4144
  </defs>
4145
  <g clip-path="url(#p5307ca50d8)">
4146
  <use ns4:href="#m9b8c54d372" x="93.01531" y="448.251939" style="fill: #ff7f0e; stroke: #ff7f0e" />
4147
+ <use ns4:href="#m9b8c54d372" x="195.430481" y="447.932519" style="fill: #ff7f0e; stroke: #ff7f0e" />
4148
+ <use ns4:href="#m9b8c54d372" x="297.845652" y="447.952742" style="fill: #ff7f0e; stroke: #ff7f0e" />
4149
+ <use ns4:href="#m9b8c54d372" x="400.260823" y="447.595994" style="fill: #ff7f0e; stroke: #ff7f0e" />
4150
+ <use ns4:href="#m9b8c54d372" x="502.675995" y="447.251251" style="fill: #ff7f0e; stroke: #ff7f0e" />
4151
+ <use ns4:href="#m9b8c54d372" x="605.091166" y="447.055342" style="fill: #ff7f0e; stroke: #ff7f0e" />
4152
+ <use ns4:href="#m9b8c54d372" x="707.506337" y="445.656252" style="fill: #ff7f0e; stroke: #ff7f0e" />
4153
+ <use ns4:href="#m9b8c54d372" x="809.921508" y="445.625657" style="fill: #ff7f0e; stroke: #ff7f0e" />
4154
  </g>
4155
  </g>
4156
  <g id="patch_3">
 
4208
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
4209
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
4210
  </span> |
4211
+ Cell: combine | 4.43s
4212
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
4213
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
4214
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
 
4297
  COMBINED BENCHMARK SUMMARY
4298
 
4299
  impl wl p50(ms) ok
4300
+ binned_torch cuda_B1_S1024_E2 377.89 True
4301
+ binned_torch cuda_B1_S1024_E4 408.91 True
4302
+ binned_torch cuda_B1_S512_E2 158.27 True
4303
+ binned_torch cuda_B1_S512_E4 209.01 True
4304
+ binned_torch cuda_B4_S1024_E2 1516.51 True
4305
+ binned_torch cuda_B4_S1024_E4 1643.14 True
4306
+ binned_torch cuda_B4_S512_E2 769.64 True
4307
+ binned_torch cuda_B4_S512_E4 816.95 True
4308
+ gpt_oss_experts cuda_B1_S1024_E2 3.85 True
4309
+ gpt_oss_experts cuda_B1_S1024_E4 5.31 True
4310
+ gpt_oss_experts cuda_B1_S512_E2 2.63 True
4311
+ gpt_oss_experts cuda_B1_S512_E4 3.93 True
4312
+ gpt_oss_experts cuda_B4_S1024_E2 13.24 True
4313
+ gpt_oss_experts cuda_B4_S1024_E4 13.36 True
4314
+ gpt_oss_experts cuda_B4_S512_E2 6.72 True
4315
+ gpt_oss_experts cuda_B4_S512_E4 7.52 True
4316
 
4317
  GENERATING COMBINED VISUALIZATION
4318
 
 
4332
  <div class="uv-install-logs" id="uv-logs-combine">
4333
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4334
  <div class="uv-logs-content" style="display: none;">
4335
+ Installed 37 packages in 205ms
4336
  </div>
4337
  </div>
4338
  <div class="cell-artifacts">
 
4345
  <rdf:RDF>
4346
  <ns2:Work>
4347
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4348
+ <dc:date>2025-12-19T19:55:39.293722</dc:date>
4349
  <dc:format>image/svg+xml</dc:format>
4350
  <dc:creator>
4351
  <ns2:Agent>
 
4481
  <g id="matplotlib.axis_2">
4482
  <g id="ytick_1">
4483
  <g id="grid-y--2" class="grid grid-y">
4484
+ <path d="M 57.17 448.894453 L 845.766818 448.894453 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4485
  </g>
4486
  <g id="line2d_9">
4487
  <defs>
4488
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4489
  </defs>
4490
  <g>
4491
+ <use ns4:href="#m0fca2865ba" x="57.17" y="448.894453" style="stroke: #000000; stroke-width: 0.8" />
4492
  </g>
4493
  </g>
4494
  <g id="text_9">
4495
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="452.693672" transform="rotate(-0 50.17 452.693672)">0</text>
4496
  </g>
4497
  </g>
4498
  <g id="ytick_2">
4499
  <g id="grid-y--3" class="grid grid-y">
4500
+ <path d="M 57.17 387.738866 L 845.766818 387.738866 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4501
  </g>
4502
  <g id="line2d_10">
4503
  <g>
4504
+ <use ns4:href="#m0fca2865ba" x="57.17" y="387.738866" style="stroke: #000000; stroke-width: 0.8" />
4505
  </g>
4506
  </g>
4507
  <g id="text_10">
4508
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="391.538085" transform="rotate(-0 50.17 391.538085)">250</text>
4509
  </g>
4510
  </g>
4511
  <g id="ytick_3">
4512
  <g id="grid-y--4" class="grid grid-y">
4513
+ <path d="M 57.17 326.583279 L 845.766818 326.583279 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4514
  </g>
4515
  <g id="line2d_11">
4516
  <g>
4517
+ <use ns4:href="#m0fca2865ba" x="57.17" y="326.583279" style="stroke: #000000; stroke-width: 0.8" />
4518
  </g>
4519
  </g>
4520
  <g id="text_11">
4521
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="330.382498" transform="rotate(-0 50.17 330.382498)">500</text>
4522
  </g>
4523
  </g>
4524
  <g id="ytick_4">
4525
  <g id="grid-y--5" class="grid grid-y">
4526
+ <path d="M 57.17 265.427692 L 845.766818 265.427692 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4527
  </g>
4528
  <g id="line2d_12">
4529
  <g>
4530
+ <use ns4:href="#m0fca2865ba" x="57.17" y="265.427692" style="stroke: #000000; stroke-width: 0.8" />
4531
  </g>
4532
  </g>
4533
  <g id="text_12">
4534
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="269.226911" transform="rotate(-0 50.17 269.226911)">750</text>
4535
  </g>
4536
  </g>
4537
  <g id="ytick_5">
4538
  <g id="grid-y--6" class="grid grid-y">
4539
+ <path d="M 57.17 204.272105 L 845.766818 204.272105 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4540
  </g>
4541
  <g id="line2d_13">
4542
  <g>
4543
+ <use ns4:href="#m0fca2865ba" x="57.17" y="204.272105" style="stroke: #000000; stroke-width: 0.8" />
4544
  </g>
4545
  </g>
4546
  <g id="text_13">
4547
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="208.071324" transform="rotate(-0 50.17 208.071324)">1000</text>
4548
  </g>
4549
  </g>
4550
  <g id="ytick_6">
4551
  <g id="grid-y--7" class="grid grid-y">
4552
+ <path d="M 57.17 143.116518 L 845.766818 143.116518 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4553
  </g>
4554
  <g id="line2d_14">
4555
  <g>
4556
+ <use ns4:href="#m0fca2865ba" x="57.17" y="143.116518" style="stroke: #000000; stroke-width: 0.8" />
4557
  </g>
4558
  </g>
4559
  <g id="text_14">
4560
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="146.915736" transform="rotate(-0 50.17 146.915736)">1250</text>
4561
  </g>
4562
  </g>
4563
  <g id="ytick_7">
4564
  <g id="grid-y--8" class="grid grid-y">
4565
+ <path d="M 57.17 81.960931 L 845.766818 81.960931 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4566
  </g>
4567
  <g id="line2d_15">
4568
  <g>
4569
+ <use ns4:href="#m0fca2865ba" x="57.17" y="81.960931" style="stroke: #000000; stroke-width: 0.8" />
4570
  </g>
4571
  </g>
4572
  <g id="text_15">
4573
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="85.760149" transform="rotate(-0 50.17 85.760149)">1500</text>
4574
  </g>
4575
  </g>
4576
  <g id="label--y" class="ylabel">
 
4578
  </g>
4579
  </g>
4580
  <g id="series--binned-torch" class="series">
4581
+ <path d="M 93.01531 410.178286 L 195.430481 397.766778 L 297.845652 356.453725 L 400.260823 348.866756 L 502.675995 260.623561 L 605.091166 249.051311 L 707.506337 77.923227 L 809.921508 46.94533 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4582
  <defs>
4583
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4584
  </defs>
4585
  <g clip-path="url(#p5307ca50d8)">
4586
+ <use ns4:href="#md7efaf3aec" x="93.01531" y="410.178286" style="fill: #1f77b4; stroke: #1f77b4" />
4587
+ <use ns4:href="#md7efaf3aec" x="195.430481" y="397.766778" style="fill: #1f77b4; stroke: #1f77b4" />
4588
+ <use ns4:href="#md7efaf3aec" x="297.845652" y="356.453725" style="fill: #1f77b4; stroke: #1f77b4" />
4589
+ <use ns4:href="#md7efaf3aec" x="400.260823" y="348.866756" style="fill: #1f77b4; stroke: #1f77b4" />
4590
+ <use ns4:href="#md7efaf3aec" x="502.675995" y="260.623561" style="fill: #1f77b4; stroke: #1f77b4" />
4591
+ <use ns4:href="#md7efaf3aec" x="605.091166" y="249.051311" style="fill: #1f77b4; stroke: #1f77b4" />
4592
+ <use ns4:href="#md7efaf3aec" x="707.506337" y="77.923227" style="fill: #1f77b4; stroke: #1f77b4" />
4593
  <use ns4:href="#md7efaf3aec" x="809.921508" y="46.94533" style="fill: #1f77b4; stroke: #1f77b4" />
4594
  </g>
4595
  </g>
4596
  <g id="series--gpt-oss-experts" class="series">
4597
+ <path d="M 93.01531 448.251939 L 195.430481 447.932519 L 297.845652 447.952742 L 400.260823 447.595994 L 502.675995 447.251251 L 605.091166 447.055342 L 707.506337 445.656252 L 809.921508 445.625657 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4598
  <defs>
4599
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4600
  </defs>
4601
  <g clip-path="url(#p5307ca50d8)">
4602
  <use ns4:href="#m9b8c54d372" x="93.01531" y="448.251939" style="fill: #ff7f0e; stroke: #ff7f0e" />
4603
+ <use ns4:href="#m9b8c54d372" x="195.430481" y="447.932519" style="fill: #ff7f0e; stroke: #ff7f0e" />
4604
+ <use ns4:href="#m9b8c54d372" x="297.845652" y="447.952742" style="fill: #ff7f0e; stroke: #ff7f0e" />
4605
+ <use ns4:href="#m9b8c54d372" x="400.260823" y="447.595994" style="fill: #ff7f0e; stroke: #ff7f0e" />
4606
+ <use ns4:href="#m9b8c54d372" x="502.675995" y="447.251251" style="fill: #ff7f0e; stroke: #ff7f0e" />
4607
+ <use ns4:href="#m9b8c54d372" x="605.091166" y="447.055342" style="fill: #ff7f0e; stroke: #ff7f0e" />
4608
+ <use ns4:href="#m9b8c54d372" x="707.506337" y="445.656252" style="fill: #ff7f0e; stroke: #ff7f0e" />
4609
+ <use ns4:href="#m9b8c54d372" x="809.921508" y="445.625657" style="fill: #ff7f0e; stroke: #ff7f0e" />
4610
  </g>
4611
  </g>
4612
  <g id="patch_3">
rotary/impls/artifacts/benchmark/rotary.jsonl CHANGED
@@ -1,24 +1,24 @@
1
- {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H8_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.17331300000478223, "p50": 0.17603300000246236, "p90": 0.1797429999896849, "mean": 0.1784169999950791, "iqr": 0.0038800000083938357, "raw_times": [0.17603300000246236, 0.17586299998129107, 0.1797429999896849, 0.18713299999717492, 0.17331300000478223], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.18657300000768373, "peak_bytes": 1720320, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
2
- {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H8_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21556299998337636, "p50": 0.2165239999953883, "p90": 0.21698299997297, "mean": 0.21635159998822928, "iqr": 0.0013189999776841432, "raw_times": [0.2165239999953883, 0.21698299997297, 0.21566399999528585, 0.21556299998337636, 0.21702399999412592], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21905399995603148, "peak_bytes": 3440640, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
3
- {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H32_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21391299998185787, "p50": 0.21503299996084024, "p90": 0.21681300000864212, "mean": 0.21537540000053923, "iqr": 0.0027289999593449465, "raw_times": [0.21503299996084024, 0.21408400004929717, 0.21681300000864212, 0.21703400000205875, 0.21391299998185787], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2214840000078766, "peak_bytes": 6832128, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
4
- {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H32_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21356299998842587, "p50": 0.2151840000124139, "p90": 0.2162740000244412, "mean": 0.21532140000317668, "iqr": 0.0011410000411160581, "raw_times": [0.2162740000244412, 0.21513299998332513, 0.2151840000124139, 0.2164530000072773, 0.21356299998842587], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2165939999940747, "peak_bytes": 13664256, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
5
- {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H8_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21375400001488742, "p50": 0.21507399998199617, "p90": 0.21535299998731716, "mean": 0.21505959999785773, "iqr": 0.0006099999723119254, "raw_times": [0.21474300001500524, 0.21375400001488742, 0.21535299998731716, 0.21507399998199617, 0.21637399999008267], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2174030000219318, "peak_bytes": 6881280, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
6
- {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H8_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2112430000238419, "p50": 0.21400400004267794, "p90": 0.21425299996735703, "mean": 0.21312160000661606, "iqr": 0.002878999964650575, "raw_times": [0.21137400000270645, 0.21425299996735703, 0.21400400004267794, 0.214733999996497, 0.2112430000238419], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22874399996908323, "peak_bytes": 13762560, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
7
- {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H32_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21246400001473376, "p50": 0.2133630000002995, "p90": 0.21390399996334963, "mean": 0.2133594000042649, "iqr": 0.0008009999419300584, "raw_times": [0.21396300002152202, 0.21246400001473376, 0.21390399996334963, 0.21310300002141958, 0.2133630000002995], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2195139999798812, "peak_bytes": 27328512, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
8
- {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H32_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21415399999114015, "p50": 0.21443299999646115, "p90": 0.2147029999832739, "mean": 0.2148253999962435, "iqr": 0.000368999963029637, "raw_times": [0.21650299999009803, 0.21415399999114015, 0.21443299999646115, 0.21433400002024428, 0.2147029999832739], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2173330000232454, "peak_bytes": 54657024, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
9
- {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H8_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21359300001222437, "p50": 0.2138830000149028, "p90": 0.21400299999640993, "mean": 0.21457699999700708, "iqr": 0.00012000003835055395, "raw_times": [0.21400299999640993, 0.2138830000149028, 0.21359300001222437, 0.21752300000343894, 0.21388299995805937], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2191329999732261, "peak_bytes": 27525120, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
10
- {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H8_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21296400001347138, "p50": 0.21389400001226022, "p90": 0.21517300001505646, "mean": 0.21466560000362733, "iqr": 0.0013790000252811296, "raw_times": [0.21296400001347138, 0.21517300001505646, 0.21750299998757328, 0.21379399998977533, 0.21389400001226022], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21542299998600356, "peak_bytes": 55050240, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
11
- {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H32_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2098030000183826, "p50": 0.21347300003071723, "p90": 0.21457399998325855, "mean": 0.21505920001345658, "iqr": 0.0023309999619414157, "raw_times": [0.2098030000183826, 0.21457399998325855, 0.21347300003071723, 0.21224300002131713, 0.22520300001360738], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21741399996244581, "peak_bytes": 109314048, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
12
- {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H32_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.22297399999615664, "p50": 0.22381400003723684, "p90": 0.22385300002270014, "mean": 0.2239618000203336, "iqr": 0.0007890000119914475, "raw_times": [0.2230640000107087, 0.22385300002270014, 0.22381400003723684, 0.22610400003486575, 0.22297399999615664], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22540300000173374, "peak_bytes": 218628096, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
13
- {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H8_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21275400001741218, "p50": 0.21372400004793235, "p90": 0.21630299994512825, "mean": 0.22107159999222858, "iqr": 0.0030399999673136335, "raw_times": [0.21372400004793235, 0.24931399997285553, 0.21630299994512825, 0.21326299997781462, 0.21275400001741218], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21886299998641334, "peak_bytes": 68698112, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
14
- {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H8_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21458399999119138, "p50": 0.21627299997817317, "p90": 0.21634299997685957, "mean": 0.21600339998713025, "iqr": 0.0007099999947968172, "raw_times": [0.21627299997817317, 0.21718400000736438, 0.21563299998206276, 0.21458399999119138, 0.21634299997685957], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2226130000053672, "peak_bytes": 6848512, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
15
- {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H32_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2140940000003866, "p50": 0.215932999992674, "p90": 0.21619400001782196, "mean": 0.21597160000510485, "iqr": 0.0015699999948992627, "raw_times": [0.2140940000003866, 0.2146240000229227, 0.215932999992674, 0.21619400001782196, 0.21901299999171897], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2184540000484958, "peak_bytes": 13647872, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
16
- {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H32_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21190300003581797, "p50": 0.21745400005102056, "p90": 0.21756400002459486, "mean": 0.21624960003236993, "iqr": 0.0009400000067216752, "raw_times": [0.21190300003581797, 0.21745400005102056, 0.21756400002459486, 0.2166240000178732, 0.21770300003254306], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.25062399998887486, "peak_bytes": 27295744, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
17
- {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H8_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21696300001394775, "p50": 0.21815399998104112, "p90": 0.21820400002070528, "mean": 0.21879360000411907, "iqr": 0.0004510000053414842, "raw_times": [0.2177530000153638, 0.21815399998104112, 0.2228939999895374, 0.21696300001394775, 0.21820400002070528], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2236640000319312, "peak_bytes": 13697024, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
18
- {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H8_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21379300000035073, "p50": 0.21643299999141163, "p90": 0.21674399999938032, "mean": 0.21709340001052624, "iqr": 0.00039999997625272954, "raw_times": [0.21379300000035073, 0.21674399999938032, 0.21643299999141163, 0.2163440000231276, 0.22215300003836091], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21868300001415264, "peak_bytes": 27394048, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
19
- {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H32_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2133630000002995, "p50": 0.21632300001783733, "p90": 0.21671399997558183, "mean": 0.21582319999424726, "iqr": 0.0009309999882134434, "raw_times": [0.21693299999014926, 0.2133630000002995, 0.21632300001783733, 0.21671399997558183, 0.21578299998736838], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2180729999849973, "peak_bytes": 54591488, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
20
- {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H32_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21294399999760572, "p50": 0.21446299996341622, "p90": 0.21984300002486634, "mean": 0.21647359999406035, "iqr": 0.006489000043075066, "raw_times": [0.21335399998179128, 0.2217640000026222, 0.21984300002486634, 0.21294399999760572, 0.21446299996341622], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21826299996519083, "peak_bytes": 109182976, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
21
- {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H8_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21578299998736838, "p50": 0.21700399997826025, "p90": 0.2204729999562005, "mean": 0.21918559997402554, "iqr": 0.004118999981983507, "raw_times": [0.21700399997826025, 0.22631399997408153, 0.2204729999562005, 0.216353999974217, 0.21578299998736838], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22091399995360916, "peak_bytes": 54788096, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
22
- {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H8_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2135729999963587, "p50": 0.2144540000017514, "p90": 0.2173039999888715, "mean": 0.21536960000503313, "iqr": 0.003270999968663091, "raw_times": [0.21403300002020842, 0.2144540000017514, 0.2135729999963587, 0.2173039999888715, 0.21748400001797563], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22203300000001036, "peak_bytes": 109576192, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
23
- {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H32_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.22941399998899215, "p50": 0.23028300000760282, "p90": 0.23160400002097958, "mean": 0.23061779999125065, "iqr": 0.0017800000478018774, "raw_times": [0.231963999965501, 0.22941399998899215, 0.2298239999731777, 0.23160400002097958, 0.23028300000760282], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.23190299998532282, "peak_bytes": 218365952, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
24
- {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H32_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.6428210000422041, "p50": 0.6484909999926458, "p90": 0.6486400000085268, "mean": 0.6472164000115299, "iqr": 0.0035200000070290116, "raw_times": [0.651010000012775, 0.6428210000422041, 0.6486400000085268, 0.6451200000014978, 0.6484909999926458], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.6451109999829896, "peak_bytes": 436731904, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
 
1
+ {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H8_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.07445200003530772, "p50": 0.07589100005134242, "p90": 0.07600200001434132, "mean": 0.0754678000248532, "iqr": 0.0014600000213249587, "raw_times": [0.0764520000302582, 0.07600200001434132, 0.07589100005134242, 0.07454199999301636, 0.07445200003530772], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08018199991965957, "peak_bytes": 3178496, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.590452924915553e-08, "mae_k": 1.5487040982975486e-08, "mse_q": 2.5241010080938753e-15, "mse_k": 2.364223539299626e-15, "ref": "rotary_torch"}, "err": null}
2
+ {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H8_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08898300006876525, "p50": 0.09174199999506527, "p90": 0.09300300007453188, "mean": 0.09168480000880663, "iqr": 0.0013200001376389991, "raw_times": [0.09168299993689288, 0.09300300007453188, 0.09174199999506527, 0.09301299996877788, 0.08898300006876525], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09485199984737847, "peak_bytes": 6356992, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5508486939097565e-08, "mae_k": 1.567566698668088e-08, "mse_q": 2.3630110116356316e-15, "mse_k": 2.416562128626943e-15, "ref": "rotary_torch"}, "err": null}
3
+ {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H32_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08705300001565774, "p50": 0.09316300020145718, "p90": 0.10223200001746591, "mean": 0.09793460003493237, "iqr": 0.013889999991079094, "raw_times": [0.10223200001746591, 0.09316300020145718, 0.11888299991369422, 0.08834200002638681, 0.08705300001565774], "has_warnings": true, "reps": 5, "warmup": 2}, "compile_ms": 0.09152200004791666, "peak_bytes": 12615680, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5856898016863852e-08, "mae_k": 1.572981211950264e-08, "mse_q": 2.4771055025978386e-15, "mse_k": 2.4544071371937915e-15, "ref": "rotary_torch"}, "err": null}
4
+ {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H32_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0873520000368444, "p50": 0.08926300006351084, "p90": 0.08946200000536919, "mean": 0.08885220004231087, "iqr": 0.0013999999737279722, "raw_times": [0.0873520000368444, 0.09012200007418869, 0.08926300006351084, 0.08806200003164122, 0.08946200000536919], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09113200007959676, "peak_bytes": 25231360, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5617658277733426e-08, "mae_k": 1.5788685914230882e-08, "mse_q": 2.4549424620164562e-15, "mse_k": 2.492823469483563e-15, "ref": "rotary_torch"}, "err": null}
5
+ {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H8_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08857299985720601, "p50": 0.09020299989970226, "p90": 0.09035299990500789, "mean": 0.0900987999557401, "iqr": 0.00085999977272877, "raw_times": [0.08949300013227912, 0.09020299989970226, 0.09035299990500789, 0.08857299985720601, 0.09187199998450524], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09262200001103338, "peak_bytes": 12779520, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5962712041073246e-08, "mae_k": 1.5743363945830424e-08, "mse_q": 2.534145124782417e-15, "mse_k": 2.451281585618423e-15, "ref": "rotary_torch"}, "err": null}
6
+ {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H8_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08818200012683519, "p50": 0.08903200000531797, "p90": 0.08924200005822058, "mean": 0.08891200000107347, "iqr": 0.0008400002116104588, "raw_times": [0.08903200000531797, 0.08840199984661012, 0.08818200012683519, 0.08924200005822058, 0.08970199996838346], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09153199994216266, "peak_bytes": 25427968, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.578730035589615e-08, "mae_k": 1.5859711766097462e-08, "mse_q": 2.440287521479536e-15, "mse_k": 2.477901290051784e-15, "ref": "rotary_torch"}, "err": null}
7
+ {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H32_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08838300004754274, "p50": 0.08999199985737505, "p90": 0.0905619999684859, "mean": 0.09254639999198844, "iqr": 0.0011899999208253575, "raw_times": [0.0905619999684859, 0.10442300003887794, 0.08937200004766055, 0.08999199985737505, 0.08838300004754274], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.10543200005486142, "peak_bytes": 50462720, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5775295736375483e-08, "mae_k": 1.5847881229547056e-08, "mse_q": 2.471039476146077e-15, "mse_k": 2.472378635235686e-15, "ref": "rotary_torch"}, "err": null}
8
+ {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H32_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08926200007408625, "p50": 0.08963200002654048, "p90": 0.09017300021696428, "mean": 0.08983220009213255, "iqr": 0.0005410001904238015, "raw_times": [0.08963200002654048, 0.09017300021696428, 0.08926200007408625, 0.08963200002654048, 0.09046200011653127], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09046300010595587, "peak_bytes": 100925440, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5959869870130206e-08, "mae_k": 1.588083975434529e-08, "mse_q": 2.510663677418633e-15, "mse_k": 2.502786271009168e-15, "ref": "rotary_torch"}, "err": null}
9
+ {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H8_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08713199986232212, "p50": 0.08950200003710052, "p90": 0.08994299992082233, "mean": 0.08932219993766921, "iqr": 0.0012210000477352878, "raw_times": [0.08872199987308704, 0.08950200003710052, 0.08994299992082233, 0.09131199999501405, 0.08713199986232212], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09138199993685703, "peak_bytes": 51118080, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5894533689220225e-08, "mae_k": 1.5873395042831362e-08, "mse_q": 2.5093181655819197e-15, "mse_k": 2.488611809911578e-15, "ref": "rotary_torch"}, "err": null}
10
+ {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H8_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08889199989425833, "p50": 0.09063199991032889, "p90": 0.09075200000552286, "mean": 0.09513419990980765, "iqr": 0.0011500001164677087, "raw_times": [0.08889199989425833, 0.08960199988905515, 0.09075200000552286, 0.09063199991032889, 0.11579299984987301], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09194200015372189, "peak_bytes": 101711872, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5936768349433805e-08, "mae_k": 1.5960043953100467e-08, "mse_q": 2.51039008577667e-15, "mse_k": 2.5111253103748867e-15, "ref": "rotary_torch"}, "err": null}
11
+ {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H32_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08985299996311369, "p50": 0.09080199993150018, "p90": 0.09128199985752872, "mean": 0.09099019994209812, "iqr": 0.0010899998414970469, "raw_times": [0.08985299996311369, 0.09019200001603167, 0.09128199985752872, 0.09282199994231632, 0.09080199993150018], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09297199994762195, "peak_bytes": 201850880, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 1.9073486328125e-06, "absmax_k": 9.5367431640625e-07, "mae_q": 1.586510300910504e-08, "mae_k": 1.5813935050346117e-08, "mse_q": 2.499836478770355e-15, "mse_k": 2.4755639026338358e-15, "ref": "rotary_torch"}, "err": null}
12
+ {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H32_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2603559998988203, "p50": 0.2619460001369589, "p90": 0.2620170000682265, "mean": 0.26208240001324157, "iqr": 0.00017100001059588976, "raw_times": [0.2619460001369589, 0.2603559998988203, 0.2618460000576306, 0.26424699990457157, 0.2620170000682265], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.26098600005752814, "peak_bytes": 403701760, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.581049247079136e-08, "mae_k": 1.5861061797295406e-08, "mse_q": 2.4735094242202705e-15, "mse_k": 2.486832828964107e-15, "ref": "rotary_torch"}, "err": null}
13
+ {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H8_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0871929998993437, "p50": 0.08902200011107197, "p90": 0.08904199989956396, "mean": 0.08843839996188763, "iqr": 0.0015389998679893324, "raw_times": [0.08750300003157463, 0.08904199989956396, 0.08943199986788386, 0.08902200011107197, 0.0871929998993437], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09225299982063007, "peak_bytes": 137396224, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5824980437173508e-08, "mae_k": 1.5615324144846454e-08, "mse_q": 2.488090249374306e-15, "mse_k": 2.425079044911585e-15, "ref": "rotary_torch"}, "err": null}
14
+ {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H8_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0884020000739838, "p50": 0.08922200004235492, "p90": 0.08970199996838346, "mean": 0.08935000005294569, "iqr": 0.0005199999577598646, "raw_times": [0.08922200004235492, 0.0884020000739838, 0.09024200016938266, 0.08970199996838346, 0.0891820000106236], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09462200000598386, "peak_bytes": 12648448, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5683587761827766e-08, "mae_k": 1.574532682013796e-08, "mse_q": 2.4310271220254415e-15, "mse_k": 2.4601385856313877e-15, "ref": "rotary_torch"}, "err": null}
15
+ {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H32_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08808200004750688, "p50": 0.08919200013224327, "p90": 0.09035200014295697, "mean": 0.0894580000476708, "iqr": 0.0017100001059588976, "raw_times": [0.08808200004750688, 0.08919200013224327, 0.09035200014295697, 0.09102199987864878, 0.08864200003699807], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0948819999848638, "peak_bytes": 25198592, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5835009747888762e-08, "mae_k": 1.572560215379326e-08, "mse_q": 2.478222950813504e-15, "mse_k": 2.4541699679685603e-15, "ref": "rotary_torch"}, "err": null}
16
+ {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H32_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08770199997343298, "p50": 0.08942199997363787, "p90": 0.08942299996306247, "mean": 0.08932019995882001, "iqr": 0.0003010000000358559, "raw_times": [0.08912199996302661, 0.09093199992094014, 0.08770199997343298, 0.08942299996306247, 0.08942199997363787], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09424199993190996, "peak_bytes": 50397184, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5876850056883995e-08, "mae_k": 1.5927410501603845e-08, "mse_q": 2.504224532953606e-15, "mse_k": 2.503892919554756e-15, "ref": "rotary_torch"}, "err": null}
17
+ {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H8_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08902200011107197, "p50": 0.09058199998435157, "p90": 0.09125299993684166, "mean": 0.09215640002366854, "iqr": 0.0012609998520929366, "raw_times": [0.09993300000132876, 0.09058199998435157, 0.09125299993684166, 0.08902200011107197, 0.08999200008474872], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09323300014330016, "peak_bytes": 25362432, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5820052823301012e-08, "mae_k": 1.580205122309053e-08, "mse_q": 2.4876468276264184e-15, "mse_k": 2.4866062476507165e-15, "ref": "rotary_torch"}, "err": null}
18
+ {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H8_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08776200002102996, "p50": 0.08892300002116826, "p90": 0.08966199993665214, "mean": 0.0888985999608849, "iqr": 0.001638999947317643, "raw_times": [0.09012299983623961, 0.08892300002116826, 0.08966199993665214, 0.08776200002102996, 0.0880229999893345], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09301299996877788, "peak_bytes": 50593792, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5823172105911e-08, "mae_k": 1.582038855474366e-08, "mse_q": 2.464257071579175e-15, "mse_k": 2.4775099608301526e-15, "ref": "rotary_torch"}, "err": null}
19
+ {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H32_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0880819998201332, "p50": 0.08909200005291495, "p90": 0.08928200008995191, "mean": 0.08905000004233443, "iqr": 0.00023999996301427018, "raw_times": [0.0880819998201332, 0.08904200012693764, 0.08975200012173445, 0.08909200005291495, 0.08928200008995191], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09304200011683861, "peak_bytes": 100794368, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5888783622131086e-08, "mae_k": 1.5861886026868888e-08, "mse_q": 2.4766798685418433e-15, "mse_k": 2.475923891636419e-15, "ref": "rotary_torch"}, "err": null}
20
+ {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H32_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08700200010025583, "p50": 0.08849200003169244, "p90": 0.0890119999894523, "mean": 0.08845600000313425, "iqr": 0.0007099999947968172, "raw_times": [0.0890119999894523, 0.08830199999465549, 0.08947199989961518, 0.08700200010025583, 0.08849200003169244], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09235300012733205, "peak_bytes": 201588736, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5826390864503992e-08, "mae_k": 1.5792682717119533e-08, "mse_q": 2.480465258783123e-15, "mse_k": 2.475580631534544e-15, "ref": "rotary_torch"}, "err": null}
21
+ {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H8_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08904200012693764, "p50": 0.0900719999208377, "p90": 0.09035200014295697, "mean": 0.09022600006574066, "iqr": 0.0009600000794307562, "raw_times": [0.09035200014295697, 0.08904200012693764, 0.0922720000744448, 0.08939200006352621, 0.0900719999208377], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09144199998445401, "peak_bytes": 101449728, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.592899323554775e-08, "mae_k": 1.5925031959795888e-08, "mse_q": 2.50783882253954e-15, "mse_k": 2.5015648494992274e-15, "ref": "rotary_torch"}, "err": null}
22
+ {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H8_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08612200008428772, "p50": 0.08916199999475793, "p90": 0.08966199993665214, "mean": 0.08842420002110885, "iqr": 0.002328999926248798, "raw_times": [0.08612200008428772, 0.0898420000794431, 0.08916199999475793, 0.08733300001040334, 0.08966199993665214], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09376299999530602, "peak_bytes": 202375168, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.590209919299923e-08, "mae_k": 1.590130160877834e-08, "mse_q": 2.4971026799330918e-15, "mse_k": 2.506967649153289e-15, "ref": "rotary_torch"}, "err": null}
23
+ {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H32_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2547460001096624, "p50": 0.25804599999901257, "p90": 0.2586460000202351, "mean": 0.25757600001270475, "iqr": 0.0013200001376389991, "raw_times": [0.2547460001096624, 0.2591160000520176, 0.25804599999901257, 0.2586460000202351, 0.2573259998825961], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.25434600001972285, "peak_bytes": 403177472, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5847520629108658e-08, "mae_k": 1.5862454461057496e-08, "mse_q": 2.4917348203881045e-15, "mse_k": 2.491306009958557e-15, "ref": "rotary_torch"}, "err": null}
24
+ {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H32_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.8454199999050616, "p50": 0.8495600000060222, "p90": 0.8538209999642277, "mean": 0.8503745999860257, "iqr": 0.0067099999796482734, "raw_times": [0.8559610000702378, 0.8538209999642277, 0.8471109999845794, 0.8495600000060222, 0.8454199999050616], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.8642910001981363, "peak_bytes": 806354944, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.585225106737198e-08, "mae_k": 1.581303976649906e-08, "mse_q": 2.4866460581992374e-15, "mse_k": 2.4721545950211372e-15, "ref": "rotary_torch"}, "err": null}
rotary/impls/cells/benchmark.py CHANGED
@@ -4,6 +4,7 @@
4
  # "numpy",
5
  # "torch==2.8.0",
6
  # "kernels-benchmark-tools",
 
7
  # ]
8
  #
9
  # [tool.uv.sources]
@@ -12,46 +13,36 @@
12
  import torch
13
  import sys
14
  from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
 
15
 
 
 
16
 
17
- def apply_rotary_torch(x1, x2, cos, sin, conj=False):
18
- """Reference rotary implementation."""
19
- if not conj:
20
- out1 = x1 * cos - x2 * sin
21
- out2 = x1 * sin + x2 * cos
22
- else:
23
- out1 = x1 * cos + x2 * sin
24
- out2 = -x1 * sin + x2 * cos
25
- return out1, out2
26
 
27
-
28
- def torch_rotary(query, key, cos, sin, conj=False):
29
  rotary_dim = cos.shape[-1]
30
 
31
- # Clone inputs to avoid modifying them
32
  q_out = query.clone()
33
  k_out = key.clone()
34
 
35
  # Apply rotation to query
36
  q1 = q_out[..., :rotary_dim]
37
  q2 = q_out[..., rotary_dim : 2 * rotary_dim]
38
- q_out_1, q_out_2 = apply_rotary_torch(q1, q2, cos, sin, conj)
39
- q_out[..., :rotary_dim] = q_out_1
40
- q_out[..., rotary_dim : 2 * rotary_dim] = q_out_2
41
 
42
  # Apply rotation to key
43
  k1 = k_out[..., :rotary_dim]
44
  k2 = k_out[..., rotary_dim : 2 * rotary_dim]
45
- k_out_1, k_out_2 = apply_rotary_torch(k1, k2, cos, sin, conj)
46
- k_out[..., :rotary_dim] = k_out_1
47
- k_out[..., rotary_dim : 2 * rotary_dim] = k_out_2
48
 
49
  return q_out, k_out
50
 
51
 
52
  run_benchmark(
53
  kernel_type=KernelTypeEnum.ROTARY,
54
- impl_name="torch_eager",
55
- impl_tags={"family": "pytorch", "backend": "eager"},
56
- impl_func=torch_rotary,
 
57
  )
 
4
  # "numpy",
5
  # "torch==2.8.0",
6
  # "kernels-benchmark-tools",
7
+ # "kernels",
8
  # ]
9
  #
10
  # [tool.uv.sources]
 
13
  import torch
14
  import sys
15
  from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
16
+ from kernels import get_kernel
17
 
18
+ # Load the rotary kernel
19
+ rotary = get_kernel("kernels-community/rotary")
20
 
 
 
 
 
 
 
 
 
 
21
 
22
+ def hf_kernels_rotary(query, key, cos, sin, conj=False):
 
23
  rotary_dim = cos.shape[-1]
24
 
25
+ # Clone to avoid modifying inputs
26
  q_out = query.clone()
27
  k_out = key.clone()
28
 
29
  # Apply rotation to query
30
  q1 = q_out[..., :rotary_dim]
31
  q2 = q_out[..., rotary_dim : 2 * rotary_dim]
32
+ rotary.apply_rotary(q1, q2, cos, sin, q1, q2, conj)
 
 
33
 
34
  # Apply rotation to key
35
  k1 = k_out[..., :rotary_dim]
36
  k2 = k_out[..., rotary_dim : 2 * rotary_dim]
37
+ rotary.apply_rotary(k1, k2, cos, sin, k1, k2, conj)
 
 
38
 
39
  return q_out, k_out
40
 
41
 
42
  run_benchmark(
43
  kernel_type=KernelTypeEnum.ROTARY,
44
+ impl_name="hf_kernels_rotary",
45
+ impl_tags={"family": "hf-kernels", "backend": "cuda"},
46
+ impl_func=hf_kernels_rotary,
47
+ dtype="float32",
48
  )
rotary/impls/hf_kernels_rotary.html CHANGED
The diff for this file is too large to render. See raw diff
 
rotary/impls/torch_rotary.html CHANGED
The diff for this file is too large to render. See raw diff
 
rotary/results/artifacts/combine/latency.svg CHANGED

Git LFS Details

  • SHA256: 04c99f3bdfb8e557a70edb1f042a847172d839fb06e1e2252c0b4df4cf0c1abe
  • Pointer size: 130 Bytes
  • Size of remote file: 37.9 kB

Git LFS Details

  • SHA256: b50b39073ca470536083df3bd46c41d2c32d623cb6c07d81e1425542a6d29446
  • Pointer size: 130 Bytes
  • Size of remote file: 37.9 kB
rotary/results/combined_results.html CHANGED
@@ -3889,7 +3889,7 @@ body[data-tool="eraser"] .main-content {
3889
  <rdf:RDF>
3890
  <ns2:Work>
3891
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
3892
- <dc:date>2025-12-19T19:09:41.164726</dc:date>
3893
  <dc:format>image/svg+xml</dc:format>
3894
  <dc:creator>
3895
  <ns2:Agent>
@@ -4233,109 +4233,109 @@ body[data-tool="eraser"] .main-content {
4233
  <g id="matplotlib.axis_2">
4234
  <g id="ytick_1">
4235
  <g id="grid-y--2" class="grid grid-y">
4236
- <path d="M 47.72 394.065769 L 823.142937 394.065769 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4237
  </g>
4238
  <g id="line2d_25">
4239
  <defs>
4240
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4241
  </defs>
4242
  <g>
4243
- <use ns4:href="#m0fca2865ba" x="47.72" y="394.065769" style="stroke: #000000; stroke-width: 0.8" />
4244
  </g>
4245
  </g>
4246
  <g id="text_25">
4247
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="397.864988" transform="rotate(-0 40.72 397.864988)">0.1</text>
4248
  </g>
4249
  </g>
4250
  <g id="ytick_2">
4251
  <g id="grid-y--3" class="grid grid-y">
4252
- <path d="M 47.72 347.214212 L 823.142937 347.214212 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4253
  </g>
4254
  <g id="line2d_26">
4255
  <g>
4256
- <use ns4:href="#m0fca2865ba" x="47.72" y="347.214212" style="stroke: #000000; stroke-width: 0.8" />
4257
  </g>
4258
  </g>
4259
  <g id="text_26">
4260
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="351.013431" transform="rotate(-0 40.72 351.013431)">0.2</text>
4261
  </g>
4262
  </g>
4263
  <g id="ytick_3">
4264
  <g id="grid-y--4" class="grid grid-y">
4265
- <path d="M 47.72 300.362656 L 823.142937 300.362656 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4266
  </g>
4267
  <g id="line2d_27">
4268
  <g>
4269
- <use ns4:href="#m0fca2865ba" x="47.72" y="300.362656" style="stroke: #000000; stroke-width: 0.8" />
4270
  </g>
4271
  </g>
4272
  <g id="text_27">
4273
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="304.161875" transform="rotate(-0 40.72 304.161875)">0.3</text>
4274
  </g>
4275
  </g>
4276
  <g id="ytick_4">
4277
  <g id="grid-y--5" class="grid grid-y">
4278
- <path d="M 47.72 253.511099 L 823.142937 253.511099 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4279
  </g>
4280
  <g id="line2d_28">
4281
  <g>
4282
- <use ns4:href="#m0fca2865ba" x="47.72" y="253.511099" style="stroke: #000000; stroke-width: 0.8" />
4283
  </g>
4284
  </g>
4285
  <g id="text_28">
4286
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="257.310318" transform="rotate(-0 40.72 257.310318)">0.4</text>
4287
  </g>
4288
  </g>
4289
  <g id="ytick_5">
4290
  <g id="grid-y--6" class="grid grid-y">
4291
- <path d="M 47.72 206.659543 L 823.142937 206.659543 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4292
  </g>
4293
  <g id="line2d_29">
4294
  <g>
4295
- <use ns4:href="#m0fca2865ba" x="47.72" y="206.659543" style="stroke: #000000; stroke-width: 0.8" />
4296
  </g>
4297
  </g>
4298
  <g id="text_29">
4299
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="210.458761" transform="rotate(-0 40.72 210.458761)">0.5</text>
4300
  </g>
4301
  </g>
4302
  <g id="ytick_6">
4303
  <g id="grid-y--7" class="grid grid-y">
4304
- <path d="M 47.72 159.807986 L 823.142937 159.807986 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4305
  </g>
4306
  <g id="line2d_30">
4307
  <g>
4308
- <use ns4:href="#m0fca2865ba" x="47.72" y="159.807986" style="stroke: #000000; stroke-width: 0.8" />
4309
  </g>
4310
  </g>
4311
  <g id="text_30">
4312
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="163.607205" transform="rotate(-0 40.72 163.607205)">0.6</text>
4313
  </g>
4314
  </g>
4315
  <g id="ytick_7">
4316
  <g id="grid-y--8" class="grid grid-y">
4317
- <path d="M 47.72 112.956429 L 823.142937 112.956429 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4318
  </g>
4319
  <g id="line2d_31">
4320
  <g>
4321
- <use ns4:href="#m0fca2865ba" x="47.72" y="112.956429" style="stroke: #000000; stroke-width: 0.8" />
4322
  </g>
4323
  </g>
4324
  <g id="text_31">
4325
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="116.755648" transform="rotate(-0 40.72 116.755648)">0.7</text>
4326
  </g>
4327
  </g>
4328
  <g id="ytick_8">
4329
  <g id="grid-y--9" class="grid grid-y">
4330
- <path d="M 47.72 66.104873 L 823.142937 66.104873 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4331
  </g>
4332
  <g id="line2d_32">
4333
  <g>
4334
- <use ns4:href="#m0fca2865ba" x="47.72" y="66.104873" style="stroke: #000000; stroke-width: 0.8" />
4335
  </g>
4336
  </g>
4337
  <g id="text_32">
4338
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="69.904092" transform="rotate(-0 40.72 69.904092)">0.8</text>
4339
  </g>
4340
  </g>
4341
  <g id="label--y" class="ylabel">
@@ -4343,67 +4343,67 @@ body[data-tool="eraser"] .main-content {
4343
  </g>
4344
  </g>
4345
  <g id="series--hf-kernels-rotary" class="series">
4346
- <path d="M 82.966497 405.060892 L 113.615625 399.43402 L 144.264753 400.029504 L 174.913881 399.696858 L 205.563009 399.059208 L 236.212137 400.038874 L 266.861265 399.640636 L 297.510393 400.038405 L 328.159521 400.006078 L 358.808648 399.691704 L 389.457776 399.640167 L 420.106904 318.131109 L 450.756032 400.455853 L 481.40516 400.197701 L 512.054288 399.907221 L 542.703416 399.860838 L 573.352544 400.20754 L 604.001672 400.567828 L 634.6508 399.780722 L 665.299928 400.403848 L 695.949056 399.312675 L 726.598184 400.328885 L 757.247312 320.371082 L 787.896439 44.888614 " clip-path="url(#p088c925177)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4347
  <defs>
4348
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4349
  </defs>
4350
  <g clip-path="url(#p088c925177)">
4351
  <use ns4:href="#md7efaf3aec" x="82.966497" y="405.060892" style="fill: #1f77b4; stroke: #1f77b4" />
4352
- <use ns4:href="#md7efaf3aec" x="113.615625" y="399.43402" style="fill: #1f77b4; stroke: #1f77b4" />
4353
- <use ns4:href="#md7efaf3aec" x="144.264753" y="400.029504" style="fill: #1f77b4; stroke: #1f77b4" />
4354
- <use ns4:href="#md7efaf3aec" x="174.913881" y="399.696858" style="fill: #1f77b4; stroke: #1f77b4" />
4355
- <use ns4:href="#md7efaf3aec" x="205.563009" y="399.059208" style="fill: #1f77b4; stroke: #1f77b4" />
4356
- <use ns4:href="#md7efaf3aec" x="236.212137" y="400.038874" style="fill: #1f77b4; stroke: #1f77b4" />
4357
- <use ns4:href="#md7efaf3aec" x="266.861265" y="399.640636" style="fill: #1f77b4; stroke: #1f77b4" />
4358
- <use ns4:href="#md7efaf3aec" x="297.510393" y="400.038405" style="fill: #1f77b4; stroke: #1f77b4" />
4359
- <use ns4:href="#md7efaf3aec" x="328.159521" y="400.006078" style="fill: #1f77b4; stroke: #1f77b4" />
4360
- <use ns4:href="#md7efaf3aec" x="358.808648" y="399.691704" style="fill: #1f77b4; stroke: #1f77b4" />
4361
- <use ns4:href="#md7efaf3aec" x="389.457776" y="399.640167" style="fill: #1f77b4; stroke: #1f77b4" />
4362
- <use ns4:href="#md7efaf3aec" x="420.106904" y="318.131109" style="fill: #1f77b4; stroke: #1f77b4" />
4363
- <use ns4:href="#md7efaf3aec" x="450.756032" y="400.455853" style="fill: #1f77b4; stroke: #1f77b4" />
4364
- <use ns4:href="#md7efaf3aec" x="481.40516" y="400.197701" style="fill: #1f77b4; stroke: #1f77b4" />
4365
- <use ns4:href="#md7efaf3aec" x="512.054288" y="399.907221" style="fill: #1f77b4; stroke: #1f77b4" />
4366
- <use ns4:href="#md7efaf3aec" x="542.703416" y="399.860838" style="fill: #1f77b4; stroke: #1f77b4" />
4367
- <use ns4:href="#md7efaf3aec" x="573.352544" y="400.20754" style="fill: #1f77b4; stroke: #1f77b4" />
4368
- <use ns4:href="#md7efaf3aec" x="604.001672" y="400.567828" style="fill: #1f77b4; stroke: #1f77b4" />
4369
- <use ns4:href="#md7efaf3aec" x="634.6508" y="399.780722" style="fill: #1f77b4; stroke: #1f77b4" />
4370
- <use ns4:href="#md7efaf3aec" x="665.299928" y="400.403848" style="fill: #1f77b4; stroke: #1f77b4" />
4371
- <use ns4:href="#md7efaf3aec" x="695.949056" y="399.312675" style="fill: #1f77b4; stroke: #1f77b4" />
4372
- <use ns4:href="#md7efaf3aec" x="726.598184" y="400.328885" style="fill: #1f77b4; stroke: #1f77b4" />
4373
- <use ns4:href="#md7efaf3aec" x="757.247312" y="320.371082" style="fill: #1f77b4; stroke: #1f77b4" />
4374
  <use ns4:href="#md7efaf3aec" x="787.896439" y="44.888614" style="fill: #1f77b4; stroke: #1f77b4" />
4375
  </g>
4376
  </g>
4377
  <g id="series--torch-eager" class="series">
4378
- <path d="M 82.966497 358.443125 L 113.615625 339.472461 L 144.264753 340.171018 L 174.913881 340.100272 L 205.563009 340.151809 L 236.212137 340.65312 L 266.861265 340.953439 L 297.510393 340.452127 L 328.159521 340.709811 L 358.808648 340.704657 L 389.457776 340.901902 L 420.106904 336.056983 L 450.756032 340.784305 L 481.40516 339.590059 L 512.054288 339.749354 L 542.703416 339.036742 L 573.352544 338.708781 L 604.001672 339.515096 L 634.6508 339.566633 L 665.299928 340.438072 L 695.949056 339.247574 L 726.598184 340.442288 L 757.247312 333.026156 L 787.896439 137.089198 " clip-path="url(#p088c925177)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4379
  <defs>
4380
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4381
  </defs>
4382
  <g clip-path="url(#p088c925177)">
4383
- <use ns4:href="#m9b8c54d372" x="82.966497" y="358.443125" style="fill: #ff7f0e; stroke: #ff7f0e" />
4384
- <use ns4:href="#m9b8c54d372" x="113.615625" y="339.472461" style="fill: #ff7f0e; stroke: #ff7f0e" />
4385
- <use ns4:href="#m9b8c54d372" x="144.264753" y="340.171018" style="fill: #ff7f0e; stroke: #ff7f0e" />
4386
- <use ns4:href="#m9b8c54d372" x="174.913881" y="340.100272" style="fill: #ff7f0e; stroke: #ff7f0e" />
4387
- <use ns4:href="#m9b8c54d372" x="205.563009" y="340.151809" style="fill: #ff7f0e; stroke: #ff7f0e" />
4388
- <use ns4:href="#m9b8c54d372" x="236.212137" y="340.65312" style="fill: #ff7f0e; stroke: #ff7f0e" />
4389
- <use ns4:href="#m9b8c54d372" x="266.861265" y="340.953439" style="fill: #ff7f0e; stroke: #ff7f0e" />
4390
- <use ns4:href="#m9b8c54d372" x="297.510393" y="340.452127" style="fill: #ff7f0e; stroke: #ff7f0e" />
4391
- <use ns4:href="#m9b8c54d372" x="328.159521" y="340.709811" style="fill: #ff7f0e; stroke: #ff7f0e" />
4392
- <use ns4:href="#m9b8c54d372" x="358.808648" y="340.704657" style="fill: #ff7f0e; stroke: #ff7f0e" />
4393
- <use ns4:href="#m9b8c54d372" x="389.457776" y="340.901902" style="fill: #ff7f0e; stroke: #ff7f0e" />
4394
- <use ns4:href="#m9b8c54d372" x="420.106904" y="336.056983" style="fill: #ff7f0e; stroke: #ff7f0e" />
4395
- <use ns4:href="#m9b8c54d372" x="450.756032" y="340.784305" style="fill: #ff7f0e; stroke: #ff7f0e" />
4396
- <use ns4:href="#m9b8c54d372" x="481.40516" y="339.590059" style="fill: #ff7f0e; stroke: #ff7f0e" />
4397
- <use ns4:href="#m9b8c54d372" x="512.054288" y="339.749354" style="fill: #ff7f0e; stroke: #ff7f0e" />
4398
- <use ns4:href="#m9b8c54d372" x="542.703416" y="339.036742" style="fill: #ff7f0e; stroke: #ff7f0e" />
4399
- <use ns4:href="#m9b8c54d372" x="573.352544" y="338.708781" style="fill: #ff7f0e; stroke: #ff7f0e" />
4400
- <use ns4:href="#m9b8c54d372" x="604.001672" y="339.515096" style="fill: #ff7f0e; stroke: #ff7f0e" />
4401
- <use ns4:href="#m9b8c54d372" x="634.6508" y="339.566633" style="fill: #ff7f0e; stroke: #ff7f0e" />
4402
- <use ns4:href="#m9b8c54d372" x="665.299928" y="340.438072" style="fill: #ff7f0e; stroke: #ff7f0e" />
4403
- <use ns4:href="#m9b8c54d372" x="695.949056" y="339.247574" style="fill: #ff7f0e; stroke: #ff7f0e" />
4404
- <use ns4:href="#m9b8c54d372" x="726.598184" y="340.442288" style="fill: #ff7f0e; stroke: #ff7f0e" />
4405
- <use ns4:href="#m9b8c54d372" x="757.247312" y="333.026156" style="fill: #ff7f0e; stroke: #ff7f0e" />
4406
- <use ns4:href="#m9b8c54d372" x="787.896439" y="137.089198" style="fill: #ff7f0e; stroke: #ff7f0e" />
4407
  </g>
4408
  </g>
4409
  <g id="patch_3">
@@ -4461,7 +4461,7 @@ body[data-tool="eraser"] .main-content {
4461
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
4462
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
4463
  </span> |
4464
- Cell: combine | 4.85s
4465
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
4466
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
4467
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
@@ -4572,30 +4572,30 @@ hf_kernels_rotary cuda_B2_S512_H32_D128_R64 0.09 True
4572
  hf_kernels_rotary cuda_B2_S512_H32_D64_R32 0.09 True
4573
  hf_kernels_rotary cuda_B2_S512_H8_D128_R64 0.09 True
4574
  hf_kernels_rotary cuda_B2_S512_H8_D64_R32 0.09 True
4575
- torch_eager cuda_B1_S128_H32_D128_R64 0.22 True
4576
- torch_eager cuda_B1_S128_H32_D64_R32 0.22 True
4577
- torch_eager cuda_B1_S128_H8_D128_R64 0.22 True
4578
- torch_eager cuda_B1_S128_H8_D64_R32 0.18 True
4579
- torch_eager cuda_B1_S2048_H32_D128_R64 0.22 True
4580
- torch_eager cuda_B1_S2048_H32_D64_R32 0.21 True
4581
- torch_eager cuda_B1_S2048_H8_D128_R64 0.21 True
4582
- torch_eager cuda_B1_S2048_H8_D64_R32 0.21 True
4583
- torch_eager cuda_B1_S512_H32_D128_R64 0.21 True
4584
- torch_eager cuda_B1_S512_H32_D64_R32 0.21 True
4585
- torch_eager cuda_B1_S512_H8_D128_R64 0.21 True
4586
- torch_eager cuda_B1_S512_H8_D64_R32 0.22 True
4587
- torch_eager cuda_B2_S128_H32_D128_R64 0.22 True
4588
- torch_eager cuda_B2_S128_H32_D64_R32 0.22 True
4589
- torch_eager cuda_B2_S128_H8_D128_R64 0.22 True
4590
- torch_eager cuda_B2_S128_H8_D64_R32 0.21 True
4591
  torch_eager cuda_B2_S2048_H32_D128_R64 0.65 True
4592
- torch_eager cuda_B2_S2048_H32_D64_R32 0.23 True
4593
- torch_eager cuda_B2_S2048_H8_D128_R64 0.21 True
4594
- torch_eager cuda_B2_S2048_H8_D64_R32 0.22 True
4595
- torch_eager cuda_B2_S512_H32_D128_R64 0.21 True
4596
- torch_eager cuda_B2_S512_H32_D64_R32 0.22 True
4597
- torch_eager cuda_B2_S512_H8_D128_R64 0.22 True
4598
- torch_eager cuda_B2_S512_H8_D64_R32 0.22 True
4599
 
4600
  GENERATING COMBINED VISUALIZATION
4601
 
@@ -4615,7 +4615,7 @@ Implementations included:
4615
  <div class="uv-install-logs" id="uv-logs-combine">
4616
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4617
  <div class="uv-logs-content" style="display: none;">
4618
- Installed 37 packages in 330ms
4619
  </div>
4620
  </div>
4621
  <div class="cell-artifacts">
@@ -4628,7 +4628,7 @@ Installed 37 packages in 330ms
4628
  <rdf:RDF>
4629
  <ns2:Work>
4630
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4631
- <dc:date>2025-12-19T19:09:41.164726</dc:date>
4632
  <dc:format>image/svg+xml</dc:format>
4633
  <dc:creator>
4634
  <ns2:Agent>
@@ -4972,109 +4972,109 @@ Installed 37 packages in 330ms
4972
  <g id="matplotlib.axis_2">
4973
  <g id="ytick_1">
4974
  <g id="grid-y--2" class="grid grid-y">
4975
- <path d="M 47.72 394.065769 L 823.142937 394.065769 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4976
  </g>
4977
  <g id="line2d_25">
4978
  <defs>
4979
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4980
  </defs>
4981
  <g>
4982
- <use ns4:href="#m0fca2865ba" x="47.72" y="394.065769" style="stroke: #000000; stroke-width: 0.8" />
4983
  </g>
4984
  </g>
4985
  <g id="text_25">
4986
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="397.864988" transform="rotate(-0 40.72 397.864988)">0.1</text>
4987
  </g>
4988
  </g>
4989
  <g id="ytick_2">
4990
  <g id="grid-y--3" class="grid grid-y">
4991
- <path d="M 47.72 347.214212 L 823.142937 347.214212 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4992
  </g>
4993
  <g id="line2d_26">
4994
  <g>
4995
- <use ns4:href="#m0fca2865ba" x="47.72" y="347.214212" style="stroke: #000000; stroke-width: 0.8" />
4996
  </g>
4997
  </g>
4998
  <g id="text_26">
4999
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="351.013431" transform="rotate(-0 40.72 351.013431)">0.2</text>
5000
  </g>
5001
  </g>
5002
  <g id="ytick_3">
5003
  <g id="grid-y--4" class="grid grid-y">
5004
- <path d="M 47.72 300.362656 L 823.142937 300.362656 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
5005
  </g>
5006
  <g id="line2d_27">
5007
  <g>
5008
- <use ns4:href="#m0fca2865ba" x="47.72" y="300.362656" style="stroke: #000000; stroke-width: 0.8" />
5009
  </g>
5010
  </g>
5011
  <g id="text_27">
5012
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="304.161875" transform="rotate(-0 40.72 304.161875)">0.3</text>
5013
  </g>
5014
  </g>
5015
  <g id="ytick_4">
5016
  <g id="grid-y--5" class="grid grid-y">
5017
- <path d="M 47.72 253.511099 L 823.142937 253.511099 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
5018
  </g>
5019
  <g id="line2d_28">
5020
  <g>
5021
- <use ns4:href="#m0fca2865ba" x="47.72" y="253.511099" style="stroke: #000000; stroke-width: 0.8" />
5022
  </g>
5023
  </g>
5024
  <g id="text_28">
5025
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="257.310318" transform="rotate(-0 40.72 257.310318)">0.4</text>
5026
  </g>
5027
  </g>
5028
  <g id="ytick_5">
5029
  <g id="grid-y--6" class="grid grid-y">
5030
- <path d="M 47.72 206.659543 L 823.142937 206.659543 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
5031
  </g>
5032
  <g id="line2d_29">
5033
  <g>
5034
- <use ns4:href="#m0fca2865ba" x="47.72" y="206.659543" style="stroke: #000000; stroke-width: 0.8" />
5035
  </g>
5036
  </g>
5037
  <g id="text_29">
5038
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="210.458761" transform="rotate(-0 40.72 210.458761)">0.5</text>
5039
  </g>
5040
  </g>
5041
  <g id="ytick_6">
5042
  <g id="grid-y--7" class="grid grid-y">
5043
- <path d="M 47.72 159.807986 L 823.142937 159.807986 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
5044
  </g>
5045
  <g id="line2d_30">
5046
  <g>
5047
- <use ns4:href="#m0fca2865ba" x="47.72" y="159.807986" style="stroke: #000000; stroke-width: 0.8" />
5048
  </g>
5049
  </g>
5050
  <g id="text_30">
5051
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="163.607205" transform="rotate(-0 40.72 163.607205)">0.6</text>
5052
  </g>
5053
  </g>
5054
  <g id="ytick_7">
5055
  <g id="grid-y--8" class="grid grid-y">
5056
- <path d="M 47.72 112.956429 L 823.142937 112.956429 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
5057
  </g>
5058
  <g id="line2d_31">
5059
  <g>
5060
- <use ns4:href="#m0fca2865ba" x="47.72" y="112.956429" style="stroke: #000000; stroke-width: 0.8" />
5061
  </g>
5062
  </g>
5063
  <g id="text_31">
5064
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="116.755648" transform="rotate(-0 40.72 116.755648)">0.7</text>
5065
  </g>
5066
  </g>
5067
  <g id="ytick_8">
5068
  <g id="grid-y--9" class="grid grid-y">
5069
- <path d="M 47.72 66.104873 L 823.142937 66.104873 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
5070
  </g>
5071
  <g id="line2d_32">
5072
  <g>
5073
- <use ns4:href="#m0fca2865ba" x="47.72" y="66.104873" style="stroke: #000000; stroke-width: 0.8" />
5074
  </g>
5075
  </g>
5076
  <g id="text_32">
5077
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="69.904092" transform="rotate(-0 40.72 69.904092)">0.8</text>
5078
  </g>
5079
  </g>
5080
  <g id="label--y" class="ylabel">
@@ -5082,67 +5082,67 @@ Installed 37 packages in 330ms
5082
  </g>
5083
  </g>
5084
  <g id="series--hf-kernels-rotary" class="series">
5085
- <path d="M 82.966497 405.060892 L 113.615625 399.43402 L 144.264753 400.029504 L 174.913881 399.696858 L 205.563009 399.059208 L 236.212137 400.038874 L 266.861265 399.640636 L 297.510393 400.038405 L 328.159521 400.006078 L 358.808648 399.691704 L 389.457776 399.640167 L 420.106904 318.131109 L 450.756032 400.455853 L 481.40516 400.197701 L 512.054288 399.907221 L 542.703416 399.860838 L 573.352544 400.20754 L 604.001672 400.567828 L 634.6508 399.780722 L 665.299928 400.403848 L 695.949056 399.312675 L 726.598184 400.328885 L 757.247312 320.371082 L 787.896439 44.888614 " clip-path="url(#p088c925177)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
5086
  <defs>
5087
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
5088
  </defs>
5089
  <g clip-path="url(#p088c925177)">
5090
  <use ns4:href="#md7efaf3aec" x="82.966497" y="405.060892" style="fill: #1f77b4; stroke: #1f77b4" />
5091
- <use ns4:href="#md7efaf3aec" x="113.615625" y="399.43402" style="fill: #1f77b4; stroke: #1f77b4" />
5092
- <use ns4:href="#md7efaf3aec" x="144.264753" y="400.029504" style="fill: #1f77b4; stroke: #1f77b4" />
5093
- <use ns4:href="#md7efaf3aec" x="174.913881" y="399.696858" style="fill: #1f77b4; stroke: #1f77b4" />
5094
- <use ns4:href="#md7efaf3aec" x="205.563009" y="399.059208" style="fill: #1f77b4; stroke: #1f77b4" />
5095
- <use ns4:href="#md7efaf3aec" x="236.212137" y="400.038874" style="fill: #1f77b4; stroke: #1f77b4" />
5096
- <use ns4:href="#md7efaf3aec" x="266.861265" y="399.640636" style="fill: #1f77b4; stroke: #1f77b4" />
5097
- <use ns4:href="#md7efaf3aec" x="297.510393" y="400.038405" style="fill: #1f77b4; stroke: #1f77b4" />
5098
- <use ns4:href="#md7efaf3aec" x="328.159521" y="400.006078" style="fill: #1f77b4; stroke: #1f77b4" />
5099
- <use ns4:href="#md7efaf3aec" x="358.808648" y="399.691704" style="fill: #1f77b4; stroke: #1f77b4" />
5100
- <use ns4:href="#md7efaf3aec" x="389.457776" y="399.640167" style="fill: #1f77b4; stroke: #1f77b4" />
5101
- <use ns4:href="#md7efaf3aec" x="420.106904" y="318.131109" style="fill: #1f77b4; stroke: #1f77b4" />
5102
- <use ns4:href="#md7efaf3aec" x="450.756032" y="400.455853" style="fill: #1f77b4; stroke: #1f77b4" />
5103
- <use ns4:href="#md7efaf3aec" x="481.40516" y="400.197701" style="fill: #1f77b4; stroke: #1f77b4" />
5104
- <use ns4:href="#md7efaf3aec" x="512.054288" y="399.907221" style="fill: #1f77b4; stroke: #1f77b4" />
5105
- <use ns4:href="#md7efaf3aec" x="542.703416" y="399.860838" style="fill: #1f77b4; stroke: #1f77b4" />
5106
- <use ns4:href="#md7efaf3aec" x="573.352544" y="400.20754" style="fill: #1f77b4; stroke: #1f77b4" />
5107
- <use ns4:href="#md7efaf3aec" x="604.001672" y="400.567828" style="fill: #1f77b4; stroke: #1f77b4" />
5108
- <use ns4:href="#md7efaf3aec" x="634.6508" y="399.780722" style="fill: #1f77b4; stroke: #1f77b4" />
5109
- <use ns4:href="#md7efaf3aec" x="665.299928" y="400.403848" style="fill: #1f77b4; stroke: #1f77b4" />
5110
- <use ns4:href="#md7efaf3aec" x="695.949056" y="399.312675" style="fill: #1f77b4; stroke: #1f77b4" />
5111
- <use ns4:href="#md7efaf3aec" x="726.598184" y="400.328885" style="fill: #1f77b4; stroke: #1f77b4" />
5112
- <use ns4:href="#md7efaf3aec" x="757.247312" y="320.371082" style="fill: #1f77b4; stroke: #1f77b4" />
5113
  <use ns4:href="#md7efaf3aec" x="787.896439" y="44.888614" style="fill: #1f77b4; stroke: #1f77b4" />
5114
  </g>
5115
  </g>
5116
  <g id="series--torch-eager" class="series">
5117
- <path d="M 82.966497 358.443125 L 113.615625 339.472461 L 144.264753 340.171018 L 174.913881 340.100272 L 205.563009 340.151809 L 236.212137 340.65312 L 266.861265 340.953439 L 297.510393 340.452127 L 328.159521 340.709811 L 358.808648 340.704657 L 389.457776 340.901902 L 420.106904 336.056983 L 450.756032 340.784305 L 481.40516 339.590059 L 512.054288 339.749354 L 542.703416 339.036742 L 573.352544 338.708781 L 604.001672 339.515096 L 634.6508 339.566633 L 665.299928 340.438072 L 695.949056 339.247574 L 726.598184 340.442288 L 757.247312 333.026156 L 787.896439 137.089198 " clip-path="url(#p088c925177)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
5118
  <defs>
5119
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
5120
  </defs>
5121
  <g clip-path="url(#p088c925177)">
5122
- <use ns4:href="#m9b8c54d372" x="82.966497" y="358.443125" style="fill: #ff7f0e; stroke: #ff7f0e" />
5123
- <use ns4:href="#m9b8c54d372" x="113.615625" y="339.472461" style="fill: #ff7f0e; stroke: #ff7f0e" />
5124
- <use ns4:href="#m9b8c54d372" x="144.264753" y="340.171018" style="fill: #ff7f0e; stroke: #ff7f0e" />
5125
- <use ns4:href="#m9b8c54d372" x="174.913881" y="340.100272" style="fill: #ff7f0e; stroke: #ff7f0e" />
5126
- <use ns4:href="#m9b8c54d372" x="205.563009" y="340.151809" style="fill: #ff7f0e; stroke: #ff7f0e" />
5127
- <use ns4:href="#m9b8c54d372" x="236.212137" y="340.65312" style="fill: #ff7f0e; stroke: #ff7f0e" />
5128
- <use ns4:href="#m9b8c54d372" x="266.861265" y="340.953439" style="fill: #ff7f0e; stroke: #ff7f0e" />
5129
- <use ns4:href="#m9b8c54d372" x="297.510393" y="340.452127" style="fill: #ff7f0e; stroke: #ff7f0e" />
5130
- <use ns4:href="#m9b8c54d372" x="328.159521" y="340.709811" style="fill: #ff7f0e; stroke: #ff7f0e" />
5131
- <use ns4:href="#m9b8c54d372" x="358.808648" y="340.704657" style="fill: #ff7f0e; stroke: #ff7f0e" />
5132
- <use ns4:href="#m9b8c54d372" x="389.457776" y="340.901902" style="fill: #ff7f0e; stroke: #ff7f0e" />
5133
- <use ns4:href="#m9b8c54d372" x="420.106904" y="336.056983" style="fill: #ff7f0e; stroke: #ff7f0e" />
5134
- <use ns4:href="#m9b8c54d372" x="450.756032" y="340.784305" style="fill: #ff7f0e; stroke: #ff7f0e" />
5135
- <use ns4:href="#m9b8c54d372" x="481.40516" y="339.590059" style="fill: #ff7f0e; stroke: #ff7f0e" />
5136
- <use ns4:href="#m9b8c54d372" x="512.054288" y="339.749354" style="fill: #ff7f0e; stroke: #ff7f0e" />
5137
- <use ns4:href="#m9b8c54d372" x="542.703416" y="339.036742" style="fill: #ff7f0e; stroke: #ff7f0e" />
5138
- <use ns4:href="#m9b8c54d372" x="573.352544" y="338.708781" style="fill: #ff7f0e; stroke: #ff7f0e" />
5139
- <use ns4:href="#m9b8c54d372" x="604.001672" y="339.515096" style="fill: #ff7f0e; stroke: #ff7f0e" />
5140
- <use ns4:href="#m9b8c54d372" x="634.6508" y="339.566633" style="fill: #ff7f0e; stroke: #ff7f0e" />
5141
- <use ns4:href="#m9b8c54d372" x="665.299928" y="340.438072" style="fill: #ff7f0e; stroke: #ff7f0e" />
5142
- <use ns4:href="#m9b8c54d372" x="695.949056" y="339.247574" style="fill: #ff7f0e; stroke: #ff7f0e" />
5143
- <use ns4:href="#m9b8c54d372" x="726.598184" y="340.442288" style="fill: #ff7f0e; stroke: #ff7f0e" />
5144
- <use ns4:href="#m9b8c54d372" x="757.247312" y="333.026156" style="fill: #ff7f0e; stroke: #ff7f0e" />
5145
- <use ns4:href="#m9b8c54d372" x="787.896439" y="137.089198" style="fill: #ff7f0e; stroke: #ff7f0e" />
5146
  </g>
5147
  </g>
5148
  <g id="patch_3">
 
3889
  <rdf:RDF>
3890
  <ns2:Work>
3891
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
3892
+ <dc:date>2025-12-19T19:55:53.341578</dc:date>
3893
  <dc:format>image/svg+xml</dc:format>
3894
  <dc:creator>
3895
  <ns2:Agent>
 
4233
  <g id="matplotlib.axis_2">
4234
  <g id="ytick_1">
4235
  <g id="grid-y--2" class="grid grid-y">
4236
+ <path d="M 47.72 393.837238 L 823.142937 393.837238 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4237
  </g>
4238
  <g id="line2d_25">
4239
  <defs>
4240
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4241
  </defs>
4242
  <g>
4243
+ <use ns4:href="#m0fca2865ba" x="47.72" y="393.837238" style="stroke: #000000; stroke-width: 0.8" />
4244
  </g>
4245
  </g>
4246
  <g id="text_25">
4247
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="397.636457" transform="rotate(-0 40.72 397.636457)">0.1</text>
4248
  </g>
4249
  </g>
4250
  <g id="ytick_2">
4251
  <g id="grid-y--3" class="grid grid-y">
4252
+ <path d="M 47.72 347.283443 L 823.142937 347.283443 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4253
  </g>
4254
  <g id="line2d_26">
4255
  <g>
4256
+ <use ns4:href="#m0fca2865ba" x="47.72" y="347.283443" style="stroke: #000000; stroke-width: 0.8" />
4257
  </g>
4258
  </g>
4259
  <g id="text_26">
4260
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="351.082662" transform="rotate(-0 40.72 351.082662)">0.2</text>
4261
  </g>
4262
  </g>
4263
  <g id="ytick_3">
4264
  <g id="grid-y--4" class="grid grid-y">
4265
+ <path d="M 47.72 300.729648 L 823.142937 300.729648 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4266
  </g>
4267
  <g id="line2d_27">
4268
  <g>
4269
+ <use ns4:href="#m0fca2865ba" x="47.72" y="300.729648" style="stroke: #000000; stroke-width: 0.8" />
4270
  </g>
4271
  </g>
4272
  <g id="text_27">
4273
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="304.528867" transform="rotate(-0 40.72 304.528867)">0.3</text>
4274
  </g>
4275
  </g>
4276
  <g id="ytick_4">
4277
  <g id="grid-y--5" class="grid grid-y">
4278
+ <path d="M 47.72 254.175854 L 823.142937 254.175854 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4279
  </g>
4280
  <g id="line2d_28">
4281
  <g>
4282
+ <use ns4:href="#m0fca2865ba" x="47.72" y="254.175854" style="stroke: #000000; stroke-width: 0.8" />
4283
  </g>
4284
  </g>
4285
  <g id="text_28">
4286
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="257.975072" transform="rotate(-0 40.72 257.975072)">0.4</text>
4287
  </g>
4288
  </g>
4289
  <g id="ytick_5">
4290
  <g id="grid-y--6" class="grid grid-y">
4291
+ <path d="M 47.72 207.622059 L 823.142937 207.622059 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4292
  </g>
4293
  <g id="line2d_29">
4294
  <g>
4295
+ <use ns4:href="#m0fca2865ba" x="47.72" y="207.622059" style="stroke: #000000; stroke-width: 0.8" />
4296
  </g>
4297
  </g>
4298
  <g id="text_29">
4299
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="211.421278" transform="rotate(-0 40.72 211.421278)">0.5</text>
4300
  </g>
4301
  </g>
4302
  <g id="ytick_6">
4303
  <g id="grid-y--7" class="grid grid-y">
4304
+ <path d="M 47.72 161.068264 L 823.142937 161.068264 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4305
  </g>
4306
  <g id="line2d_30">
4307
  <g>
4308
+ <use ns4:href="#m0fca2865ba" x="47.72" y="161.068264" style="stroke: #000000; stroke-width: 0.8" />
4309
  </g>
4310
  </g>
4311
  <g id="text_30">
4312
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="164.867483" transform="rotate(-0 40.72 164.867483)">0.6</text>
4313
  </g>
4314
  </g>
4315
  <g id="ytick_7">
4316
  <g id="grid-y--8" class="grid grid-y">
4317
+ <path d="M 47.72 114.514469 L 823.142937 114.514469 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4318
  </g>
4319
  <g id="line2d_31">
4320
  <g>
4321
+ <use ns4:href="#m0fca2865ba" x="47.72" y="114.514469" style="stroke: #000000; stroke-width: 0.8" />
4322
  </g>
4323
  </g>
4324
  <g id="text_31">
4325
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="118.313688" transform="rotate(-0 40.72 118.313688)">0.7</text>
4326
  </g>
4327
  </g>
4328
  <g id="ytick_8">
4329
  <g id="grid-y--9" class="grid grid-y">
4330
+ <path d="M 47.72 67.960675 L 823.142937 67.960675 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4331
  </g>
4332
  <g id="line2d_32">
4333
  <g>
4334
+ <use ns4:href="#m0fca2865ba" x="47.72" y="67.960675" style="stroke: #000000; stroke-width: 0.8" />
4335
  </g>
4336
  </g>
4337
  <g id="text_32">
4338
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="71.759893" transform="rotate(-0 40.72 71.759893)">0.8</text>
4339
  </g>
4340
  </g>
4341
  <g id="label--y" class="ylabel">
 
4343
  </g>
4344
  </g>
4345
  <g id="series--hf-kernels-rotary" class="series">
4346
+ <path d="M 82.966497 405.060892 L 113.615625 397.68165 L 144.264753 397.020121 L 174.913881 398.835719 L 205.563009 398.398113 L 236.212137 398.943258 L 266.861265 398.496342 L 297.510393 398.663935 L 328.159521 398.724455 L 358.808648 398.198398 L 389.457776 398.119256 L 420.106904 318.445229 L 450.756032 398.947914 L 481.40516 398.854806 L 512.054288 398.868772 L 542.703416 398.761698 L 573.352544 398.221674 L 604.001672 398.994002 L 634.6508 398.915326 L 665.299928 399.194649 L 695.949056 398.459099 L 726.598184 398.882738 L 757.247312 320.260827 L 787.896439 44.888614 " clip-path="url(#p088c925177)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4347
  <defs>
4348
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4349
  </defs>
4350
  <g clip-path="url(#p088c925177)">
4351
  <use ns4:href="#md7efaf3aec" x="82.966497" y="405.060892" style="fill: #1f77b4; stroke: #1f77b4" />
4352
+ <use ns4:href="#md7efaf3aec" x="113.615625" y="397.68165" style="fill: #1f77b4; stroke: #1f77b4" />
4353
+ <use ns4:href="#md7efaf3aec" x="144.264753" y="397.020121" style="fill: #1f77b4; stroke: #1f77b4" />
4354
+ <use ns4:href="#md7efaf3aec" x="174.913881" y="398.835719" style="fill: #1f77b4; stroke: #1f77b4" />
4355
+ <use ns4:href="#md7efaf3aec" x="205.563009" y="398.398113" style="fill: #1f77b4; stroke: #1f77b4" />
4356
+ <use ns4:href="#md7efaf3aec" x="236.212137" y="398.943258" style="fill: #1f77b4; stroke: #1f77b4" />
4357
+ <use ns4:href="#md7efaf3aec" x="266.861265" y="398.496342" style="fill: #1f77b4; stroke: #1f77b4" />
4358
+ <use ns4:href="#md7efaf3aec" x="297.510393" y="398.663935" style="fill: #1f77b4; stroke: #1f77b4" />
4359
+ <use ns4:href="#md7efaf3aec" x="328.159521" y="398.724455" style="fill: #1f77b4; stroke: #1f77b4" />
4360
+ <use ns4:href="#md7efaf3aec" x="358.808648" y="398.198398" style="fill: #1f77b4; stroke: #1f77b4" />
4361
+ <use ns4:href="#md7efaf3aec" x="389.457776" y="398.119256" style="fill: #1f77b4; stroke: #1f77b4" />
4362
+ <use ns4:href="#md7efaf3aec" x="420.106904" y="318.445229" style="fill: #1f77b4; stroke: #1f77b4" />
4363
+ <use ns4:href="#md7efaf3aec" x="450.756032" y="398.947914" style="fill: #1f77b4; stroke: #1f77b4" />
4364
+ <use ns4:href="#md7efaf3aec" x="481.40516" y="398.854806" style="fill: #1f77b4; stroke: #1f77b4" />
4365
+ <use ns4:href="#md7efaf3aec" x="512.054288" y="398.868772" style="fill: #1f77b4; stroke: #1f77b4" />
4366
+ <use ns4:href="#md7efaf3aec" x="542.703416" y="398.761698" style="fill: #1f77b4; stroke: #1f77b4" />
4367
+ <use ns4:href="#md7efaf3aec" x="573.352544" y="398.221674" style="fill: #1f77b4; stroke: #1f77b4" />
4368
+ <use ns4:href="#md7efaf3aec" x="604.001672" y="398.994002" style="fill: #1f77b4; stroke: #1f77b4" />
4369
+ <use ns4:href="#md7efaf3aec" x="634.6508" y="398.915326" style="fill: #1f77b4; stroke: #1f77b4" />
4370
+ <use ns4:href="#md7efaf3aec" x="665.299928" y="399.194649" style="fill: #1f77b4; stroke: #1f77b4" />
4371
+ <use ns4:href="#md7efaf3aec" x="695.949056" y="398.459099" style="fill: #1f77b4; stroke: #1f77b4" />
4372
+ <use ns4:href="#md7efaf3aec" x="726.598184" y="398.882738" style="fill: #1f77b4; stroke: #1f77b4" />
4373
+ <use ns4:href="#md7efaf3aec" x="757.247312" y="320.260827" style="fill: #1f77b4; stroke: #1f77b4" />
4374
  <use ns4:href="#md7efaf3aec" x="787.896439" y="44.888614" style="fill: #1f77b4; stroke: #1f77b4" />
4375
  </g>
4376
  </g>
4377
  <g id="series--torch-eager" class="series">
4378
+ <path d="M 82.966497 349.953303 L 113.615625 322.365059 L 144.264753 325.516751 L 174.913881 315.800508 L 205.563009 324.962761 L 236.212137 326.731805 L 266.861265 326.587488 L 297.510393 327.849096 L 328.159521 325.428299 L 358.808648 326.648008 L 389.457776 326.024187 L 420.106904 324.492102 L 450.756032 326.825378 L 481.40516 325.460886 L 512.054288 325.069369 L 542.703416 326.410584 L 573.352544 329.236399 L 604.001672 326.769048 L 634.6508 326.391962 L 665.299928 325.102422 L 695.949056 327.513909 L 726.598184 325.782107 L 757.247312 325.730898 L 787.896439 139.436578 " clip-path="url(#p088c925177)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4379
  <defs>
4380
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4381
  </defs>
4382
  <g clip-path="url(#p088c925177)">
4383
+ <use ns4:href="#m9b8c54d372" x="82.966497" y="349.953303" style="fill: #ff7f0e; stroke: #ff7f0e" />
4384
+ <use ns4:href="#m9b8c54d372" x="113.615625" y="322.365059" style="fill: #ff7f0e; stroke: #ff7f0e" />
4385
+ <use ns4:href="#m9b8c54d372" x="144.264753" y="325.516751" style="fill: #ff7f0e; stroke: #ff7f0e" />
4386
+ <use ns4:href="#m9b8c54d372" x="174.913881" y="315.800508" style="fill: #ff7f0e; stroke: #ff7f0e" />
4387
+ <use ns4:href="#m9b8c54d372" x="205.563009" y="324.962761" style="fill: #ff7f0e; stroke: #ff7f0e" />
4388
+ <use ns4:href="#m9b8c54d372" x="236.212137" y="326.731805" style="fill: #ff7f0e; stroke: #ff7f0e" />
4389
+ <use ns4:href="#m9b8c54d372" x="266.861265" y="326.587488" style="fill: #ff7f0e; stroke: #ff7f0e" />
4390
+ <use ns4:href="#m9b8c54d372" x="297.510393" y="327.849096" style="fill: #ff7f0e; stroke: #ff7f0e" />
4391
+ <use ns4:href="#m9b8c54d372" x="328.159521" y="325.428299" style="fill: #ff7f0e; stroke: #ff7f0e" />
4392
+ <use ns4:href="#m9b8c54d372" x="358.808648" y="326.648008" style="fill: #ff7f0e; stroke: #ff7f0e" />
4393
+ <use ns4:href="#m9b8c54d372" x="389.457776" y="326.024187" style="fill: #ff7f0e; stroke: #ff7f0e" />
4394
+ <use ns4:href="#m9b8c54d372" x="420.106904" y="324.492102" style="fill: #ff7f0e; stroke: #ff7f0e" />
4395
+ <use ns4:href="#m9b8c54d372" x="450.756032" y="326.825378" style="fill: #ff7f0e; stroke: #ff7f0e" />
4396
+ <use ns4:href="#m9b8c54d372" x="481.40516" y="325.460886" style="fill: #ff7f0e; stroke: #ff7f0e" />
4397
+ <use ns4:href="#m9b8c54d372" x="512.054288" y="325.069369" style="fill: #ff7f0e; stroke: #ff7f0e" />
4398
+ <use ns4:href="#m9b8c54d372" x="542.703416" y="326.410584" style="fill: #ff7f0e; stroke: #ff7f0e" />
4399
+ <use ns4:href="#m9b8c54d372" x="573.352544" y="329.236399" style="fill: #ff7f0e; stroke: #ff7f0e" />
4400
+ <use ns4:href="#m9b8c54d372" x="604.001672" y="326.769048" style="fill: #ff7f0e; stroke: #ff7f0e" />
4401
+ <use ns4:href="#m9b8c54d372" x="634.6508" y="326.391962" style="fill: #ff7f0e; stroke: #ff7f0e" />
4402
+ <use ns4:href="#m9b8c54d372" x="665.299928" y="325.102422" style="fill: #ff7f0e; stroke: #ff7f0e" />
4403
+ <use ns4:href="#m9b8c54d372" x="695.949056" y="327.513909" style="fill: #ff7f0e; stroke: #ff7f0e" />
4404
+ <use ns4:href="#m9b8c54d372" x="726.598184" y="325.782107" style="fill: #ff7f0e; stroke: #ff7f0e" />
4405
+ <use ns4:href="#m9b8c54d372" x="757.247312" y="325.730898" style="fill: #ff7f0e; stroke: #ff7f0e" />
4406
+ <use ns4:href="#m9b8c54d372" x="787.896439" y="139.436578" style="fill: #ff7f0e; stroke: #ff7f0e" />
4407
  </g>
4408
  </g>
4409
  <g id="patch_3">
 
4461
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
4462
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
4463
  </span> |
4464
+ Cell: combine | 4.98s
4465
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
4466
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
4467
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
 
4572
  hf_kernels_rotary cuda_B2_S512_H32_D64_R32 0.09 True
4573
  hf_kernels_rotary cuda_B2_S512_H8_D128_R64 0.09 True
4574
  hf_kernels_rotary cuda_B2_S512_H8_D64_R32 0.09 True
4575
+ torch_eager cuda_B1_S128_H32_D128_R64 0.27 True
4576
+ torch_eager cuda_B1_S128_H32_D64_R32 0.25 True
4577
+ torch_eager cuda_B1_S128_H8_D128_R64 0.25 True
4578
+ torch_eager cuda_B1_S128_H8_D64_R32 0.19 True
4579
+ torch_eager cuda_B1_S2048_H32_D128_R64 0.25 True
4580
+ torch_eager cuda_B1_S2048_H32_D64_R32 0.25 True
4581
+ torch_eager cuda_B1_S2048_H8_D128_R64 0.24 True
4582
+ torch_eager cuda_B1_S2048_H8_D64_R32 0.25 True
4583
+ torch_eager cuda_B1_S512_H32_D128_R64 0.24 True
4584
+ torch_eager cuda_B1_S512_H32_D64_R32 0.24 True
4585
+ torch_eager cuda_B1_S512_H8_D128_R64 0.24 True
4586
+ torch_eager cuda_B1_S512_H8_D64_R32 0.25 True
4587
+ torch_eager cuda_B2_S128_H32_D128_R64 0.24 True
4588
+ torch_eager cuda_B2_S128_H32_D64_R32 0.25 True
4589
+ torch_eager cuda_B2_S128_H8_D128_R64 0.25 True
4590
+ torch_eager cuda_B2_S128_H8_D64_R32 0.24 True
4591
  torch_eager cuda_B2_S2048_H32_D128_R64 0.65 True
4592
+ torch_eager cuda_B2_S2048_H32_D64_R32 0.25 True
4593
+ torch_eager cuda_B2_S2048_H8_D128_R64 0.25 True
4594
+ torch_eager cuda_B2_S2048_H8_D64_R32 0.24 True
4595
+ torch_eager cuda_B2_S512_H32_D128_R64 0.25 True
4596
+ torch_eager cuda_B2_S512_H32_D64_R32 0.24 True
4597
+ torch_eager cuda_B2_S512_H8_D128_R64 0.24 True
4598
+ torch_eager cuda_B2_S512_H8_D64_R32 0.24 True
4599
 
4600
  GENERATING COMBINED VISUALIZATION
4601
 
 
4615
  <div class="uv-install-logs" id="uv-logs-combine">
4616
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4617
  <div class="uv-logs-content" style="display: none;">
4618
+ Installed 37 packages in 314ms
4619
  </div>
4620
  </div>
4621
  <div class="cell-artifacts">
 
4628
  <rdf:RDF>
4629
  <ns2:Work>
4630
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4631
+ <dc:date>2025-12-19T19:55:53.341578</dc:date>
4632
  <dc:format>image/svg+xml</dc:format>
4633
  <dc:creator>
4634
  <ns2:Agent>
 
4972
  <g id="matplotlib.axis_2">
4973
  <g id="ytick_1">
4974
  <g id="grid-y--2" class="grid grid-y">
4975
+ <path d="M 47.72 393.837238 L 823.142937 393.837238 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4976
  </g>
4977
  <g id="line2d_25">
4978
  <defs>
4979
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4980
  </defs>
4981
  <g>
4982
+ <use ns4:href="#m0fca2865ba" x="47.72" y="393.837238" style="stroke: #000000; stroke-width: 0.8" />
4983
  </g>
4984
  </g>
4985
  <g id="text_25">
4986
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="397.636457" transform="rotate(-0 40.72 397.636457)">0.1</text>
4987
  </g>
4988
  </g>
4989
  <g id="ytick_2">
4990
  <g id="grid-y--3" class="grid grid-y">
4991
+ <path d="M 47.72 347.283443 L 823.142937 347.283443 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4992
  </g>
4993
  <g id="line2d_26">
4994
  <g>
4995
+ <use ns4:href="#m0fca2865ba" x="47.72" y="347.283443" style="stroke: #000000; stroke-width: 0.8" />
4996
  </g>
4997
  </g>
4998
  <g id="text_26">
4999
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="351.082662" transform="rotate(-0 40.72 351.082662)">0.2</text>
5000
  </g>
5001
  </g>
5002
  <g id="ytick_3">
5003
  <g id="grid-y--4" class="grid grid-y">
5004
+ <path d="M 47.72 300.729648 L 823.142937 300.729648 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
5005
  </g>
5006
  <g id="line2d_27">
5007
  <g>
5008
+ <use ns4:href="#m0fca2865ba" x="47.72" y="300.729648" style="stroke: #000000; stroke-width: 0.8" />
5009
  </g>
5010
  </g>
5011
  <g id="text_27">
5012
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="304.528867" transform="rotate(-0 40.72 304.528867)">0.3</text>
5013
  </g>
5014
  </g>
5015
  <g id="ytick_4">
5016
  <g id="grid-y--5" class="grid grid-y">
5017
+ <path d="M 47.72 254.175854 L 823.142937 254.175854 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
5018
  </g>
5019
  <g id="line2d_28">
5020
  <g>
5021
+ <use ns4:href="#m0fca2865ba" x="47.72" y="254.175854" style="stroke: #000000; stroke-width: 0.8" />
5022
  </g>
5023
  </g>
5024
  <g id="text_28">
5025
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="257.975072" transform="rotate(-0 40.72 257.975072)">0.4</text>
5026
  </g>
5027
  </g>
5028
  <g id="ytick_5">
5029
  <g id="grid-y--6" class="grid grid-y">
5030
+ <path d="M 47.72 207.622059 L 823.142937 207.622059 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
5031
  </g>
5032
  <g id="line2d_29">
5033
  <g>
5034
+ <use ns4:href="#m0fca2865ba" x="47.72" y="207.622059" style="stroke: #000000; stroke-width: 0.8" />
5035
  </g>
5036
  </g>
5037
  <g id="text_29">
5038
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="211.421278" transform="rotate(-0 40.72 211.421278)">0.5</text>
5039
  </g>
5040
  </g>
5041
  <g id="ytick_6">
5042
  <g id="grid-y--7" class="grid grid-y">
5043
+ <path d="M 47.72 161.068264 L 823.142937 161.068264 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
5044
  </g>
5045
  <g id="line2d_30">
5046
  <g>
5047
+ <use ns4:href="#m0fca2865ba" x="47.72" y="161.068264" style="stroke: #000000; stroke-width: 0.8" />
5048
  </g>
5049
  </g>
5050
  <g id="text_30">
5051
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="164.867483" transform="rotate(-0 40.72 164.867483)">0.6</text>
5052
  </g>
5053
  </g>
5054
  <g id="ytick_7">
5055
  <g id="grid-y--8" class="grid grid-y">
5056
+ <path d="M 47.72 114.514469 L 823.142937 114.514469 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
5057
  </g>
5058
  <g id="line2d_31">
5059
  <g>
5060
+ <use ns4:href="#m0fca2865ba" x="47.72" y="114.514469" style="stroke: #000000; stroke-width: 0.8" />
5061
  </g>
5062
  </g>
5063
  <g id="text_31">
5064
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="118.313688" transform="rotate(-0 40.72 118.313688)">0.7</text>
5065
  </g>
5066
  </g>
5067
  <g id="ytick_8">
5068
  <g id="grid-y--9" class="grid grid-y">
5069
+ <path d="M 47.72 67.960675 L 823.142937 67.960675 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
5070
  </g>
5071
  <g id="line2d_32">
5072
  <g>
5073
+ <use ns4:href="#m0fca2865ba" x="47.72" y="67.960675" style="stroke: #000000; stroke-width: 0.8" />
5074
  </g>
5075
  </g>
5076
  <g id="text_32">
5077
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="71.759893" transform="rotate(-0 40.72 71.759893)">0.8</text>
5078
  </g>
5079
  </g>
5080
  <g id="label--y" class="ylabel">
 
5082
  </g>
5083
  </g>
5084
  <g id="series--hf-kernels-rotary" class="series">
5085
+ <path d="M 82.966497 405.060892 L 113.615625 397.68165 L 144.264753 397.020121 L 174.913881 398.835719 L 205.563009 398.398113 L 236.212137 398.943258 L 266.861265 398.496342 L 297.510393 398.663935 L 328.159521 398.724455 L 358.808648 398.198398 L 389.457776 398.119256 L 420.106904 318.445229 L 450.756032 398.947914 L 481.40516 398.854806 L 512.054288 398.868772 L 542.703416 398.761698 L 573.352544 398.221674 L 604.001672 398.994002 L 634.6508 398.915326 L 665.299928 399.194649 L 695.949056 398.459099 L 726.598184 398.882738 L 757.247312 320.260827 L 787.896439 44.888614 " clip-path="url(#p088c925177)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
5086
  <defs>
5087
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
5088
  </defs>
5089
  <g clip-path="url(#p088c925177)">
5090
  <use ns4:href="#md7efaf3aec" x="82.966497" y="405.060892" style="fill: #1f77b4; stroke: #1f77b4" />
5091
+ <use ns4:href="#md7efaf3aec" x="113.615625" y="397.68165" style="fill: #1f77b4; stroke: #1f77b4" />
5092
+ <use ns4:href="#md7efaf3aec" x="144.264753" y="397.020121" style="fill: #1f77b4; stroke: #1f77b4" />
5093
+ <use ns4:href="#md7efaf3aec" x="174.913881" y="398.835719" style="fill: #1f77b4; stroke: #1f77b4" />
5094
+ <use ns4:href="#md7efaf3aec" x="205.563009" y="398.398113" style="fill: #1f77b4; stroke: #1f77b4" />
5095
+ <use ns4:href="#md7efaf3aec" x="236.212137" y="398.943258" style="fill: #1f77b4; stroke: #1f77b4" />
5096
+ <use ns4:href="#md7efaf3aec" x="266.861265" y="398.496342" style="fill: #1f77b4; stroke: #1f77b4" />
5097
+ <use ns4:href="#md7efaf3aec" x="297.510393" y="398.663935" style="fill: #1f77b4; stroke: #1f77b4" />
5098
+ <use ns4:href="#md7efaf3aec" x="328.159521" y="398.724455" style="fill: #1f77b4; stroke: #1f77b4" />
5099
+ <use ns4:href="#md7efaf3aec" x="358.808648" y="398.198398" style="fill: #1f77b4; stroke: #1f77b4" />
5100
+ <use ns4:href="#md7efaf3aec" x="389.457776" y="398.119256" style="fill: #1f77b4; stroke: #1f77b4" />
5101
+ <use ns4:href="#md7efaf3aec" x="420.106904" y="318.445229" style="fill: #1f77b4; stroke: #1f77b4" />
5102
+ <use ns4:href="#md7efaf3aec" x="450.756032" y="398.947914" style="fill: #1f77b4; stroke: #1f77b4" />
5103
+ <use ns4:href="#md7efaf3aec" x="481.40516" y="398.854806" style="fill: #1f77b4; stroke: #1f77b4" />
5104
+ <use ns4:href="#md7efaf3aec" x="512.054288" y="398.868772" style="fill: #1f77b4; stroke: #1f77b4" />
5105
+ <use ns4:href="#md7efaf3aec" x="542.703416" y="398.761698" style="fill: #1f77b4; stroke: #1f77b4" />
5106
+ <use ns4:href="#md7efaf3aec" x="573.352544" y="398.221674" style="fill: #1f77b4; stroke: #1f77b4" />
5107
+ <use ns4:href="#md7efaf3aec" x="604.001672" y="398.994002" style="fill: #1f77b4; stroke: #1f77b4" />
5108
+ <use ns4:href="#md7efaf3aec" x="634.6508" y="398.915326" style="fill: #1f77b4; stroke: #1f77b4" />
5109
+ <use ns4:href="#md7efaf3aec" x="665.299928" y="399.194649" style="fill: #1f77b4; stroke: #1f77b4" />
5110
+ <use ns4:href="#md7efaf3aec" x="695.949056" y="398.459099" style="fill: #1f77b4; stroke: #1f77b4" />
5111
+ <use ns4:href="#md7efaf3aec" x="726.598184" y="398.882738" style="fill: #1f77b4; stroke: #1f77b4" />
5112
+ <use ns4:href="#md7efaf3aec" x="757.247312" y="320.260827" style="fill: #1f77b4; stroke: #1f77b4" />
5113
  <use ns4:href="#md7efaf3aec" x="787.896439" y="44.888614" style="fill: #1f77b4; stroke: #1f77b4" />
5114
  </g>
5115
  </g>
5116
  <g id="series--torch-eager" class="series">
5117
+ <path d="M 82.966497 349.953303 L 113.615625 322.365059 L 144.264753 325.516751 L 174.913881 315.800508 L 205.563009 324.962761 L 236.212137 326.731805 L 266.861265 326.587488 L 297.510393 327.849096 L 328.159521 325.428299 L 358.808648 326.648008 L 389.457776 326.024187 L 420.106904 324.492102 L 450.756032 326.825378 L 481.40516 325.460886 L 512.054288 325.069369 L 542.703416 326.410584 L 573.352544 329.236399 L 604.001672 326.769048 L 634.6508 326.391962 L 665.299928 325.102422 L 695.949056 327.513909 L 726.598184 325.782107 L 757.247312 325.730898 L 787.896439 139.436578 " clip-path="url(#p088c925177)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
5118
  <defs>
5119
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
5120
  </defs>
5121
  <g clip-path="url(#p088c925177)">
5122
+ <use ns4:href="#m9b8c54d372" x="82.966497" y="349.953303" style="fill: #ff7f0e; stroke: #ff7f0e" />
5123
+ <use ns4:href="#m9b8c54d372" x="113.615625" y="322.365059" style="fill: #ff7f0e; stroke: #ff7f0e" />
5124
+ <use ns4:href="#m9b8c54d372" x="144.264753" y="325.516751" style="fill: #ff7f0e; stroke: #ff7f0e" />
5125
+ <use ns4:href="#m9b8c54d372" x="174.913881" y="315.800508" style="fill: #ff7f0e; stroke: #ff7f0e" />
5126
+ <use ns4:href="#m9b8c54d372" x="205.563009" y="324.962761" style="fill: #ff7f0e; stroke: #ff7f0e" />
5127
+ <use ns4:href="#m9b8c54d372" x="236.212137" y="326.731805" style="fill: #ff7f0e; stroke: #ff7f0e" />
5128
+ <use ns4:href="#m9b8c54d372" x="266.861265" y="326.587488" style="fill: #ff7f0e; stroke: #ff7f0e" />
5129
+ <use ns4:href="#m9b8c54d372" x="297.510393" y="327.849096" style="fill: #ff7f0e; stroke: #ff7f0e" />
5130
+ <use ns4:href="#m9b8c54d372" x="328.159521" y="325.428299" style="fill: #ff7f0e; stroke: #ff7f0e" />
5131
+ <use ns4:href="#m9b8c54d372" x="358.808648" y="326.648008" style="fill: #ff7f0e; stroke: #ff7f0e" />
5132
+ <use ns4:href="#m9b8c54d372" x="389.457776" y="326.024187" style="fill: #ff7f0e; stroke: #ff7f0e" />
5133
+ <use ns4:href="#m9b8c54d372" x="420.106904" y="324.492102" style="fill: #ff7f0e; stroke: #ff7f0e" />
5134
+ <use ns4:href="#m9b8c54d372" x="450.756032" y="326.825378" style="fill: #ff7f0e; stroke: #ff7f0e" />
5135
+ <use ns4:href="#m9b8c54d372" x="481.40516" y="325.460886" style="fill: #ff7f0e; stroke: #ff7f0e" />
5136
+ <use ns4:href="#m9b8c54d372" x="512.054288" y="325.069369" style="fill: #ff7f0e; stroke: #ff7f0e" />
5137
+ <use ns4:href="#m9b8c54d372" x="542.703416" y="326.410584" style="fill: #ff7f0e; stroke: #ff7f0e" />
5138
+ <use ns4:href="#m9b8c54d372" x="573.352544" y="329.236399" style="fill: #ff7f0e; stroke: #ff7f0e" />
5139
+ <use ns4:href="#m9b8c54d372" x="604.001672" y="326.769048" style="fill: #ff7f0e; stroke: #ff7f0e" />
5140
+ <use ns4:href="#m9b8c54d372" x="634.6508" y="326.391962" style="fill: #ff7f0e; stroke: #ff7f0e" />
5141
+ <use ns4:href="#m9b8c54d372" x="665.299928" y="325.102422" style="fill: #ff7f0e; stroke: #ff7f0e" />
5142
+ <use ns4:href="#m9b8c54d372" x="695.949056" y="327.513909" style="fill: #ff7f0e; stroke: #ff7f0e" />
5143
+ <use ns4:href="#m9b8c54d372" x="726.598184" y="325.782107" style="fill: #ff7f0e; stroke: #ff7f0e" />
5144
+ <use ns4:href="#m9b8c54d372" x="757.247312" y="325.730898" style="fill: #ff7f0e; stroke: #ff7f0e" />
5145
+ <use ns4:href="#m9b8c54d372" x="787.896439" y="139.436578" style="fill: #ff7f0e; stroke: #ff7f0e" />
5146
  </g>
5147
  </g>
5148
  <g id="patch_3">