Upload folder using huggingface_hub
- activation/impls/artifacts/benchmark/activation.jsonl +9 -9
- activation/impls/hf_kernels_swiglu.html +95 -94
- activation/impls/torch_swiglu.html +121 -121
- activation/results/artifacts/combine/latency.svg +2 -2
- activation/results/combined_results.html +93 -93
- causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl +24 -0
- causal_conv1d/impls/cells/benchmark.py +40 -0
- causal_conv1d/impls/cells/nv.py +2 -0
- causal_conv1d/impls/hf_kernels_causal_conv1d.html +0 -0
- causal_conv1d/impls/index.html +89 -0
- causal_conv1d/impls/torch_causal_conv1d.html +0 -0
- causal_conv1d/index.html +89 -0
- causal_conv1d/results/artifacts/combine/latency.svg +3 -0
- causal_conv1d/results/cells/combine.py +26 -0
- causal_conv1d/results/combined_results.html +0 -0
- causal_conv1d/results/index.html +88 -0
- flash_attn/impls/artifacts/benchmark/attention.jsonl +6 -6
- flash_attn/impls/cells/benchmark.py +8 -10
- flash_attn/impls/flash_attention.html +139 -139
- flash_attn/impls/hf_kernels_flash_attn.html +95 -142
- flash_attn/impls/hf_kernels_flash_attn3.html +80 -80
- flash_attn/impls/mem_efficient_attention.html +186 -134
- flash_attn/impls/sage_attention.html +13 -19
- flash_attn/impls/xformers.html +91 -91
- flash_attn/results/artifacts/combine/latency.svg +2 -2
- flash_attn/results/combined_results.html +147 -147
- index.html +2 -0
- layer_norm/impls/artifacts/benchmark/layer_norm.jsonl +4 -48
- layer_norm/impls/hf_kernels_layer_norm.html +0 -0
- layer_norm/impls/torch_layer_norm.html +0 -0
- layer_norm/results/artifacts/combine/latency.svg +2 -2
- layer_norm/results/combined_results.html +429 -106
- rotary/impls/artifacts/benchmark/rotary.jsonl +24 -0
- rotary/impls/cells/benchmark.py +57 -0
- rotary/impls/cells/nv.py +2 -0
- rotary/impls/hf_kernels_rotary.html +0 -0
- rotary/impls/index.html +89 -0
- rotary/impls/torch_rotary.html +0 -0
- rotary/index.html +89 -0
- rotary/results/artifacts/combine/latency.svg +3 -0
- rotary/results/cells/combine.py +26 -0
- rotary/results/combined_results.html +0 -0
- rotary/results/index.html +88 -0
activation/impls/artifacts/benchmark/activation.jsonl
CHANGED

@@ -1,9 +1,9 @@
-{"ts": "2025-10-
-{"ts": "2025-10-
-{"ts": "2025-10-
-{"ts": "2025-10-
-{"ts": "2025-10-
-{"ts": "2025-10-
-{"ts": "2025-10-
-{"ts": "2025-10-
-{"ts": "2025-10-
+{"ts": "2025-10-28T14:08:47Z", "run": "c851494cf96a4119be3e14911bb5592b", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D768", "num_tokens": 128, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02110099990204617, "p50": 0.022570000055566197, "p90": 0.02266100000269944, "mean": 0.022242599993660406, "iqr": 0.0007410000080199097, "raw_times": [0.022570000055566197, 0.022961000013310695, 0.02191999999467953, 0.02266100000269944, 0.02110099990204617], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.02889100005631917, "peak_bytes": 1966080, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
+{"ts": "2025-10-28T14:08:47Z", "run": "c851494cf96a4119be3e14911bb5592b", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D1024", "num_tokens": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02585100003216212, "p50": 0.02831100005096232, "p90": 0.02854100000604376, "mean": 0.02791500000967062, "iqr": 0.0013400000398178236, "raw_times": [0.02585100003216212, 0.02854100000604376, 0.02967099999295897, 0.02831100005096232, 0.027200999966225936], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.031750999937685265, "peak_bytes": 2621440, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
+{"ts": "2025-10-28T14:08:47Z", "run": "c851494cf96a4119be3e14911bb5592b", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D2048", "num_tokens": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02804099995046272, "p50": 0.028271000019230996, "p90": 0.02853099999811093, "mean": 0.032097199982672464, "iqr": 0.0004900000476482091, "raw_times": [0.04760199999509496, 0.028271000019230996, 0.02853099999811093, 0.02804099995046272, 0.02804099995046272], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.031132000003708526, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
+{"ts": "2025-10-28T14:08:47Z", "run": "c851494cf96a4119be3e14911bb5592b", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D768", "num_tokens": 256, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02513000004000787, "p50": 0.027131000024382956, "p90": 0.027909999971598154, "mean": 0.027204600019103964, "iqr": 0.0014589999182135216, "raw_times": [0.02513000004000787, 0.027131000024382956, 0.027909999971598154, 0.029401000006146205, 0.026451000053384632], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.030690999892613036, "peak_bytes": 3932160, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
+{"ts": "2025-10-28T14:08:47Z", "run": "c851494cf96a4119be3e14911bb5592b", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D1024", "num_tokens": 256, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02570000003743189, "p50": 0.026741000056063058, "p90": 0.02731099993980024, "mean": 0.02703079999264446, "iqr": 0.0012099999366910197, "raw_times": [0.02570000003743189, 0.02731099993980024, 0.029300999926817894, 0.02610100000310922, 0.026741000056063058], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.030331000061778468, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
+{"ts": "2025-10-28T14:08:47Z", "run": "c851494cf96a4119be3e14911bb5592b", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D2048", "num_tokens": 256, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.025050999965969822, "p50": 0.026220999984616356, "p90": 0.028031000056216726, "mean": 0.026778999995258346, "iqr": 0.0018400000953988638, "raw_times": [0.025050999965969822, 0.026190999960817862, 0.026220999984616356, 0.028031000056216726, 0.028401000008670962], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.031100999990485434, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
+{"ts": "2025-10-28T14:08:47Z", "run": "c851494cf96a4119be3e14911bb5592b", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D768", "num_tokens": 512, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02494000000297092, "p50": 0.026971000011144497, "p90": 0.02789099994515709, "mean": 0.027030599972022173, "iqr": 0.0009699999736767495, "raw_times": [0.02494000000297092, 0.026971000011144497, 0.02789099994515709, 0.02842999992935802, 0.02692099997148034], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.029161000043131935, "peak_bytes": 7864320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
+{"ts": "2025-10-28T14:08:47Z", "run": "c851494cf96a4119be3e14911bb5592b", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D1024", "num_tokens": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.024340999971173005, "p50": 0.02594099998987076, "p90": 0.027440999929240206, "mean": 0.026286999968760938, "iqr": 0.0016499999446750735, "raw_times": [0.024340999971173005, 0.027920999968955584, 0.027440999929240206, 0.02594099998987076, 0.025790999984565133], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.02797100000861974, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
+{"ts": "2025-10-28T14:08:47Z", "run": "c851494cf96a4119be3e14911bb5592b", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D2048", "num_tokens": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.025551000021550863, "p50": 0.026880999939749017, "p90": 0.028271000019230996, "mean": 0.027656800011754967, "iqr": 0.002240999947389355, "raw_times": [0.025551000021550863, 0.026880999939749017, 0.02603000007184164, 0.03155100000640232, 0.028271000019230996], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.02960100005111599, "peak_bytes": 20971520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
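Each record above stores the raw per-rep timings (raw_times) next to the derived statistics, so the summary numbers can be re-derived offline. A minimal sketch, not the harness itself: it recomputes p50 and IQR from raw_times (these two agree with the stored values; the harness's exact p10/p90 binning is not reproduced here):

    import json
    import statistics

    # Recompute latency stats from the raw_times field of each record.
    with open("activation/impls/artifacts/benchmark/activation.jsonl") as f:
        for line in f:
            rec = json.loads(line)
            times = sorted(rec["lat_ms"]["raw_times"])
            p50 = statistics.median(times)
            q1, _, q3 = statistics.quantiles(times, n=4, method="inclusive")
            print(f"{rec['impl']:>18} {rec['wl']['name']:>15}  "
                  f"p50={p50:.5f}ms  iqr={q3 - q1:.5f}ms  ok={rec['ok']}")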
activation/impls/hf_kernels_swiglu.html
CHANGED

@@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
</span> |
-Cell: nv | 0.
+Cell: nv | 0.21s
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>

@@ -3887,7 +3887,7 @@ Cell: nv | 0.26s
</div>
</div>
<div id="output-nv" class="cell-output">
-<div class="cell-stdout"><pre class="stdout-text">
+<div class="cell-stdout"><pre class="stdout-text">Tue Oct 28 14:07:54 2025
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
|-----------------------------------------+------------------------+----------------------+

@@ -3896,7 +3896,7 @@ Cell: nv | 0.26s
| | | MIG M. |
|=========================================+========================+======================|
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
-| N/A
+| N/A 27C P0 80W / 350W | 0MiB / 46068MiB | 1% Default |
| | | N/A |
+-----------------------------------------+------------------------+----------------------+

@@ -3920,7 +3920,7 @@ Cell: nv | 0.26s
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
</span> |
-Cell: benchmark | 4.
+Cell: benchmark | 4.26s
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>

@@ -3976,17 +3976,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D768
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us
-hf_kernels_swiglu
-_activation_beeaae6::silu_and_mul 1.
-void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.
-Activity Buffer Request
-aten::empty 2.
-cudaLaunchKernel 2.
-cudaDeviceSynchronize 0.
+hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 70.944us 1745.67% 70.944us 70.944us 1
+hf_kernels_swiglu 10.31% 179.916us 99.57% 1.738ms 1.738ms 0.000us 0.00% 5.472us 5.472us 1
+_activation_beeaae6::silu_and_mul 1.09% 18.951us 86.60% 1.512ms 503.911us 4.064us 100.00% 5.472us 1.824us 3
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.064us 100.00% 4.064us 1.355us 3
+Activity Buffer Request 83.12% 1.451ms 83.12% 1.451ms 1.451ms 1.408us 34.65% 1.408us 1.408us 1
+aten::empty 2.66% 46.432us 2.66% 46.432us 15.477us 0.000us 0.00% 0.000us 0.000us 3
+cudaLaunchKernel 2.39% 41.801us 2.39% 41.801us 13.934us 0.000us 0.00% 0.000us 0.000us 3
+cudaDeviceSynchronize 0.43% 7.500us 0.43% 7.500us 7.500us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.
-Self CUDA time total: 4.
+Self CPU time total: 1.746ms
+Self CUDA time total: 4.064us

@@ -3996,17 +3996,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D1024
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us
-hf_kernels_swiglu
-_activation_beeaae6::silu_and_mul 1.
-void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
-Activity Buffer Request
-aten::empty 1.
-cudaLaunchKernel 1.
-cudaDeviceSynchronize 0.30% 4.
+hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 68.703us 1759.36% 68.703us 68.703us 1
+hf_kernels_swiglu 6.60% 109.215us 99.70% 1.650ms 1.650ms 0.000us 0.00% 5.217us 5.217us 1
+_activation_beeaae6::silu_and_mul 1.44% 23.760us 91.91% 1.521ms 506.927us 3.905us 100.00% 5.217us 1.739us 3
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 3.905us 100.00% 3.905us 1.302us 3
+Activity Buffer Request 88.83% 1.470ms 88.83% 1.470ms 1.470ms 1.312us 33.60% 1.312us 1.312us 1
+aten::empty 1.19% 19.640us 1.19% 19.640us 6.547us 0.000us 0.00% 0.000us 0.000us 3
+cudaLaunchKernel 1.65% 27.251us 1.65% 27.251us 9.084us 0.000us 0.00% 0.000us 0.000us 3
+cudaDeviceSynchronize 0.30% 4.941us 0.30% 4.941us 4.941us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.
-Self CUDA time total: 3.
+Self CPU time total: 1.655ms
+Self CUDA time total: 3.905us

@@ -4016,17 +4016,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D2048
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 67.
-hf_kernels_swiglu 6.
-_activation_beeaae6::silu_and_mul 1.
-void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.
-Activity Buffer Request
-aten::empty 1.
-cudaLaunchKernel 1.
-cudaDeviceSynchronize 0.
+hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 67.999us 1388.58% 67.999us 67.999us 1
+hf_kernels_swiglu 6.71% 113.524us 99.73% 1.687ms 1.687ms 0.000us 0.00% 6.529us 6.529us 1
+_activation_beeaae6::silu_and_mul 1.26% 21.380us 91.91% 1.555ms 518.231us 4.897us 100.00% 6.529us 2.176us 3
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.897us 100.00% 4.897us 1.632us 3
+Activity Buffer Request 89.08% 1.507ms 89.08% 1.507ms 1.507ms 1.632us 33.33% 1.632us 1.632us 1
+aten::empty 1.11% 18.802us 1.11% 18.802us 6.267us 0.000us 0.00% 0.000us 0.000us 3
+cudaLaunchKernel 1.56% 26.371us 1.56% 26.371us 8.790us 0.000us 0.00% 0.000us 0.000us 3
+cudaDeviceSynchronize 0.27% 4.571us 0.27% 4.571us 4.571us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.
-Self CUDA time total: 4.
+Self CPU time total: 1.692ms
+Self CUDA time total: 4.897us

@@ -4036,17 +4036,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D768
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us
-hf_kernels_swiglu
-_activation_beeaae6::silu_and_mul 1.
-void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.
-Activity Buffer Request 79.
-aten::empty 1.
-cudaLaunchKernel
-cudaDeviceSynchronize 0.
+hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 63.776us 1498.50% 63.776us 63.776us 1
+hf_kernels_swiglu 5.54% 99.283us 99.75% 1.788ms 1.788ms 0.000us 0.00% 5.696us 5.696us 1
+_activation_beeaae6::silu_and_mul 1.20% 21.550us 93.21% 1.671ms 556.862us 4.256us 100.00% 5.696us 1.899us 3
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.256us 100.00% 4.256us 1.419us 3
+Activity Buffer Request 79.15% 1.419ms 79.15% 1.419ms 1.419ms 1.440us 33.83% 1.440us 1.440us 1
+aten::empty 1.00% 17.972us 1.00% 17.972us 5.991us 0.000us 0.00% 0.000us 0.000us 3
+cudaLaunchKernel 12.85% 230.398us 12.85% 230.398us 76.799us 0.000us 0.00% 0.000us 0.000us 3
+cudaDeviceSynchronize 0.25% 4.510us 0.25% 4.510us 4.510us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.
-Self CUDA time total: 4.
+Self CPU time total: 1.792ms
+Self CUDA time total: 4.256us

@@ -4056,16 +4056,16 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D1024
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us
-hf_kernels_swiglu
-_activation_beeaae6::silu_and_mul 5.
+hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 62.431us 1060.31% 62.431us 62.431us 1
+hf_kernels_swiglu 20.17% 83.914us 98.89% 411.305us 411.305us 0.000us 0.00% 7.872us 7.872us 1
+_activation_beeaae6::silu_and_mul 5.09% 21.171us 74.40% 309.470us 103.157us 5.888us 100.00% 7.872us 2.624us 3
void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 5.888us 100.00% 5.888us 1.963us 3
-Activity Buffer Request
-aten::empty 4.
-cudaLaunchKernel 36.
-cudaDeviceSynchronize 1.
+Activity Buffer Request 32.60% 135.614us 32.60% 135.614us 135.614us 1.984us 33.70% 1.984us 1.984us 1
+aten::empty 4.31% 17.921us 4.31% 17.921us 5.974us 0.000us 0.00% 0.000us 0.000us 3
+cudaLaunchKernel 36.71% 152.685us 36.71% 152.685us 50.895us 0.000us 0.00% 0.000us 0.000us 3
+cudaDeviceSynchronize 1.11% 4.631us 1.11% 4.631us 4.631us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total:
+Self CPU time total: 415.936us
Self CUDA time total: 5.888us

@@ -4076,17 +4076,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D2048
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us
-hf_kernels_swiglu
-_activation_beeaae6::silu_and_mul 1.
-void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 7.
-Activity Buffer Request
-aten::empty 1.
-cudaLaunchKernel 8.
-cudaDeviceSynchronize 0.
+hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 67.615us 880.40% 67.615us 67.615us 1
+hf_kernels_swiglu 5.97% 103.444us 99.74% 1.727ms 1.727ms 0.000us 0.00% 10.240us 10.240us 1
+_activation_beeaae6::silu_and_mul 1.23% 21.310us 92.70% 1.605ms 535.135us 7.680us 100.00% 10.240us 3.413us 3
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 7.680us 100.00% 7.680us 2.560us 3
+Activity Buffer Request 82.79% 1.434ms 82.79% 1.434ms 1.434ms 2.560us 33.33% 2.560us 2.560us 1
+aten::empty 1.07% 18.611us 1.07% 18.611us 6.204us 0.000us 0.00% 0.000us 0.000us 3
+cudaLaunchKernel 8.68% 150.305us 8.68% 150.305us 50.102us 0.000us 0.00% 0.000us 0.000us 3
+cudaDeviceSynchronize 0.26% 4.450us 0.26% 4.450us 4.450us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.
-Self CUDA time total: 7.
+Self CPU time total: 1.732ms
+Self CUDA time total: 7.680us

@@ -4096,16 +4096,16 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D768
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us
-hf_kernels_swiglu
-_activation_beeaae6::silu_and_mul 1.
+hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 63.423us 962.12% 63.423us 63.423us 1
+hf_kernels_swiglu 5.71% 97.705us 99.74% 1.706ms 1.706ms 0.000us 0.00% 8.800us 8.800us 1
+_activation_beeaae6::silu_and_mul 1.25% 21.440us 92.96% 1.590ms 530.071us 6.592us 100.00% 8.800us 2.933us 3
void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 6.592us 100.00% 6.592us 2.197us 3
-Activity Buffer Request 82.
-aten::empty 1.
-cudaLaunchKernel 8.
-cudaDeviceSynchronize 0.
+Activity Buffer Request 82.94% 1.419ms 82.94% 1.419ms 1.419ms 2.208us 33.50% 2.208us 2.208us 1
+aten::empty 1.07% 18.230us 1.07% 18.230us 6.077us 0.000us 0.00% 0.000us 0.000us 3
+cudaLaunchKernel 8.77% 149.945us 8.77% 149.945us 49.982us 0.000us 0.00% 0.000us 0.000us 3
+cudaDeviceSynchronize 0.26% 4.450us 0.26% 4.450us 4.450us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.
+Self CPU time total: 1.711ms
Self CUDA time total: 6.592us

@@ -4116,17 +4116,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D1024
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us
-hf_kernels_swiglu 22.
-_activation_beeaae6::silu_and_mul 5.
-void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 9.
-Activity Buffer Request 26.
-aten::empty 5.
-cudaLaunchKernel 39.
-cudaDeviceSynchronize 1.
+hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 61.982us 658.89% 61.982us 61.982us 1
+hf_kernels_swiglu 22.04% 82.603us 98.77% 370.213us 370.213us 0.000us 0.00% 12.543us 12.543us 1
+_activation_beeaae6::silu_and_mul 5.90% 22.112us 71.72% 268.830us 89.610us 9.407us 100.00% 12.543us 4.181us 3
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 9.407us 100.00% 9.407us 3.136us 3
+Activity Buffer Request 26.16% 98.063us 26.16% 98.063us 98.063us 3.136us 33.34% 3.136us 3.136us 1
+aten::empty 5.01% 18.780us 5.01% 18.780us 6.260us 0.000us 0.00% 0.000us 0.000us 3
+cudaLaunchKernel 39.66% 148.655us 39.66% 148.655us 49.552us 0.000us 0.00% 0.000us 0.000us 3
+cudaDeviceSynchronize 1.23% 4.600us 1.23% 4.600us 4.600us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total:
-Self CUDA time total: 9.
+Self CPU time total: 374.813us
+Self CUDA time total: 9.407us

@@ -4136,23 +4136,23 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D2048
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us
-hf_kernels_swiglu 24.
-_activation_beeaae6::silu_and_mul 5.
-void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us
-Activity Buffer Request 28.
-aten::empty 4.
-cudaLaunchKernel
-cudaDeviceSynchronize 1.
+hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 63.776us 490.85% 63.776us 63.776us 1
+hf_kernels_swiglu 24.11% 99.284us 98.97% 407.515us 407.515us 0.000us 0.00% 17.346us 17.346us 1
+_activation_beeaae6::silu_and_mul 5.19% 21.351us 70.31% 289.510us 96.503us 12.993us 100.00% 17.346us 5.782us 3
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 12.993us 100.00% 12.993us 4.331us 3
+Activity Buffer Request 28.96% 119.264us 28.96% 119.264us 119.264us 4.353us 33.50% 4.353us 4.353us 1
+aten::empty 4.55% 18.721us 4.55% 18.721us 6.240us 0.000us 0.00% 0.000us 0.000us 3
+cudaLaunchKernel 36.16% 148.895us 36.16% 148.895us 49.632us 0.000us 0.00% 0.000us 0.000us 3
+cudaDeviceSynchronize 1.03% 4.240us 1.03% 4.240us 4.240us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total:
-Self CUDA time total:
+Self CPU time total: 411.755us
+Self CUDA time total: 12.993us

impl wl p50(ms) ok
hf_kernels_swiglu cuda_T128_D1024 0.03 True
hf_kernels_swiglu cuda_T128_D2048 0.03 True
-hf_kernels_swiglu cuda_T128_D768 0.
+hf_kernels_swiglu cuda_T128_D768 0.02 True
hf_kernels_swiglu cuda_T256_D1024 0.03 True
hf_kernels_swiglu cuda_T256_D2048 0.03 True
hf_kernels_swiglu cuda_T256_D768 0.03 True

@@ -4163,12 +4163,13 @@ hf_kernels_swiglu cuda_T512_D768 0.03 True
<div class="uv-install-logs" id="uv-logs-benchmark">
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
<div class="uv-logs-content" style="display: none;">
-Installed 15 packages in
+Installed 15 packages in 14ms
</div>
</div>
<div class="cell-stderr">Fetching 7 files: 0%| | 0/7 [00:00<?, ?it/s]
-Fetching 7 files:
-Fetching 7 files:
+Fetching 7 files: 14%|█▍ | 1/7 [00:00<00:00, 7.79it/s]
+Fetching 7 files: 71%|███████▏ | 5/7 [00:00<00:00, 11.48it/s]
+Fetching 7 files: 100%|██████████| 7/7 [00:00<00:00, 15.62it/s]</div>
<div class="cell-artifacts">
<h4>Artifacts:</h4>
<a href="artifacts/benchmark/activation.jsonl" class="artifact" target="_blank">activation.jsonl</a>
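In every trace above, the hf_kernels_swiglu path dispatches a single fused CUDA kernel (the vLLM-derived act_and_mul_kernel) plus one output allocation. A minimal sketch of that call pattern, assuming the kernel is fetched from the Hub with the `kernels` package and exposes silu_and_mul(out, x); the repo id and exact signature are assumptions, not taken from this commit:

    import torch
    from kernels import get_kernel  # HF "kernels" package; repo id below is an assumption

    activation = get_kernel("kernels-community/activation")

    def hf_kernels_swiglu(x: torch.Tensor) -> torch.Tensor:
        # x packs both projections along the last dim: (..., 2*d).
        # The fused kernel computes SiLU(x[..., :d]) * x[..., d:] in one launch,
        # matching the single silu_and_mul row in the traces above.
        d = x.shape[-1] // 2
        out = torch.empty(*x.shape[:-1], d, dtype=x.dtype, device=x.device)
        activation.silu_and_mul(out, x)
        return out

    x = torch.randn(128, 2 * 768, dtype=torch.bfloat16, device="cuda")  # cuda_T128_D768
    y = hf_kernels_swiglu(x)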
activation/impls/torch_swiglu.html
CHANGED
|
@@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3871 |
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3872 |
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3873 |
</span> |
|
| 3874 |
-
Cell: nv | 0.
|
| 3875 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3876 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3877 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3887,7 +3887,7 @@ Cell: nv | 0.26s
|
|
| 3887 |
</div>
|
| 3888 |
</div>
|
| 3889 |
<div id="output-nv" class="cell-output">
|
| 3890 |
-
<div class="cell-stdout"><pre class="stdout-text">
|
| 3891 |
+-----------------------------------------------------------------------------------------+
|
| 3892 |
| NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
|
| 3893 |
|-----------------------------------------+------------------------+----------------------+
|
|
@@ -3896,7 +3896,7 @@ Cell: nv | 0.26s
|
|
| 3896 |
| | | MIG M. |
|
| 3897 |
|=========================================+========================+======================|
|
| 3898 |
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
|
| 3899 |
-
| N/A
|
| 3900 |
| | | N/A |
|
| 3901 |
+-----------------------------------------+------------------------+----------------------+
|
| 3902 |
|
|
@@ -3920,7 +3920,7 @@ Cell: nv | 0.26s
|
|
| 3920 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3921 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3922 |
</span> |
|
| 3923 |
-
Cell: benchmark | 6.
|
| 3924 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3925 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3926 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3970,20 +3970,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D768
|
|
| 3970 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3971 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3972 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3973 |
-
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 3974 |
-
torch_eager 11.
|
| 3975 |
-
aten::silu 3.
|
| 3976 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 3977 |
-
aten::mul
|
| 3978 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 3979 |
-
Activity Buffer Request 76.
|
| 3980 |
-
aten::slice 2.
|
| 3981 |
-
aten::as_strided 0.
|
| 3982 |
-
cudaLaunchKernel 3.
|
| 3983 |
-
cudaDeviceSynchronize 0.
|
| 3984 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3985 |
-
Self CPU time total: 1.
|
| 3986 |
-
Self CUDA time total: 12.
|
| 3987 |
|
| 3988 |
|
| 3989 |
|
|
@@ -3993,20 +3993,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D1024
|
|
| 3993 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3994 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3995 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3996 |
-
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 3997 |
-
torch_eager
|
| 3998 |
-
aten::silu 2.
|
| 3999 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 4000 |
-
aten::mul 1.
|
| 4001 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.
|
| 4002 |
-
Activity Buffer Request 84.
|
| 4003 |
-
aten::slice 1.
|
| 4004 |
-
aten::as_strided 0.
|
| 4005 |
-
cudaLaunchKernel 2.
|
| 4006 |
-
cudaDeviceSynchronize 0.
|
| 4007 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4008 |
-
Self CPU time total: 1.
|
| 4009 |
-
Self CUDA time total: 12.
|
| 4010 |
|
| 4011 |
|
| 4012 |
|
|
@@ -4016,20 +4016,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D2048
|
|
| 4016 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4017 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4018 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4019 |
-
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4020 |
-
torch_eager 6.
|
| 4021 |
-
aten::silu 2.
|
| 4022 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 4023 |
-
aten::mul 1.
|
| 4024 |
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.432us 48.55% 6.432us 2.144us 3
|
| 4025 |
-
Activity Buffer Request
|
| 4026 |
-
aten::slice 1.
|
| 4027 |
-
aten::as_strided 0.
|
| 4028 |
-
cudaLaunchKernel 2.
|
| 4029 |
-
cudaDeviceSynchronize 0.
|
| 4030 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4031 |
-
Self CPU time total: 1.
|
| 4032 |
-
Self CUDA time total: 13.
|
| 4033 |
|
| 4034 |
|
| 4035 |
|
|
@@ -4039,20 +4039,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D768
|
|
| 4039 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4040 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4041 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4042 |
-
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4043 |
-
torch_eager
|
| 4044 |
-
aten::silu 2.
|
| 4045 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 4046 |
-
aten::mul 1.
|
| 4047 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.176us 48.
|
| 4048 |
-
Activity Buffer Request
|
| 4049 |
-
aten::slice 1.
|
| 4050 |
-
aten::as_strided 0.
|
| 4051 |
-
cudaLaunchKernel
|
| 4052 |
-
cudaDeviceSynchronize 0.
|
| 4053 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4054 |
-
Self CPU time total: 1.
|
| 4055 |
-
Self CUDA time total: 12.
|
| 4056 |
|
| 4057 |
|
| 4058 |
|
|
@@ -4062,20 +4062,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D1024
|
|
| 4062 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4063 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4064 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4065 |
-
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4066 |
-
torch_eager 5.
|
| 4067 |
-
aten::silu 2.
|
| 4068 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 4069 |
-
aten::mul 1.
|
| 4070 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 4071 |
-
Activity Buffer Request 78.
|
| 4072 |
-
aten::slice 1.
|
| 4073 |
-
aten::as_strided 0.
|
| 4074 |
-
cudaLaunchKernel 10.
|
| 4075 |
-
cudaDeviceSynchronize 0.
|
| 4076 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4077 |
-
Self CPU time total: 1.
|
| 4078 |
-
Self CUDA time total: 13.
|
| 4079 |
|
| 4080 |
|
| 4081 |
|
|
@@ -4085,20 +4085,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D2048
|
|
| 4085 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4086 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4087 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4088 |
-
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4089 |
-
torch_eager 21.
|
| 4090 |
-
aten::silu 8.
|
| 4091 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.
|
| 4092 |
-
aten::mul
|
| 4093 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.
|
| 4094 |
-
Activity Buffer Request
|
| 4095 |
-
aten::slice 4.
|
| 4096 |
-
aten::as_strided 1.
|
| 4097 |
-
cudaLaunchKernel 35.
|
| 4098 |
-
cudaDeviceSynchronize
|
| 4099 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4100 |
-
Self CPU time total:
|
| 4101 |
-
Self CUDA time total: 15.
|
| 4102 |
|
| 4103 |
|
| 4104 |
|
|
@@ -4108,20 +4108,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D768
- (old profiler rows for this workload; values truncated by the diff viewer — the updated rows appear in the + hunk further below)
@@ -4131,20 +4131,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D1024
- (old profiler rows for this workload; values truncated by the diff viewer — the updated rows appear in the + hunk further below)
@@ -4154,20 +4154,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D2048
- (old profiler rows for this workload; values truncated by the diff viewer — the updated rows appear in the + hunk further below)

impl wl p50(ms) ok
@@ -4184,7 +4184,7 @@ torch_eager cuda_T512_D768 0.05 True
|
|
| 4184 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 4185 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4186 |
<div class="uv-logs-content" style="display: none;">
|
| 4187 |
-
Installed 37 packages in … (old duration truncated by the diff viewer)
|
| 4188 |
</div>
|
| 4189 |
</div>
|
| 4190 |
<div class="cell-artifacts">
|
|
|
|
| 3871 |
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3872 |
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3873 |
</span> |
|
| 3874 |
+
Cell: nv | 0.21s
|
| 3875 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3876 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3877 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3887 |
</div>
|
| 3888 |
</div>
|
| 3889 |
<div id="output-nv" class="cell-output">
|
| 3890 |
+
<div class="cell-stdout"><pre class="stdout-text">Tue Oct 28 14:07:54 2025
|
| 3891 |
+-----------------------------------------------------------------------------------------+
|
| 3892 |
| NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
|
| 3893 |
|-----------------------------------------+------------------------+----------------------+
|
|
|
|
| 3896 |
| | | MIG M. |
|
| 3897 |
|=========================================+========================+======================|
|
| 3898 |
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
|
| 3899 |
+
| N/A 27C P0 80W / 350W | 0MiB / 46068MiB | 1% Default |
|
| 3900 |
| | | N/A |
|
| 3901 |
+-----------------------------------------+------------------------+----------------------+
|
| 3902 |
|
|
|
|
| 3920 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3921 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3922 |
</span> |
|
| 3923 |
+
Cell: benchmark | 6.88s
|
| 3924 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3925 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3926 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
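The torch_eager rows in the PROFILE TRACE tables below (aten::silu, aten::mul, aten::slice, and the paired elementwise CUDA kernels) are what an eager-mode SwiGLU produces under torch.profiler. A minimal sketch of how such a table is generated — the authoritative source is cells/benchmark.py (not shown here), so treat the function body, shapes, and repetition count as assumptions:

```python
import torch
from torch.profiler import ProfilerActivity, profile, record_function

def swiglu_eager(x: torch.Tensor) -> torch.Tensor:
    # Two views (aten::slice / aten::as_strided in the trace), then
    # silu and mul, each launching one elementwise CUDA kernel.
    d = x.shape[-1] // 2
    return torch.nn.functional.silu(x[..., :d]) * x[..., d:]

x = torch.randn(128, 2 * 768, device="cuda", dtype=torch.bfloat16)
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
    with record_function("torch_eager"):  # appears as the named top row
        for _ in range(3):                # matches "# of Calls" = 3
            swiglu_eager(x)
    torch.cuda.synchronize()              # the cudaDeviceSynchronize row
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=12))
```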
| 3970 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3971 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3972 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3973 |
+
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 206.526us 1621.34% 206.526us 206.526us 1
|
| 3974 |
+
torch_eager 11.16% 213.167us 99.55% 1.902ms 1.902ms 0.000us 0.00% 15.042us 15.042us 1
|
| 3975 |
+
aten::silu 3.29% 62.892us 81.79% 1.563ms 520.961us 6.529us 51.26% 8.833us 2.944us 3
|
| 3976 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.529us 51.26% 6.529us 2.176us 3
|
| 3977 |
+
aten::mul 2.06% 39.382us 3.23% 61.724us 20.575us 6.209us 48.74% 6.209us 2.070us 3
|
| 3978 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.209us 48.74% 6.209us 2.070us 3
|
| 3979 |
+
Activity Buffer Request 76.05% 1.453ms 76.05% 1.453ms 1.453ms 2.304us 18.09% 2.304us 2.304us 1
|
| 3980 |
+
aten::slice 2.72% 51.931us 3.38% 64.581us 10.764us 0.000us 0.00% 0.000us 0.000us 6
|
| 3981 |
+
aten::as_strided 0.66% 12.650us 0.66% 12.650us 2.108us 0.000us 0.00% 0.000us 0.000us 6
|
| 3982 |
+
cudaLaunchKernel 3.62% 69.144us 3.62% 69.144us 11.524us 0.000us 0.00% 0.000us 0.000us 6
|
| 3983 |
+
cudaDeviceSynchronize 0.45% 8.521us 0.45% 8.521us 8.521us 0.000us 0.00% 0.000us 0.000us 1
|
| 3984 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3985 |
+
Self CPU time total: 1.911ms
|
| 3986 |
+
Self CUDA time total: 12.738us
|
| 3987 |
|
| 3988 |
|
| 3989 |
|
|
|
|
| 3993 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3994 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3995 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3996 |
+
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 151.965us 1227.21% 151.965us 151.965us 1
|
| 3997 |
+
torch_eager 7.02% 119.974us 99.63% 1.704ms 1.704ms 0.000us 0.00% 14.558us 14.558us 1
|
| 3998 |
+
aten::silu 2.35% 40.140us 88.12% 1.507ms 502.320us 6.399us 51.68% 8.574us 2.858us 3
|
| 3999 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.399us 51.68% 6.399us 2.133us 3
|
| 4000 |
+
aten::mul 1.61% 27.481us 2.72% 46.541us 15.514us 5.984us 48.32% 5.984us 1.995us 3
|
| 4001 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.984us 48.32% 5.984us 1.995us 3
|
| 4002 |
+
Activity Buffer Request 84.14% 1.439ms 84.14% 1.439ms 1.439ms 2.175us 17.56% 2.175us 2.175us 1
|
| 4003 |
+
aten::slice 1.43% 24.471us 1.78% 30.412us 5.069us 0.000us 0.00% 0.000us 0.000us 6
|
| 4004 |
+
aten::as_strided 0.35% 5.941us 0.35% 5.941us 0.990us 0.000us 0.00% 0.000us 0.000us 6
|
| 4005 |
+
cudaLaunchKernel 2.74% 46.851us 2.74% 46.851us 7.809us 0.000us 0.00% 0.000us 0.000us 6
|
| 4006 |
+
cudaDeviceSynchronize 0.37% 6.320us 0.37% 6.320us 6.320us 0.000us 0.00% 0.000us 0.000us 1
|
| 4007 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4008 |
+
Self CPU time total: 1.710ms
|
| 4009 |
+
Self CUDA time total: 12.383us
|
| 4010 |
|
| 4011 |
|
| 4012 |
|
|
|
|
| 4016 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4017 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4018 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4019 |
+
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 151.008us 1139.77% 151.008us 151.008us 1
|
| 4020 |
+
torch_eager 6.34% 107.173us 99.70% 1.687ms 1.687ms 0.000us 0.00% 15.522us 15.522us 1
|
| 4021 |
+
aten::silu 2.38% 40.332us 88.83% 1.503ms 500.911us 6.817us 51.45% 9.090us 3.030us 3
|
| 4022 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.817us 51.45% 6.817us 2.272us 3
|
| 4023 |
+
aten::mul 1.57% 26.503us 2.73% 46.253us 15.418us 6.432us 48.55% 6.432us 2.144us 3
|
| 4024 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.432us 48.55% 6.432us 2.144us 3
|
| 4025 |
+
Activity Buffer Request 84.91% 1.436ms 84.91% 1.436ms 1.436ms 2.273us 17.16% 2.273us 2.273us 1
|
| 4026 |
+
aten::slice 1.43% 24.250us 1.81% 30.550us 5.092us 0.000us 0.00% 0.000us 0.000us 6
|
| 4027 |
+
aten::as_strided 0.37% 6.300us 0.37% 6.300us 1.050us 0.000us 0.00% 0.000us 0.000us 6
|
| 4028 |
+
cudaLaunchKernel 2.70% 45.731us 2.70% 45.731us 7.622us 0.000us 0.00% 0.000us 0.000us 6
|
| 4029 |
+
cudaDeviceSynchronize 0.30% 5.000us 0.30% 5.000us 5.000us 0.000us 0.00% 0.000us 0.000us 1
|
| 4030 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4031 |
+
Self CPU time total: 1.692ms
|
| 4032 |
+
Self CUDA time total: 13.249us
|
| 4033 |
|
| 4034 |
|
| 4035 |
|
|
|
|
| 4039 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4040 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4041 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4042 |
+
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 153.149us 1202.68% 153.149us 153.149us 1
|
| 4043 |
+
torch_eager 6.34% 109.104us 99.71% 1.717ms 1.717ms 0.000us 0.00% 14.941us 14.941us 1
|
| 4044 |
+
aten::silu 2.38% 40.982us 88.93% 1.531ms 510.411us 6.558us 51.50% 8.765us 2.922us 3
|
| 4045 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.558us 51.50% 6.558us 2.186us 3
|
| 4046 |
+
aten::mul 1.52% 26.241us 2.68% 46.222us 15.407us 6.176us 48.50% 6.176us 2.059us 3
|
| 4047 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.176us 48.50% 6.176us 2.059us 3
|
| 4048 |
+
Activity Buffer Request 73.41% 1.264ms 73.41% 1.264ms 1.264ms 2.207us 17.33% 2.207us 2.207us 1
|
| 4049 |
+
aten::slice 1.43% 24.560us 1.77% 30.400us 5.067us 0.000us 0.00% 0.000us 0.000us 6
|
| 4050 |
+
aten::as_strided 0.34% 5.840us 0.34% 5.840us 0.973us 0.000us 0.00% 0.000us 0.000us 6
|
| 4051 |
+
cudaLaunchKernel 14.29% 246.139us 14.29% 246.139us 41.023us 0.000us 0.00% 0.000us 0.000us 6
|
| 4052 |
+
cudaDeviceSynchronize 0.29% 4.920us 0.29% 4.920us 4.920us 0.000us 0.00% 0.000us 0.000us 1
|
| 4053 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4054 |
+
Self CPU time total: 1.722ms
|
| 4055 |
+
Self CUDA time total: 12.734us
|
| 4056 |
|
| 4057 |
|
| 4058 |
|
|
|
|
| 4062 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4063 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4064 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4065 |
+
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 149.310us 1126.87% 149.310us 149.310us 1
|
| 4066 |
+
torch_eager 5.88% 107.113us 99.73% 1.817ms 1.817ms 0.000us 0.00% 15.555us 15.555us 1
|
| 4067 |
+
aten::silu 2.34% 42.602us 89.83% 1.636ms 545.432us 6.785us 51.21% 9.090us 3.030us 3
|
| 4068 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.785us 51.21% 6.785us 2.262us 3
|
| 4069 |
+
aten::mul 1.33% 24.312us 2.33% 42.512us 14.171us 6.465us 48.79% 6.465us 2.155us 3
|
| 4070 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.465us 48.79% 6.465us 2.155us 3
|
| 4071 |
+
Activity Buffer Request 78.20% 1.424ms 78.20% 1.424ms 1.424ms 2.305us 17.40% 2.305us 2.305us 1
|
| 4072 |
+
aten::slice 1.35% 24.650us 1.68% 30.660us 5.110us 0.000us 0.00% 0.000us 0.000us 6
|
| 4073 |
+
aten::as_strided 0.33% 6.010us 0.33% 6.010us 1.002us 0.000us 0.00% 0.000us 0.000us 6
|
| 4074 |
+
cudaLaunchKernel 10.29% 187.406us 10.29% 187.406us 31.234us 0.000us 0.00% 0.000us 0.000us 6
|
| 4075 |
+
cudaDeviceSynchronize 0.27% 4.950us 0.27% 4.950us 4.950us 0.000us 0.00% 0.000us 0.000us 1
|
| 4076 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4077 |
+
Self CPU time total: 1.822ms
|
| 4078 |
+
Self CUDA time total: 13.250us
|
| 4079 |
|
| 4080 |
|
| 4081 |
|
|
|
|
| 4085 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4086 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4087 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4088 |
+
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 143.804us 924.73% 143.804us 143.804us 1
|
| 4089 |
+
torch_eager 21.50% 103.524us 99.01% 476.736us 476.736us 0.000us 0.00% 18.271us 18.271us 1
|
| 4090 |
+
aten::silu 8.70% 41.893us 62.70% 301.891us 100.630us 7.999us 51.44% 10.719us 3.573us 3
|
| 4091 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.999us 51.44% 7.999us 2.666us 3
|
| 4092 |
+
aten::mul 5.07% 24.390us 8.83% 42.521us 14.174us 7.552us 48.56% 7.552us 2.517us 3
|
| 4093 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.552us 48.56% 7.552us 2.517us 3
|
| 4094 |
+
Activity Buffer Request 22.22% 106.973us 22.22% 106.973us 106.973us 2.720us 17.49% 2.720us 2.720us 1
|
| 4095 |
+
aten::slice 4.80% 23.090us 5.98% 28.800us 4.800us 0.000us 0.00% 0.000us 0.000us 6
|
| 4096 |
+
aten::as_strided 1.19% 5.710us 1.19% 5.710us 0.952us 0.000us 0.00% 0.000us 0.000us 6
|
| 4097 |
+
cudaLaunchKernel 35.55% 171.156us 35.55% 171.156us 28.526us 0.000us 0.00% 0.000us 0.000us 6
|
| 4098 |
+
cudaDeviceSynchronize 0.99% 4.760us 0.99% 4.760us 4.760us 0.000us 0.00% 0.000us 0.000us 1
|
| 4099 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4100 |
+
Self CPU time total: 481.496us
|
| 4101 |
+
Self CUDA time total: 15.551us
|
| 4102 |
|
| 4103 |
|
| 4104 |
|
|
|
|
| 4108 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4109 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4110 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4111 |
+
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 153.372us 1067.46% 153.372us 153.372us 1
|
| 4112 |
+
torch_eager 5.96% 108.164us 99.73% 1.810ms 1.810ms 0.000us 0.00% 16.832us 16.832us 1
|
| 4113 |
+
aten::silu 2.30% 41.731us 89.59% 1.626ms 541.925us 7.360us 51.22% 9.824us 3.275us 3
|
| 4114 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.360us 51.22% 7.360us 2.453us 3
|
| 4115 |
+
aten::mul 1.41% 25.542us 2.47% 44.792us 14.931us 7.008us 48.78% 7.008us 2.336us 3
|
| 4116 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.008us 48.78% 7.008us 2.336us 3
|
| 4117 |
+
Activity Buffer Request 78.82% 1.430ms 78.82% 1.430ms 1.430ms 2.464us 17.15% 2.464us 2.464us 1
|
| 4118 |
+
aten::slice 1.37% 24.840us 1.70% 30.900us 5.150us 0.000us 0.00% 0.000us 0.000us 6
|
| 4119 |
+
aten::as_strided 0.33% 6.060us 0.33% 6.060us 1.010us 0.000us 0.00% 0.000us 0.000us 6
|
| 4120 |
+
cudaLaunchKernel 9.53% 172.976us 9.53% 172.976us 28.829us 0.000us 0.00% 0.000us 0.000us 6
|
| 4121 |
+
cudaDeviceSynchronize 0.27% 4.960us 0.27% 4.960us 4.960us 0.000us 0.00% 0.000us 0.000us 1
|
| 4122 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4123 |
+
Self CPU time total: 1.815ms
|
| 4124 |
+
Self CUDA time total: 14.368us
|
| 4125 |
|
| 4126 |
|
| 4127 |
|
|
|
|
| 4131 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4132 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4133 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4134 |
+
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 146.240us 942.27% 146.240us 146.240us 1
|
| 4135 |
+
torch_eager 22.59% 104.486us 98.96% 457.726us 457.726us 0.000us 0.00% 18.208us 18.208us 1
|
| 4136 |
+
aten::silu 8.78% 40.590us 60.43% 279.519us 93.173us 7.936us 51.13% 10.624us 3.541us 3
|
| 4137 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.936us 51.13% 7.936us 2.645us 3
|
| 4138 |
+
aten::mul 5.53% 25.579us 9.45% 43.730us 14.577us 7.584us 48.87% 7.584us 2.528us 3
|
| 4139 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.584us 48.87% 7.584us 2.528us 3
|
| 4140 |
+
Activity Buffer Request 18.85% 87.193us 18.85% 87.193us 87.193us 2.688us 17.32% 2.688us 2.688us 1
|
| 4141 |
+
aten::slice 5.23% 24.201us 6.48% 29.991us 4.999us 0.000us 0.00% 0.000us 0.000us 6
|
| 4142 |
+
aten::as_strided 1.25% 5.790us 1.25% 5.790us 0.965us 0.000us 0.00% 0.000us 0.000us 6
|
| 4143 |
+
cudaLaunchKernel 36.73% 169.887us 36.73% 169.887us 28.314us 0.000us 0.00% 0.000us 0.000us 6
|
| 4144 |
+
cudaDeviceSynchronize 1.04% 4.800us 1.04% 4.800us 4.800us 0.000us 0.00% 0.000us 0.000us 1
|
| 4145 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4146 |
+
Self CPU time total: 462.526us
|
| 4147 |
+
Self CUDA time total: 15.520us
|
| 4148 |
|
| 4149 |
|
| 4150 |
|
|
|
|
| 4154 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4155 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4156 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4157 |
+
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 181.470us 803.28% 181.470us 181.470us 1
|
| 4158 |
+
torch_eager 5.97% 109.125us 99.74% 1.823ms 1.823ms 0.000us 0.00% 26.526us 26.526us 1
|
| 4159 |
+
aten::silu 2.38% 43.492us 88.50% 1.617ms 539.072us 11.647us 51.56% 15.582us 5.194us 3
|
| 4160 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 11.647us 51.56% 11.647us 3.882us 3
|
| 4161 |
+
aten::mul 1.42% 25.882us 3.51% 64.123us 21.374us 10.944us 48.44% 10.944us 3.648us 3
|
| 4162 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.944us 48.44% 10.944us 3.648us 3
|
| 4163 |
+
Activity Buffer Request 77.67% 1.419ms 77.67% 1.419ms 1.419ms 3.935us 17.42% 3.935us 3.935us 1
|
| 4164 |
+
aten::slice 1.42% 25.910us 1.76% 32.089us 5.348us 0.000us 0.00% 0.000us 0.000us 6
|
| 4165 |
+
aten::as_strided 0.34% 6.179us 0.34% 6.179us 1.030us 0.000us 0.00% 0.000us 0.000us 6
|
| 4166 |
+
cudaLaunchKernel 10.54% 192.606us 10.54% 192.606us 32.101us 0.000us 0.00% 0.000us 0.000us 6
|
| 4167 |
+
cudaDeviceSynchronize 0.26% 4.790us 0.26% 4.790us 4.790us 0.000us 0.00% 0.000us 0.000us 1
|
| 4168 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4169 |
+
Self CPU time total: 1.827ms
|
| 4170 |
+
Self CUDA time total: 22.591us
|
| 4171 |
|
| 4172 |
|
| 4173 |
impl wl p50(ms) ok
|
|
|
|
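Each p50(ms) entry in the summary above is a per-workload median latency. A minimal sketch of how it could be recomputed from a benchmark JSONL artifact — assuming the lat_ms.raw_times field carried by the artifacts in this repo; the actual aggregation lives in cells/combine.py and may differ:

```python
import json
import statistics

def p50_table(jsonl_path: str) -> dict[tuple[str, str], float]:
    """Median latency in ms, keyed by (impl, workload name)."""
    table = {}
    with open(jsonl_path) as f:
        for line in f:
            rec = json.loads(line)
            key = (rec["impl"], rec["wl"]["name"])
            table[key] = statistics.median(rec["lat_ms"]["raw_times"])
    return table

for (impl, wl), p50 in sorted(p50_table(
        "activation/impls/artifacts/benchmark/activation.jsonl").items()):
    print(f"{impl:<20} {wl:<18} {p50:.2f}")
```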
| 4184 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 4185 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4186 |
<div class="uv-logs-content" style="display: none;">
|
| 4187 |
+
Installed 37 packages in 192ms
|
| 4188 |
</div>
|
| 4189 |
</div>
|
| 4190 |
<div class="cell-artifacts">
|
activation/results/artifacts/combine/latency.svg
CHANGED

(binary plot artifact tracked via Git LFS; the viewer shows only "Git LFS Details" for the old and new revisions)

activation/results/combined_results.html
CHANGED
|
@@ -3872,7 +3872,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3872 |
<rdf:RDF>
|
| 3873 |
<ns2:Work>
|
| 3874 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 3875 |
-
<dc:date>2025-10-… (old timestamp truncated by the diff viewer)
|
| 3876 |
<dc:format>image/svg+xml</dc:format>
|
| 3877 |
<dc:creator>
|
| 3878 |
<ns2:Agent>
|
|
@@ -4021,83 +4021,83 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
- (old y-axis grid paths, tick marks, and tick labels of the latency plot; coordinates truncated by the diff viewer — the updated elements appear in the + hunk further below)
@@ -4105,37 +4105,37 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
- (old data-point paths for the hf-kernels-swiglu and torch-eager series; coordinates truncated by the diff viewer — the updated series appear in the + hunk further below)
@@ -4155,25 +4155,25 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
- (old legend box, marker, and label positions; coordinates truncated by the diff viewer — the updated legend appears in the + hunk further below)
@@ -4193,7 +4193,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 4193 |
<span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
|
| 4194 |
<span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4195 |
</span> |
|
| 4196 |
-
Cell: combine | 4.… (old timing truncated by the diff viewer)
|
| 4197 |
| <button class="run-btn" onclick="runCell('combine')">▶ run</button>
|
| 4198 |
<button class="copy-btn" onclick="copyCell('combine')">Copy</button>
|
| 4199 |
<a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -4284,7 +4284,7 @@ COMBINED BENCHMARK SUMMARY
|
|
| 4284 |
impl wl p50(ms) ok
|
| 4285 |
hf_kernels_swiglu cuda_T128_D1024 0.03 True
|
| 4286 |
hf_kernels_swiglu cuda_T128_D2048 0.03 True
|
| 4287 |
-
hf_kernels_swiglu cuda_T128_D768 0.… (old value truncated by the diff viewer)
|
| 4288 |
hf_kernels_swiglu cuda_T256_D1024 0.03 True
|
| 4289 |
hf_kernels_swiglu cuda_T256_D2048 0.03 True
|
| 4290 |
hf_kernels_swiglu cuda_T256_D768 0.03 True
|
|
@@ -4319,7 +4319,7 @@ Implementations included:
|
|
| 4319 |
<div class="uv-install-logs" id="uv-logs-combine">
|
| 4320 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4321 |
<div class="uv-logs-content" style="display: none;">
|
| 4322 |
-
Installed 37 packages in … (old duration truncated by the diff viewer)
|
| 4323 |
</div>
|
| 4324 |
</div>
|
| 4325 |
<div class="cell-artifacts">
|
|
@@ -4332,7 +4332,7 @@ Installed 37 packages in 250ms
|
|
| 4332 |
<rdf:RDF>
|
| 4333 |
<ns2:Work>
|
| 4334 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 4335 |
-
<dc:date>2025-10-… (old timestamp truncated by the diff viewer)
|
| 4336 |
<dc:format>image/svg+xml</dc:format>
|
| 4337 |
<dc:creator>
|
| 4338 |
<ns2:Agent>
|
|
@@ -4481,83 +4481,83 @@ Installed 37 packages in 250ms
- (old y-axis grid paths, tick marks, and tick labels of the second embedded latency plot; coordinates truncated by the diff viewer — the updated elements appear in the + hunk further below)
@@ -4565,37 +4565,37 @@ Installed 37 packages in 250ms
- (old data-point paths for the hf-kernels-swiglu and torch-eager series; coordinates truncated by the diff viewer — the updated series appear in the + hunk further below)
@@ -4615,25 +4615,25 @@ Installed 37 packages in 250ms
- (old legend box, marker, and label positions; coordinates truncated by the diff viewer — the updated legend appears in the + hunk further below)
|
|
| 3872 |
<rdf:RDF>
|
| 3873 |
<ns2:Work>
|
| 3874 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 3875 |
+
<dc:date>2025-10-28T14:09:13.211569</dc:date>
|
| 3876 |
<dc:format>image/svg+xml</dc:format>
|
| 3877 |
<dc:creator>
|
| 3878 |
<ns2:Agent>
|
|
|
|
| 4021 |
<g id="matplotlib.axis_2">
|
| 4022 |
<g id="ytick_1">
|
| 4023 |
<g id="grid-y--2" class="grid grid-y">
|
| 4024 |
+
<path d="M 60.23 416.825206 L 847.294169 416.825206 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4025 |
</g>
|
| 4026 |
<g id="line2d_10">
|
| 4027 |
<defs>
|
| 4028 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4029 |
</defs>
|
| 4030 |
<g>
|
| 4031 |
+
<use ns4:href="#m0fca2865ba" x="60.23" y="416.825206" style="stroke: #000000; stroke-width: 0.8" />
|
| 4032 |
</g>
|
| 4033 |
</g>
|
| 4034 |
<g id="text_10">
|
| 4035 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="420.624425" transform="rotate(-0 53.23 420.624425)">0.025</text>
|
| 4036 |
</g>
|
| 4037 |
</g>
|
| 4038 |
<g id="ytick_2">
|
| 4039 |
<g id="grid-y--3" class="grid grid-y">
|
| 4040 |
+
<path d="M 60.23 346.161452 L 847.294169 346.161452 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4041 |
</g>
|
| 4042 |
<g id="line2d_11">
|
| 4043 |
<g>
|
| 4044 |
+
<use ns4:href="#m0fca2865ba" x="60.23" y="346.161452" style="stroke: #000000; stroke-width: 0.8" />
|
| 4045 |
</g>
|
| 4046 |
</g>
|
| 4047 |
<g id="text_11">
|
| 4048 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="349.960671" transform="rotate(-0 53.23 349.960671)">0.030</text>
|
| 4049 |
</g>
|
| 4050 |
</g>
|
| 4051 |
<g id="ytick_3">
|
| 4052 |
<g id="grid-y--4" class="grid grid-y">
|
| 4053 |
+
<path d="M 60.23 275.497698 L 847.294169 275.497698 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4054 |
</g>
|
| 4055 |
<g id="line2d_12">
|
| 4056 |
<g>
|
| 4057 |
+
<use ns4:href="#m0fca2865ba" x="60.23" y="275.497698" style="stroke: #000000; stroke-width: 0.8" />
|
| 4058 |
</g>
|
| 4059 |
</g>
|
| 4060 |
<g id="text_12">
|
| 4061 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="279.296917" transform="rotate(-0 53.23 279.296917)">0.035</text>
|
| 4062 |
</g>
|
| 4063 |
</g>
|
| 4064 |
<g id="ytick_4">
|
| 4065 |
<g id="grid-y--5" class="grid grid-y">
|
| 4066 |
+
<path d="M 60.23 204.833944 L 847.294169 204.833944 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4067 |
</g>
|
| 4068 |
<g id="line2d_13">
|
| 4069 |
<g>
|
| 4070 |
+
<use ns4:href="#m0fca2865ba" x="60.23" y="204.833944" style="stroke: #000000; stroke-width: 0.8" />
|
| 4071 |
</g>
|
| 4072 |
</g>
|
| 4073 |
<g id="text_13">
|
| 4074 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="208.633163" transform="rotate(-0 53.23 208.633163)">0.040</text>
|
| 4075 |
</g>
|
| 4076 |
</g>
|
| 4077 |
<g id="ytick_5">
|
| 4078 |
<g id="grid-y--6" class="grid grid-y">
|
| 4079 |
+
<path d="M 60.23 134.170191 L 847.294169 134.170191 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4080 |
</g>
|
| 4081 |
<g id="line2d_14">
|
| 4082 |
<g>
|
| 4083 |
+
<use ns4:href="#m0fca2865ba" x="60.23" y="134.170191" style="stroke: #000000; stroke-width: 0.8" />
|
| 4084 |
</g>
|
| 4085 |
</g>
|
| 4086 |
<g id="text_14">
|
| 4087 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="137.969409" transform="rotate(-0 53.23 137.969409)">0.045</text>
|
| 4088 |
</g>
|
| 4089 |
</g>
|
| 4090 |
<g id="ytick_6">
|
| 4091 |
<g id="grid-y--7" class="grid grid-y">
|
| 4092 |
+
<path d="M 60.23 63.506437 L 847.294169 63.506437 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4093 |
</g>
|
| 4094 |
<g id="line2d_15">
|
| 4095 |
<g>
|
| 4096 |
+
<use ns4:href="#m0fca2865ba" x="60.23" y="63.506437" style="stroke: #000000; stroke-width: 0.8" />
|
| 4097 |
</g>
|
| 4098 |
</g>
|
| 4099 |
<g id="text_15">
|
| 4100 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="67.305655" transform="rotate(-0 53.23 67.305655)">0.050</text>
|
| 4101 |
</g>
|
| 4102 |
</g>
|
| 4103 |
<g id="label--y" class="ylabel">
|
|
|
|
| 4105 |
</g>
|
| 4106 |
</g>
|
| 4107 |
<g id="series--hf-kernels-swiglu" class="series">
|
| 4108 |
+
<path d="M 96.005644 451.16779 L 185.444754 370.031668 L 274.883864 370.596978 L 364.322974 386.708314 L 453.762084 392.220086 L 543.201194 399.569118 L 632.640304 388.969554 L 722.079415 403.526288 L 811.518525 390.241503 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 4109 |
<defs>
|
| 4110 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4111 |
</defs>
|
| 4112 |
<g clip-path="url(#p620c7d392f)">
|
| 4113 |
<use ns4:href="#md7efaf3aec" x="96.005644" y="451.16779" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4114 |
+
<use ns4:href="#md7efaf3aec" x="185.444754" y="370.031668" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4115 |
+
<use ns4:href="#md7efaf3aec" x="274.883864" y="370.596978" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4116 |
+
<use ns4:href="#md7efaf3aec" x="364.322974" y="386.708314" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4117 |
+
<use ns4:href="#md7efaf3aec" x="453.762084" y="392.220086" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4118 |
+
<use ns4:href="#md7efaf3aec" x="543.201194" y="399.569118" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4119 |
+
<use ns4:href="#md7efaf3aec" x="632.640304" y="388.969554" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4120 |
+
<use ns4:href="#md7efaf3aec" x="722.079415" y="403.526288" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4121 |
+
<use ns4:href="#md7efaf3aec" x="811.518525" y="390.241503" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4122 |
</g>
|
| 4123 |
</g>
|
| 4124 |
<g id="series--torch-eager" class="series">
|
| 4125 |
+
<path d="M 96.005644 166.37873 L 185.444754 47.08418 L 274.883864 54.857193 L 364.322974 60.807081 L 453.762084 69.569387 L 543.201194 78.176231 L 632.640304 66.44605 L 722.079415 63.902153 L 811.518525 71.109857 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 4126 |
<defs>
|
| 4127 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4128 |
</defs>
|
| 4129 |
<g clip-path="url(#p620c7d392f)">
|
| 4130 |
+
<use ns4:href="#m9b8c54d372" x="96.005644" y="166.37873" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4131 |
+
<use ns4:href="#m9b8c54d372" x="185.444754" y="47.08418" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4132 |
+
<use ns4:href="#m9b8c54d372" x="274.883864" y="54.857193" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4133 |
+
<use ns4:href="#m9b8c54d372" x="364.322974" y="60.807081" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4134 |
+
<use ns4:href="#m9b8c54d372" x="453.762084" y="69.569387" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4135 |
+
<use ns4:href="#m9b8c54d372" x="543.201194" y="78.176231" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4136 |
+
<use ns4:href="#m9b8c54d372" x="632.640304" y="66.44605" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4137 |
+
<use ns4:href="#m9b8c54d372" x="722.079415" y="63.902153" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4138 |
+
<use ns4:href="#m9b8c54d372" x="811.518525" y="71.109857" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4139 |
</g>
|
| 4140 |
</g>
|
| 4141 |
<g id="patch_3">
|
|
|
|
| 4155 |
</g>
|
| 4156 |
<g id="legend" class="legend">
|
| 4157 |
<g id="patch_7">
|
| 4158 |
+
<path d="M 720.811356 466.37197 L 840.294169 466.37197 Q 842.294169 466.37197 842.294169 464.37197 L 842.294169 435.45947 Q 842.294169 433.45947 840.294169 433.45947 L 720.811356 433.45947 Q 718.811356 433.45947 718.811356 435.45947 L 718.811356 464.37197 Q 718.811356 466.37197 720.811356 466.37197 L 720.811356 466.37197 z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
|
| 4159 |
</g>
|
| 4160 |
<g id="line2d_16">
|
| 4161 |
+
<path d="M 722.811356 441.557908 L 732.811356 441.557908 L 742.811356 441.557908 " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 4162 |
<g>
|
| 4163 |
+
<use ns4:href="#md7efaf3aec" x="732.811356" y="441.557908" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4164 |
</g>
|
| 4165 |
</g>
|
| 4166 |
<g id="legend-label--hf-kernels-swiglu" class="legend">
|
| 4167 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="750.811356" y="445.057908" transform="rotate(-0 750.811356 445.057908)">hf_kernels_swiglu</text>
|
| 4168 |
</g>
|
| 4169 |
<g id="line2d_17">
|
| 4170 |
+
<path d="M 722.811356 456.514158 L 732.811356 456.514158 L 742.811356 456.514158 " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 4171 |
<g>
|
| 4172 |
+
<use ns4:href="#m9b8c54d372" x="732.811356" y="456.514158" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4173 |
</g>
|
| 4174 |
</g>
|
| 4175 |
<g id="legend-label--torch-eager" class="legend">
|
| 4176 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="750.811356" y="460.014158" transform="rotate(-0 750.811356 460.014158)">torch_eager</text>
|
| 4177 |
</g>
|
| 4178 |
</g>
|
| 4179 |
</g>
|
|
|
|
| 4193 |
<span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
|
| 4194 |
<span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4195 |
</span> |
|
| 4196 |
+
Cell: combine | 4.28s
|
| 4197 |
| <button class="run-btn" onclick="runCell('combine')">▶ run</button>
|
| 4198 |
<button class="copy-btn" onclick="copyCell('combine')">Copy</button>
|
| 4199 |
<a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
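The combine cell also renders latency.svg — the two series visible in the SVG markup above. A minimal matplotlib sketch of such a plot, assuming a {impl: [(workload, p50_ms), ...]} mapping like the one built by the p50 sketch earlier; the real rendering code is cells/combine.py:

```python
import matplotlib
matplotlib.use("Agg")  # headless rendering, as in a CI benchmark run
import matplotlib.pyplot as plt

def plot_latency(series: dict[str, list[tuple[str, float]]],
                 out: str = "latency.svg") -> None:
    fig, ax = plt.subplots(figsize=(9, 5))
    for impl, points in series.items():
        # One line per implementation, one marker per workload.
        ax.plot([wl for wl, _ in points], [ms for _, ms in points],
                marker="o", label=impl)
    ax.set_ylabel("p50 latency (ms)")
    ax.legend()
    fig.savefig(out)  # the .svg extension selects the SVG backend
```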
| 4284 |
impl wl p50(ms) ok
|
| 4285 |
hf_kernels_swiglu cuda_T128_D1024 0.03 True
|
| 4286 |
hf_kernels_swiglu cuda_T128_D2048 0.03 True
|
| 4287 |
+
hf_kernels_swiglu cuda_T128_D768 0.02 True
|
| 4288 |
hf_kernels_swiglu cuda_T256_D1024 0.03 True
|
| 4289 |
hf_kernels_swiglu cuda_T256_D2048 0.03 True
|
| 4290 |
hf_kernels_swiglu cuda_T256_D768 0.03 True
|
|
|
|
| 4319 |
<div class="uv-install-logs" id="uv-logs-combine">
|
| 4320 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4321 |
<div class="uv-logs-content" style="display: none;">
|
| 4322 |
+
Installed 37 packages in 195ms
|
| 4323 |
</div>
|
| 4324 |
</div>
|
| 4325 |
<div class="cell-artifacts">
|
|
|
|
(regenerated latency plot: the rest of this change is updated SVG markup with a new metadata date of 2025-10-28T14:09:13, format image/svg+xml, y-axis ticks from 0.025 to 0.050, redrawn data-point paths for the two plotted series, and the matching legend entries "hf_kernels_swiglu" and "torch_eager")
causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl
ADDED
|
@@ -0,0 +1,24 @@
{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S128_W2", "batch": 2, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.06712200001857127, "p50": 0.06883200001084333, "p90": 0.06976199995278876, "mean": 0.06901199997173535, "iqr": 0.0014600000213249587, "raw_times": [0.06976199995278876, 0.07104199994500959, 0.06712200001857127, 0.0683019999314638, 0.06883200001084333], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0738530000035098, "peak_bytes": 295936, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S128_W4", "batch": 2, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08455299996512622, "p50": 0.08599400007369695, "p90": 0.0868530000843748, "mean": 0.08612520005044644, "iqr": 0.0014299999975264654, "raw_times": [0.08780300004218589, 0.08455299996512622, 0.0868530000843748, 0.08542300008684833, 0.08599400007369695], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08941300006881647, "peak_bytes": 296448, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S512_W2", "batch": 2, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08494299993344612, "p50": 0.08714299997336639, "p90": 0.08724299993900786, "mean": 0.086546999955317, "iqr": 0.0020200000108161476, "raw_times": [0.08522299992819171, 0.08714299997336639, 0.08818300000257295, 0.08724299993900786, 0.08494299993344612], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09105300000555872, "peak_bytes": 1180672, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S512_W4", "batch": 2, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08327299997290538, "p50": 0.084122999965075, "p90": 0.08580299993354856, "mean": 0.08452299998680246, "iqr": 0.0023699999474047218, "raw_times": [0.08327299997290538, 0.084122999965075, 0.08598300007633952, 0.08580299993354856, 0.08343299998614384], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08891300001323543, "peak_bytes": 1181184, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S2048_W2", "batch": 2, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08298299997022696, "p50": 0.08508299993081891, "p90": 0.08600299997851835, "mean": 0.0849267999683434, "iqr": 0.0016210000239880173, "raw_times": [0.08298299997022696, 0.08508299993081891, 0.08600299997851835, 0.08438199995453033, 0.08618300000762247], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08780300004218589, "peak_bytes": 4719616, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S2048_W4", "batch": 2, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08270299997548136, "p50": 0.08315299999139825, "p90": 0.0846430000365217, "mean": 0.08407499999520951, "iqr": 0.0019010000187336118, "raw_times": [0.08315299999139825, 0.08713399995485815, 0.08270299997548136, 0.08274200001778809, 0.0846430000365217], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08981299993138236, "peak_bytes": 4720128, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S128_W2", "batch": 2, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08372299998882227, "p50": 0.08510199995725998, "p90": 0.08608299992829416, "mean": 0.08701479998762807, "iqr": 0.0011499998890940333, "raw_times": [0.08493300003920012, 0.09523300002456381, 0.08510199995725998, 0.08372299998882227, 0.08608299992829416], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08923300003971235, "peak_bytes": 9461760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S128_W4", "batch": 2, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08285199999136239, "p50": 0.08483300007355865, "p90": 0.08511300006830425, "mean": 0.08449480001218035, "iqr": 0.0016500000583619112, "raw_times": [0.08285199999136239, 0.08346300000994233, 0.08483300007355865, 0.08621299991773412, 0.08511300006830425], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08870299996033282, "peak_bytes": 9478144, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S512_W2", "batch": 2, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08278300003894401, "p50": 0.08427300008406746, "p90": 0.08444299999155191, "mean": 0.08422300002166594, "iqr": 0.0002599999788799323, "raw_times": [0.08444299999155191, 0.08418300001267198, 0.08278300003894401, 0.08543299998109433, 0.08427300008406746], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08903299999474257, "peak_bytes": 37773312, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S512_W4", "batch": 2, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08352199995442788, "p50": 0.0842329999386493, "p90": 0.08553300006042264, "mean": 0.08496079999531503, "iqr": 0.0014400000054592965, "raw_times": [0.08409300005496334, 0.08742299996811198, 0.08553300006042264, 0.08352199995442788, 0.0842329999386493], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08985400006622513, "peak_bytes": 37789696, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S2048_W2", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.14414499992199126, "p50": 0.14512600000671227, "p90": 0.14515400005166157, "mean": 0.1465472000063528, "iqr": 0.0008580000212532468, "raw_times": [0.14512600000671227, 0.14414499992199126, 0.14429600003040832, 0.15401500002099056, 0.14515400005166157], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.14571500003057736, "peak_bytes": 151019520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S2048_W4", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.16020600003230356, "p50": 0.16135600003508443, "p90": 0.16139600006681576, "mean": 0.16140360005465482, "iqr": 0.00029099999210302485, "raw_times": [0.16139600006681576, 0.1629550000643576, 0.16110500007471273, 0.16020600003230356, 0.16135600003508443], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.1623660000404925, "peak_bytes": 151035904, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S128_W2", "batch": 4, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.07979300005445111, "p50": 0.08039299996198679, "p90": 0.08136300004935038, "mean": 0.08070500002759218, "iqr": 0.001150000002780871, "raw_times": [0.0802130000465695, 0.0817630000256031, 0.07979300005445111, 0.08039299996198679, 0.08136300004935038], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0855329999467358, "peak_bytes": 33727488, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S128_W4", "batch": 4, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0800829999434427, "p50": 0.08147299990923784, "p90": 0.08197300007850572, "mean": 0.08146099996793055, "iqr": 0.00109000018255756, "raw_times": [0.0800829999434427, 0.08197300007850572, 0.08147299990923784, 0.08289300001251831, 0.08088299989594816], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08291199992527254, "peak_bytes": 591360, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S512_W2", "batch": 4, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0799729999698684, "p50": 0.08137199995417177, "p90": 0.081513000054656, "mean": 0.08127659998535819, "iqr": 0.0006500000608866685, "raw_times": [0.0799729999698684, 0.08266199995432544, 0.081513000054656, 0.08086299999376934, 0.08137199995417177], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08939400004237541, "peak_bytes": 2360320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S512_W4", "batch": 4, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08067300007041922, "p50": 0.08162300002823031, "p90": 0.08189199991193163, "mean": 0.08365860001049441, "iqr": 0.0008099999604382901, "raw_times": [0.08067300007041922, 0.08108199995149334, 0.08189199991193163, 0.08162300002823031, 0.09302300009039755], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08415299998887349, "peak_bytes": 2360832, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S2048_W2", "batch": 4, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0806030000148894, "p50": 0.08186299999124458, "p90": 0.08199299998068454, "mean": 0.08162900001025264, "iqr": 0.001009999891721236, "raw_times": [0.08270299997548136, 0.08186299999124458, 0.0806030000148894, 0.08199299998068454, 0.08098300008896331], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.10199300004387624, "peak_bytes": 9438208, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S2048_W4", "batch": 4, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08040199998049502, "p50": 0.08168299996214046, "p90": 0.08185199999388715, "mean": 0.08171659999334224, "iqr": 0.0013889999763705418, "raw_times": [0.0804630000175166, 0.08418300001267198, 0.08168299996214046, 0.08040199998049502, 0.08185199999388715], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08522300004187855, "peak_bytes": 9438720, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S128_W2", "batch": 4, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08097300008103048, "p50": 0.08150300004672317, "p90": 0.08173299988811777, "mean": 0.08153900000706926, "iqr": 0.0005599998758043512, "raw_times": [0.08117300001231342, 0.08231300000716146, 0.08150300004672317, 0.08173299988811777, 0.08097300008103048], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08440300007350743, "peak_bytes": 18931712, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S128_W4", "batch": 4, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0802130000465695, "p50": 0.08124300006784324, "p90": 0.08242299998073577, "mean": 0.08162480000919459, "iqr": 0.0012000000424450263, "raw_times": [0.0802130000465695, 0.08302200001253368, 0.08242299998073577, 0.08124300006784324, 0.08122299993829074], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08460300000479037, "peak_bytes": 18948096, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S512_W2", "batch": 4, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09226300005593657, "p50": 0.09320300000581483, "p90": 0.0934630000983816, "mean": 0.09316100004070904, "iqr": 0.0007800000503266347, "raw_times": [0.09419299999535724, 0.09320300000581483, 0.0934630000983816, 0.09226300005593657, 0.09268300004805496], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0951240000404141, "peak_bytes": 75522048, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S512_W4", "batch": 4, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09887299995625654, "p50": 0.09917300008055463, "p90": 0.09990300009121711, "mean": 0.09939520000443736, "iqr": 0.0009100001534534385, "raw_times": [0.09887299995625654, 0.09917300008055463, 0.09990300009121711, 0.10003399995639484, 0.09899299993776367], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.1023739999936879, "peak_bytes": 75538432, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S2048_W2", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.4842959999677987, "p50": 0.4860569999891595, "p90": 0.4878769999550059, "mean": 0.48646659997757524, "iqr": 0.002959999960694404, "raw_times": [0.4849169999943115, 0.4860569999891595, 0.4878769999550059, 0.4842959999677987, 0.4891859999816006], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.4877669999814316, "peak_bytes": 302014464, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S2048_W4", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.4968179999877975, "p50": 0.49805800006197387, "p90": 0.4990780000753148, "mean": 0.4983496000022569, "iqr": 0.001141000097959477, "raw_times": [0.4979369999773553, 0.49985699990884314, 0.4990780000753148, 0.49805800006197387, 0.4968179999877975], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.49727700002222264, "peak_bytes": 302030848, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
causal_conv1d/impls/cells/benchmark.py
ADDED
|
@@ -0,0 +1,40 @@
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "numpy",
# "torch==2.8.0",
# "kernels-benchmark-tools",
# ]
#
# [tool.uv.sources]
# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
# ///
import torch
import torch.nn.functional as F
import sys
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark


def torch_causal_conv1d(input_tensor, weight, bias):
    # Convert to weight dtype for computation
    x = input_tensor.to(weight.dtype)
    dim = weight.shape[0]
    width = weight.shape[1]
    seqlen = input_tensor.shape[-1]

    # Depthwise causal conv1d using PyTorch
    out = F.conv1d(x, weight.unsqueeze(1), bias, padding=width - 1, groups=dim)

    # Truncate to original sequence length
    out = out[..., :seqlen]

    # Convert back to original dtype
    return out.to(input_tensor.dtype)


run_benchmark(
    kernel_type=KernelTypeEnum.CAUSAL_CONV1D,
    impl_name="torch_eager",
    impl_tags={"family": "pytorch", "backend": "eager"},
    impl_func=torch_causal_conv1d,
)
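The convolution above is causal because F.conv1d pads width - 1 zeros on both sides and the result is then sliced back to the original seqlen, so output position t only depends on inputs at positions t - width + 1 through t. A minimal sketch (shapes borrowed from the cuda_B2_D64_S128_W4 workload above, values arbitrary) that checks this property:

import torch
import torch.nn.functional as F

batch, dim, seqlen, width = 2, 64, 128, 4
x = torch.randn(batch, dim, seqlen)
w = torch.randn(dim, width)
b = torch.zeros(dim)

def causal_conv(xin):
    # Same padding-and-truncate trick as torch_causal_conv1d above.
    out = F.conv1d(xin, w.unsqueeze(1), b, padding=width - 1, groups=dim)
    return out[..., :seqlen]

y = causal_conv(x)
x_future = x.clone()
x_future[..., 64:] = 0.0                                   # perturb only positions t >= 64
y_future = causal_conv(x_future)
assert torch.allclose(y[..., :64], y_future[..., :64])     # outputs for t < 64 are unchanged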
causal_conv1d/impls/cells/nv.py
ADDED
|
@@ -0,0 +1,2 @@
import subprocess
print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
causal_conv1d/impls/hf_kernels_causal_conv1d.html
ADDED
The diff for this file is too large to render. See raw diff.
causal_conv1d/impls/index.html
ADDED
|
@@ -0,0 +1,89 @@
<!DOCTYPE html>
<html>
<head>
<meta charset='UTF-8'>
<meta name='viewport' content='width=device-width, initial-scale=1.0'>
<title>Index of /causal_conv1d/impls</title>
<style>
:root { --bg-primary: #0a0a0a; --bg-secondary: #121212; --bg-tertiary: #181818; --text-primary: #e0e0e0; --text-secondary: #888888; --text-link: #64b5f6; --border-primary: #2a2a2a; }
body { font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif; background: var(--bg-primary); color: var(--text-primary); margin: 0; padding: 16px; max-width: 900px; margin: 0 auto; }
.controls { display: flex; justify-content: flex-end; margin-bottom: 1rem; }
.back-button { background: var(--bg-secondary); border: 1px solid var(--border-primary); padding: 8px 12px; border-radius: 4px; color: var(--text-secondary); cursor: pointer; font-size: 0.9rem; text-decoration: none; display: inline-block; }
.back-button:hover { color: var(--text-primary); background: var(--bg-tertiary); }
h1 { font-size: 1.5em; margin: 1rem 0; color: var(--text-primary); border-bottom: 1px solid var(--border-primary); padding-bottom: 0.5rem; }
ul { list-style-type: none; padding: 0; }
li { margin: 0; border-bottom: 1px solid var(--border-primary); }
li:last-child { border-bottom: none; }
a { display: block; padding: 0.75rem 0.5rem; text-decoration: none; color: var(--text-link); transition: background 0.2s ease; }
a:hover { background: var(--bg-secondary); }
.dir { font-weight: 500; }
</style>
</head>
<body>
<div class='controls'>
<a href='../index.html' class='back-button'>← back</a>
</div>
<h1>Index of /causal_conv1d/impls</h1>
<ul>
<li><a href='hf_kernels_causal_conv1d.html' class='file'>hf_kernels_causal_conv1d.html</a></li>
<li><a href='torch_causal_conv1d.html' class='file'>torch_causal_conv1d.html</a></li>
</ul>
</body>
</html>
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
causal_conv1d/index.html
ADDED
|
@@ -0,0 +1,89 @@
|
<!DOCTYPE html>
<html>
<head>
<meta charset='UTF-8'>
<meta name='viewport' content='width=device-width, initial-scale=1.0'>
<title>Index of /causal_conv1d</title>
<style>
/* same rules as causal_conv1d/impls/index.html above */
</style>
</head>
<body>
<div class='controls'>
<a href='../index.html' class='back-button'>← back</a>
</div>
<h1>Index of /causal_conv1d</h1>
<ul>
<li><a href='impls/index.html' class='dir'>impls/</a></li>
<li><a href='results/index.html' class='dir'>results/</a></li>
</ul>
</body>
</html>
causal_conv1d/results/artifacts/combine/latency.svg
ADDED
Git LFS Details (no inline diff rendered)
causal_conv1d/results/cells/combine.py
ADDED
|
@@ -0,0 +1,26 @@
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "numpy",
# "torch==2.8.0",
# "kernels-benchmark-tools",
# "matplotlib",
# ]
#
# [tool.uv.sources]
# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
# ///
from kernels_benchmark_tools.core.visuals import generate_combined_results

# Map display names to uvnote environment variables
cache_env_map = {
    "HF Kernels Causal Conv1D": "UVNOTE_FILE_HF_KERNELS_CAUSAL_CONV1D_BENCHMARK",
    "PyTorch Causal Conv1D": "UVNOTE_FILE_TORCH_CAUSAL_CONV1D_BENCHMARK",
}

# Generate combined results with visualization
generate_combined_results(
    cache_env_map=cache_env_map,
    output_filename="causal_conv1d.jsonl",
    svg_filename="latency.svg"
)
causal_conv1d/results/combined_results.html
ADDED
The diff for this file is too large to render. See raw diff.
causal_conv1d/results/index.html
ADDED
|
@@ -0,0 +1,88 @@
<!DOCTYPE html>
<html>
<head>
<meta charset='UTF-8'>
<meta name='viewport' content='width=device-width, initial-scale=1.0'>
<title>Index of /causal_conv1d/results</title>
<style>
/* same rules as causal_conv1d/impls/index.html above */
</style>
</head>
<body>
<div class='controls'>
<a href='../index.html' class='back-button'>← back</a>
</div>
<h1>Index of /causal_conv1d/results</h1>
<ul>
<li><a href='combined_results.html' class='file'>combined_results.html</a></li>
</ul>
</body>
</html>
flash_attn/impls/artifacts/benchmark/attention.jsonl
CHANGED
|
@@ -1,6 +1,6 @@
- {"ts": "2025-10-
- {"ts": "2025-10-
- {"ts": "2025-10-
- {"ts": "2025-10-
- {"ts": "2025-10-
- {"ts": "2025-10-
{"ts": "2025-10-28T14:08:53Z", "run": "5a355db38c804430a5f59b9c06c46f52", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L128_bfloat16", "batch": 1, "seq_len": 4224, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.9106109999947876, "p50": 0.9171110000352201, "p90": 0.9204320000435473, "mean": 0.9179216000347878, "iqr": 0.005419999979494605, "raw_times": [0.9171110000352201, 0.9150120000640527, 0.9106109999947876, 0.9204320000435473, 0.9264420000363316], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.9176309999929799, "peak_bytes": 295567360, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003604888916015625, "mse": 2.8014183044433594e-06, "ref": "sdpa_math_fp32"}, "err": null}
{"ts": "2025-10-28T14:08:53Z", "run": "5a355db38c804430a5f59b9c06c46f52", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L256_bfloat16", "batch": 1, "seq_len": 4352, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.9576329999845257, "p50": 0.960063999968952, "p90": 0.9623629999850891, "mean": 0.9611931999643275, "iqr": 0.0033900000744324643, "raw_times": [0.9589729999106567, 0.9576329999845257, 0.960063999968952, 0.9669329999724141, 0.9623629999850891], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.9673530000782193, "peak_bytes": 304742400, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
{"ts": "2025-10-28T14:08:53Z", "run": "5a355db38c804430a5f59b9c06c46f52", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L320_bfloat16", "batch": 1, "seq_len": 4416, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.0019650000003821, "p50": 1.0193159999971613, "p90": 1.0211459999709405, "mean": 1.015251600006195, "iqr": 0.01198099994326185, "raw_times": [1.0019650000003821, 1.0091650000276786, 1.024666000034813, 1.0193159999971613, 1.0211459999709405], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.009233999980097, "peak_bytes": 307494912, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
{"ts": "2025-10-28T14:08:53Z", "run": "5a355db38c804430a5f59b9c06c46f52", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L384_bfloat16", "batch": 1, "seq_len": 4480, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.0176959999625979, "p50": 1.0199849999708022, "p90": 1.025695000066662, "mean": 1.0218714000075124, "iqr": 0.006820000066909415, "raw_times": [1.0271060000377474, 1.0176959999625979, 1.0188749999997526, 1.0199849999708022, 1.025695000066662], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.027405000058934, "peak_bytes": 311296000, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
{"ts": "2025-10-28T14:08:53Z", "run": "5a355db38c804430a5f59b9c06c46f52", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L448_bfloat16", "batch": 1, "seq_len": 4544, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.1665810000067722, "p50": 1.1845809999613266, "p90": 1.185440999961429, "mean": 1.1787729999923613, "iqr": 0.01419000000169035, "raw_times": [1.1712509999597387, 1.1665810000067722, 1.18601100007254, 1.1845809999613266, 1.185440999961429], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.1787800000320203, "peak_bytes": 315621376, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
{"ts": "2025-10-28T14:08:53Z", "run": "5a355db38c804430a5f59b9c06c46f52", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L512_bfloat16", "batch": 1, "seq_len": 4608, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.1722899999995207, "p50": 1.1832310000272628, "p90": 1.1854509999693619, "mean": 1.181276799979969, "iqr": 0.008630000024822948, "raw_times": [1.1885909999591604, 1.1854509999693619, 1.176820999944539, 1.1832310000272628, 1.1722899999995207], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.1782799999764393, "peak_bytes": 319946752, "ok": true, "absmax": 0.125, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.125, "mae": 0.000362396240234375, "mse": 2.8014183044433594e-06, "ref": "sdpa_math_fp32"}, "err": null}
flash_attn/impls/cells/benchmark.py
CHANGED
|
@@ -3,9 +3,8 @@
 # dependencies = [
 # "numpy",
 # "torch==2.8.0",
-# "kernels",
 # "kernels-benchmark-tools",
-# "
+# "kernels",
 # ]
 #
 # [tool.uv.sources]
@@ -16,18 +15,17 @@ import sys
 from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
 from kernels import get_kernel

-# Load the
-
+# Load the flash attention 3 kernel
+hf_kernels_flash_attn3 = get_kernel("kernels-community/flash-attn3")


-def
-
-    return hf_kernels_sage_attn.fwd(query, key, value, is_causal=False)[0]
+def hf_flash_attention3(query, key, value):
+    return hf_kernels_flash_attn3.flash_attn_func(query, key, value, causal=False)[0]


 run_benchmark(
     kernel_type=KernelTypeEnum.ATTENTION,
-    impl_name="
-    impl_tags={"family": "
-    impl_func=
+    impl_name="hf_kernels_flash_attn3",
+    impl_tags={"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"},
+    impl_func=hf_flash_attention3,
 )
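For context, the new wrapper is exercised on the workloads recorded in attention.jsonl above (batch 1, sequence lengths from 4224 to 4608, 24 heads, head_dim 128, bfloat16 on CUDA). A hedged sketch of calling it directly, assuming the usual flash-attn (batch, seq, heads, head_dim) tensor layout:

import torch
from kernels import get_kernel

hf_kernels_flash_attn3 = get_kernel("kernels-community/flash-attn3")

# Smallest workload from attention.jsonl: batch=1, seq_len=4224, heads=24, head_dim=128.
q = torch.randn(1, 4224, 24, 128, device="cuda", dtype=torch.bfloat16)
k = torch.randn_like(q)
v = torch.randn_like(q)

out = hf_kernels_flash_attn3.flash_attn_func(q, k, v, causal=False)[0]
print(out.shape)  # expected torch.Size([1, 4224, 24, 128])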
flash_attn/impls/flash_attention.html
CHANGED
|
@@ -3888,7 +3888,7 @@ Cell: nv | 0.26s
</div>
</div>
<div id="output-nv" class="cell-output">
- <div class="cell-stdout"><pre class="stdout-text">
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
|-----------------------------------------+------------------------+----------------------+

@@ -3897,7 +3897,7 @@ Cell: nv | 0.26s
| | | MIG M. |
|=========================================+========================+======================|
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
- | N/A
| | | N/A |
+-----------------------------------------+------------------------+----------------------+

@@ -3921,7 +3921,7 @@ Cell: nv | 0.26s
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
<span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
</span> |
- Cell: benchmark | 3.
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3972,29 +3972,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L128_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
(profiler rows replaced with updated timings: torch_flash_ma, aten::scaled_dot_product_attention, aten::_scaled_dot_product_flash_attention, aten::_flash_attention_forward, pytorch_flash::flash_fwd_kernel, aten::contiguous, aten::clone, aten::copy_, at::native::elementwise_kernel, Activity Buffer Request, aten::transpose, aten::as_strided, aten::empty_like, aten::empty, cudaLaunchKernel, aten::empty_strided, cudaDeviceGetAttribute, cudaFuncSetAttribute, cudaDeviceSynchronize; the previous values and the Self CPU/CUDA time totals are truncated in this view)

@@ -4004,29 +4004,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L256_bfloat16
|
|
| 4004 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4005 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4006 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4007 |
-
torch_flash_ma
|
| 4008 |
-
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4009 |
-
aten::scaled_dot_product_attention 0.
|
| 4010 |
-
aten::_scaled_dot_product_flash_attention 0.
|
| 4011 |
-
aten::_flash_attention_forward 0.
|
| 4012 |
-
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4013 |
-
aten::contiguous 0.
|
| 4014 |
-
aten::clone 0.
|
| 4015 |
-
aten::copy_ 1.
|
| 4016 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4017 |
-
Activity Buffer Request
|
| 4018 |
-
aten::transpose
|
| 4019 |
-
aten::as_strided 0.
|
| 4020 |
-
aten::empty_like 0.
|
| 4021 |
-
aten::empty 1.
|
| 4022 |
-
cudaLaunchKernel
|
| 4023 |
-
aten::empty_strided 0.
|
| 4024 |
-
cudaDeviceGetAttribute 0.
|
| 4025 |
-
cudaFuncSetAttribute 0.
|
| 4026 |
-
cudaDeviceSynchronize
|
| 4027 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4028 |
-
Self CPU time total: 5.
|
| 4029 |
-
Self CUDA time total: 3.
|
| 4030 |
|
| 4031 |
|
| 4032 |
|
|
@@ -4036,29 +4036,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L320_bfloat16
|
|
| 4036 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4037 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4038 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4039 |
-
torch_flash_ma 4.
|
| 4040 |
-
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4041 |
-
aten::scaled_dot_product_attention 0.
|
| 4042 |
-
aten::_scaled_dot_product_flash_attention 0.
|
| 4043 |
-
aten::_flash_attention_forward 0.
|
| 4044 |
-
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4045 |
-
aten::contiguous 0.
|
| 4046 |
-
aten::clone 0.
|
| 4047 |
-
aten::copy_ 1.
|
| 4048 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 779.
|
| 4049 |
-
Activity Buffer Request 27.
|
| 4050 |
-
aten::transpose
|
| 4051 |
-
aten::as_strided 0.
|
| 4052 |
-
aten::empty_like 0.
|
| 4053 |
-
aten::empty 1.
|
| 4054 |
-
cudaLaunchKernel 1.
|
| 4055 |
-
aten::empty_strided 0.
|
| 4056 |
-
cudaDeviceGetAttribute 0.
|
| 4057 |
-
cudaFuncSetAttribute 0.
|
| 4058 |
-
cudaDeviceSynchronize 58.
|
| 4059 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4060 |
-
Self CPU time total: 5.
|
| 4061 |
-
Self CUDA time total: 3.
|
| 4062 |
|
| 4063 |
|
| 4064 |
|
|
@@ -4068,29 +4068,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L384_bfloat16
|
|
| 4068 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4069 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4070 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4071 |
-
torch_flash_ma
|
| 4072 |
-
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4073 |
-
aten::scaled_dot_product_attention 0.
|
| 4074 |
-
aten::_scaled_dot_product_flash_attention 0.34% 19.
|
| 4075 |
-
aten::_flash_attention_forward 0.70% 39.
|
| 4076 |
-
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4077 |
-
aten::contiguous 0.17% 9.
|
| 4078 |
-
aten::clone 0.52%
|
| 4079 |
-
aten::copy_ 1.
|
| 4080 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4081 |
-
Activity Buffer Request
|
| 4082 |
-
aten::transpose 0.
|
| 4083 |
-
aten::as_strided 0.32%
|
| 4084 |
-
aten::empty_like 0.
|
| 4085 |
-
aten::empty 1.
|
| 4086 |
-
cudaLaunchKernel 5.
|
| 4087 |
-
aten::empty_strided 0.
|
| 4088 |
-
cudaDeviceGetAttribute 0.
|
| 4089 |
-
cudaFuncSetAttribute 0.07% 4.
|
| 4090 |
-
cudaDeviceSynchronize
|
| 4091 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4092 |
-
Self CPU time total: 5.
|
| 4093 |
-
Self CUDA time total: 3.
|
| 4094 |
|
| 4095 |
|
| 4096 |
|
|
@@ -4100,29 +4100,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L448_bfloat16
|
|
| 4100 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4101 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4102 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4103 |
-
torch_flash_ma 5.
|
| 4104 |
-
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.
|
| 4105 |
-
aten::scaled_dot_product_attention 0.
|
| 4106 |
-
aten::_scaled_dot_product_flash_attention 0.32% 19.
|
| 4107 |
-
aten::_flash_attention_forward 0.
|
| 4108 |
-
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4109 |
-
aten::contiguous 0.
|
| 4110 |
-
aten::clone 0.
|
| 4111 |
-
aten::copy_ 1.
|
| 4112 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4113 |
-
Activity Buffer Request
|
| 4114 |
-
aten::transpose 0.
|
| 4115 |
-
aten::as_strided 0.
|
| 4116 |
-
aten::empty_like 0.
|
| 4117 |
-
aten::empty 1.
|
| 4118 |
-
cudaLaunchKernel 4.
|
| 4119 |
-
aten::empty_strided 0.
|
| 4120 |
-
cudaDeviceGetAttribute 0.
|
| 4121 |
-
cudaFuncSetAttribute 0.
|
| 4122 |
-
cudaDeviceSynchronize
|
| 4123 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4124 |
-
Self CPU time total:
|
| 4125 |
-
Self CUDA time total: 4.
|
| 4126 |
|
| 4127 |
|
| 4128 |
|
|
@@ -4132,38 +4132,38 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L512_bfloat16
|
|
| 4132 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4133 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4134 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4135 |
-
torch_flash_ma
|
| 4136 |
-
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.
|
| 4137 |
-
aten::scaled_dot_product_attention 0.
|
| 4138 |
-
aten::_scaled_dot_product_flash_attention 0.
|
| 4139 |
-
aten::_flash_attention_forward 0.
|
| 4140 |
-
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4141 |
-
aten::contiguous 0.
|
| 4142 |
-
aten::clone 0.
|
| 4143 |
-
aten::copy_ 1.
|
| 4144 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4145 |
-
Activity Buffer Request 23.
|
| 4146 |
-
aten::transpose 0.
|
| 4147 |
-
aten::as_strided 0.
|
| 4148 |
-
aten::empty_like 0.
|
| 4149 |
-
aten::empty 1.
|
| 4150 |
-
cudaLaunchKernel 4.
|
| 4151 |
-
aten::empty_strided 0.
|
| 4152 |
-
cudaDeviceGetAttribute 0.03% 1.
|
| 4153 |
-
cudaFuncSetAttribute 0.07% 4.
|
| 4154 |
-
cudaDeviceSynchronize
|
| 4155 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4156 |
-
Self CPU time total: 6.
|
| 4157 |
-
Self CUDA time total: 4.
|
| 4158 |
|
| 4159 |
|
| 4160 |
impl wl p50(ms) ok
|
| 4161 |
torch_flash_ma cuda_attn_L128_bfloat16 1.22 True
|
| 4162 |
torch_flash_ma cuda_attn_L256_bfloat16 1.27 True
|
| 4163 |
-
torch_flash_ma cuda_attn_L320_bfloat16 1.
|
| 4164 |
-
torch_flash_ma cuda_attn_L384_bfloat16 1.
|
| 4165 |
-
torch_flash_ma cuda_attn_L448_bfloat16 1.
|
| 4166 |
-
torch_flash_ma cuda_attn_L512_bfloat16 1.
|
| 4167 |
</pre></div>
|
| 4168 |
<div class="cell-artifacts">
|
| 4169 |
<h4>Artifacts:</h4>
|
|
|
|
| 3888 |
</div>
|
| 3889 |
</div>
|
| 3890 |
<div id="output-nv" class="cell-output">
|
| 3891 |
+
<div class="cell-stdout"><pre class="stdout-text">Tue Oct 28 14:08:39 2025
|
| 3892 |
+-----------------------------------------------------------------------------------------+
|
| 3893 |
| NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
|
| 3894 |
|-----------------------------------------+------------------------+----------------------+
|
|
|
|
| 3897 |
| | | MIG M. |
|
| 3898 |
|=========================================+========================+======================|
|
| 3899 |
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
|
| 3900 |
+
| N/A 32C P0 153W / 350W | 0MiB / 46068MiB | 26% Default |
|
| 3901 |
| | | N/A |
|
| 3902 |
+-----------------------------------------+------------------------+----------------------+
|
| 3903 |
|
|
|
|
| 3921 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3922 |
<span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3923 |
</span> |
|
| 3924 |
+
Cell: benchmark | 3.83s
|
| 3925 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3926 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3927 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3972 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3973 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3974 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3975 |
+
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.585ms 101.47% 3.585ms 3.585ms 1
|
| 3976 |
+
torch_flash_ma 6.34% 327.656us 45.53% 2.352ms 2.352ms 0.000us 0.00% 3.573ms 3.573ms 1
|
| 3977 |
+
aten::scaled_dot_product_attention 0.82% 42.312us 4.12% 213.057us 71.019us 0.000us 0.00% 2.820ms 940.062us 3
|
| 3978 |
+
aten::_scaled_dot_product_flash_attention 0.51% 26.321us 3.31% 170.745us 56.915us 0.000us 0.00% 2.820ms 940.062us 3
|
| 3979 |
+
aten::_flash_attention_forward 0.73% 37.527us 2.40% 124.015us 41.338us 2.820ms 79.83% 2.820ms 940.062us 3
|
| 3980 |
+
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.820ms 79.83% 2.820ms 940.062us 3
|
| 3981 |
+
aten::contiguous 0.27% 14.121us 33.79% 1.745ms 145.446us 0.000us 0.00% 752.928us 62.744us 12
|
| 3982 |
+
aten::clone 0.72% 37.329us 33.52% 1.731ms 144.269us 0.000us 0.00% 752.928us 62.744us 12
|
| 3983 |
+
aten::copy_ 1.68% 87.013us 31.25% 1.614ms 134.513us 712.672us 20.17% 752.928us 62.744us 12
|
| 3984 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 712.672us 20.17% 712.672us 59.389us 12
|
| 3985 |
+
Activity Buffer Request 27.64% 1.428ms 27.64% 1.428ms 1.428ms 40.256us 1.14% 40.256us 40.256us 1
|
| 3986 |
+
aten::transpose 1.24% 64.087us 1.67% 86.009us 3.584us 0.000us 0.00% 0.000us 0.000us 24
|
| 3987 |
+
aten::as_strided 0.42% 21.922us 0.42% 21.922us 0.913us 0.000us 0.00% 0.000us 0.000us 24
|
| 3988 |
+
aten::empty_like 0.48% 24.711us 1.99% 102.775us 6.852us 0.000us 0.00% 0.000us 0.000us 15
|
| 3989 |
+
aten::empty 1.74% 89.843us 1.74% 89.843us 3.743us 0.000us 0.00% 0.000us 0.000us 24
|
| 3990 |
+
cudaLaunchKernel 2.38% 122.771us 2.38% 122.771us 8.185us 0.000us 0.00% 0.000us 0.000us 15
|
| 3991 |
+
aten::empty_strided 0.34% 17.310us 0.34% 17.310us 5.770us 0.000us 0.00% 0.000us 0.000us 3
|
| 3992 |
+
cudaDeviceGetAttribute 0.04% 2.229us 0.04% 2.229us 0.372us 0.000us 0.00% 0.000us 0.000us 6
|
| 3993 |
+
cudaFuncSetAttribute 0.17% 8.900us 0.17% 8.900us 2.967us 0.000us 0.00% 0.000us 0.000us 3
|
| 3994 |
+
cudaDeviceSynchronize 54.47% 2.814ms 54.47% 2.814ms 2.814ms 0.000us 0.00% 0.000us 0.000us 1
|
| 3995 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3996 |
+
Self CPU time total: 5.165ms
|
| 3997 |
+
Self CUDA time total: 3.533ms
|
| 3998 |
|
| 3999 |
|
| 4000 |
|
|
|
|
| 4004 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4005 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4006 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4007 |
+
torch_flash_ma 4.84% 255.079us 41.49% 2.188ms 2.188ms 0.000us 0.00% 3.787ms 3.787ms 1
|
| 4008 |
+
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.743ms 100.29% 3.743ms 3.743ms 1
|
| 4009 |
+
aten::scaled_dot_product_attention 0.47% 24.640us 3.42% 180.356us 60.119us 0.000us 0.00% 2.967ms 989.106us 3
|
| 4010 |
+
aten::_scaled_dot_product_flash_attention 0.36% 19.241us 2.95% 155.716us 51.905us 0.000us 0.00% 2.967ms 989.106us 3
|
| 4011 |
+
aten::_flash_attention_forward 0.73% 38.683us 2.19% 115.525us 38.508us 2.967ms 79.51% 2.967ms 989.106us 3
|
| 4012 |
+
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.967ms 79.51% 2.967ms 989.106us 3
|
| 4013 |
+
aten::contiguous 0.17% 8.802us 32.41% 1.709ms 142.425us 0.000us 0.00% 819.868us 68.322us 12
|
| 4014 |
+
aten::clone 0.52% 27.349us 32.24% 1.700ms 141.692us 0.000us 0.00% 819.868us 68.322us 12
|
| 4015 |
+
aten::copy_ 1.56% 82.061us 30.60% 1.614ms 134.473us 764.892us 20.49% 819.868us 68.322us 12
|
| 4016 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 764.892us 20.49% 764.892us 63.741us 12
|
| 4017 |
+
Activity Buffer Request 27.50% 1.450ms 27.50% 1.450ms 1.450ms 54.976us 1.47% 54.976us 54.976us 1
|
| 4018 |
+
aten::transpose 0.91% 47.959us 1.22% 64.512us 2.688us 0.000us 0.00% 0.000us 0.000us 24
|
| 4019 |
+
aten::as_strided 0.31% 16.553us 0.31% 16.553us 0.690us 0.000us 0.00% 0.000us 0.000us 24
|
| 4020 |
+
aten::empty_like 0.39% 20.732us 1.52% 80.304us 5.354us 0.000us 0.00% 0.000us 0.000us 15
|
| 4021 |
+
aten::empty 1.38% 72.972us 1.38% 72.972us 3.040us 0.000us 0.00% 0.000us 0.000us 24
|
| 4022 |
+
cudaLaunchKernel 1.96% 103.146us 1.96% 103.146us 6.876us 0.000us 0.00% 0.000us 0.000us 15
|
| 4023 |
+
aten::empty_strided 0.28% 14.880us 0.28% 14.880us 4.960us 0.000us 0.00% 0.000us 0.000us 3
|
| 4024 |
+
cudaDeviceGetAttribute 0.03% 1.800us 0.03% 1.800us 0.300us 0.000us 0.00% 0.000us 0.000us 6
|
| 4025 |
+
cudaFuncSetAttribute 0.07% 3.830us 0.07% 3.830us 1.277us 0.000us 0.00% 0.000us 0.000us 3
|
| 4026 |
+
cudaDeviceSynchronize 58.51% 3.085ms 58.51% 3.085ms 3.085ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4027 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4028 |
+
Self CPU time total: 5.273ms
|
| 4029 |
+
Self CUDA time total: 3.732ms
|
| 4030 |
|
| 4031 |
|
| 4032 |
|
|
|
|
| 4036 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4037 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4038 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4039 |
+
torch_flash_ma 4.77% 251.162us 41.45% 2.184ms 2.184ms 0.000us 0.00% 3.786ms 3.786ms 1
|
| 4040 |
+
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.738ms 100.28% 3.738ms 3.738ms 1
|
| 4041 |
+
aten::scaled_dot_product_attention 0.46% 24.280us 3.42% 180.086us 60.029us 0.000us 0.00% 2.949ms 982.872us 3
|
| 4042 |
+
aten::_scaled_dot_product_flash_attention 0.34% 18.160us 2.96% 155.806us 51.935us 0.000us 0.00% 2.949ms 982.872us 3
|
| 4043 |
+
aten::_flash_attention_forward 0.73% 38.599us 2.20% 115.865us 38.622us 2.949ms 79.09% 2.949ms 982.872us 3
|
| 4044 |
+
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.949ms 79.09% 2.949ms 982.872us 3
|
| 4045 |
+
aten::contiguous 0.17% 8.991us 32.44% 1.710ms 142.465us 0.000us 0.00% 837.719us 69.810us 12
|
| 4046 |
+
aten::clone 0.53% 27.728us 32.27% 1.701ms 141.715us 0.000us 0.00% 837.719us 69.810us 12
|
| 4047 |
+
aten::copy_ 1.52% 79.873us 30.57% 1.611ms 134.242us 779.480us 20.91% 837.719us 69.810us 12
|
| 4048 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 779.480us 20.91% 779.480us 64.957us 12
|
| 4049 |
+
Activity Buffer Request 27.50% 1.449ms 27.50% 1.449ms 1.449ms 58.239us 1.56% 58.239us 58.239us 1
|
| 4050 |
+
aten::transpose 0.92% 48.219us 1.24% 65.252us 2.719us 0.000us 0.00% 0.000us 0.000us 24
|
| 4051 |
+
aten::as_strided 0.32% 17.033us 0.32% 17.033us 0.710us 0.000us 0.00% 0.000us 0.000us 24
|
| 4052 |
+
aten::empty_like 0.37% 19.303us 1.55% 81.795us 5.453us 0.000us 0.00% 0.000us 0.000us 15
|
| 4053 |
+
aten::empty 1.44% 76.031us 1.44% 76.031us 3.168us 0.000us 0.00% 0.000us 0.000us 24
|
| 4054 |
+
cudaLaunchKernel 1.98% 104.564us 1.98% 104.564us 6.971us 0.000us 0.00% 0.000us 0.000us 15
|
| 4055 |
+
aten::empty_strided 0.28% 14.492us 0.28% 14.492us 4.831us 0.000us 0.00% 0.000us 0.000us 3
|
| 4056 |
+
cudaDeviceGetAttribute 0.04% 1.860us 0.04% 1.860us 0.310us 0.000us 0.00% 0.000us 0.000us 6
|
| 4057 |
+
cudaFuncSetAttribute 0.10% 5.030us 0.10% 5.030us 1.677us 0.000us 0.00% 0.000us 0.000us 3
|
| 4058 |
+
cudaDeviceSynchronize 58.55% 3.085ms 58.55% 3.085ms 3.085ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4059 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4060 |
+
Self CPU time total: 5.269ms
|
| 4061 |
+
Self CUDA time total: 3.728ms
|
| 4062 |
|
| 4063 |
|
| 4064 |
|
|
|
|
| 4068 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4069 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4070 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4071 |
+
torch_flash_ma 5.01% 280.573us 44.17% 2.475ms 2.475ms 0.000us 0.00% 3.878ms 3.878ms 1
|
| 4072 |
+
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.831ms 100.27% 3.831ms 3.831ms 1
|
| 4073 |
+
aten::scaled_dot_product_attention 0.48% 26.630us 3.39% 189.956us 63.319us 0.000us 0.00% 3.032ms 1.011ms 3
|
| 4074 |
+
aten::_scaled_dot_product_flash_attention 0.34% 19.101us 2.91% 163.326us 54.442us 0.000us 0.00% 3.032ms 1.011ms 3
|
| 4075 |
+
aten::_flash_attention_forward 0.70% 39.063us 2.15% 120.325us 40.108us 3.032ms 79.37% 3.032ms 1.011ms 3
|
| 4076 |
+
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.032ms 79.37% 3.032ms 1.011ms 3
|
| 4077 |
+
aten::contiguous 0.17% 9.271us 34.98% 1.960ms 163.354us 0.000us 0.00% 845.820us 70.485us 12
|
| 4078 |
+
aten::clone 0.52% 28.974us 34.82% 1.951ms 162.581us 0.000us 0.00% 845.820us 70.485us 12
|
| 4079 |
+
aten::copy_ 1.48% 83.180us 33.17% 1.859ms 154.908us 788.284us 20.63% 845.820us 70.485us 12
|
| 4080 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 788.284us 20.63% 788.284us 65.690us 12
|
| 4081 |
+
Activity Buffer Request 26.18% 1.467ms 26.18% 1.467ms 1.467ms 57.536us 1.51% 57.536us 57.536us 1
|
| 4082 |
+
aten::transpose 0.89% 50.110us 1.21% 67.952us 2.831us 0.000us 0.00% 0.000us 0.000us 24
|
| 4083 |
+
aten::as_strided 0.32% 17.842us 0.32% 17.842us 0.743us 0.000us 0.00% 0.000us 0.000us 24
|
| 4084 |
+
aten::empty_like 0.36% 19.969us 1.53% 85.492us 5.699us 0.000us 0.00% 0.000us 0.000us 15
|
| 4085 |
+
aten::empty 1.37% 76.982us 1.37% 76.982us 3.208us 0.000us 0.00% 0.000us 0.000us 24
|
| 4086 |
+
cudaLaunchKernel 5.95% 333.480us 5.95% 333.480us 22.232us 0.000us 0.00% 0.000us 0.000us 15
|
| 4087 |
+
aten::empty_strided 0.30% 17.041us 0.30% 17.041us 5.680us 0.000us 0.00% 0.000us 0.000us 3
|
| 4088 |
+
cudaDeviceGetAttribute 0.03% 1.700us 0.03% 1.700us 0.283us 0.000us 0.00% 0.000us 0.000us 6
|
| 4089 |
+
cudaFuncSetAttribute 0.07% 4.040us 0.07% 4.040us 1.347us 0.000us 0.00% 0.000us 0.000us 3
|
| 4090 |
+
cudaDeviceSynchronize 55.83% 3.129ms 55.83% 3.129ms 3.129ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4091 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4092 |
+
Self CPU time total: 5.603ms
|
| 4093 |
+
Self CUDA time total: 3.820ms
|
| 4094 |
|
| 4095 |
|
| 4096 |
|
|
|
|
| 4100 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4101 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4102 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4103 |
+
torch_flash_ma 5.07% 303.893us 39.93% 2.395ms 2.395ms 0.000us 0.00% 4.370ms 4.370ms 1
|
| 4104 |
+
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.320ms 100.25% 4.320ms 4.320ms 1
|
| 4105 |
+
aten::scaled_dot_product_attention 0.41% 24.650us 3.07% 184.006us 61.335us 0.000us 0.00% 3.503ms 1.168ms 3
|
| 4106 |
+
aten::_scaled_dot_product_flash_attention 0.32% 19.311us 2.66% 159.356us 53.119us 0.000us 0.00% 3.503ms 1.168ms 3
|
| 4107 |
+
aten::_flash_attention_forward 0.68% 40.911us 1.97% 118.205us 39.402us 3.503ms 81.28% 3.503ms 1.168ms 3
|
| 4108 |
+
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.503ms 81.28% 3.503ms 1.168ms 3
|
| 4109 |
+
aten::contiguous 0.15% 8.977us 31.04% 1.862ms 155.201us 0.000us 0.00% 867.581us 72.298us 12
|
| 4110 |
+
aten::clone 0.47% 28.114us 30.89% 1.853ms 154.453us 0.000us 0.00% 867.581us 72.298us 12
|
| 4111 |
+
aten::copy_ 1.36% 81.500us 29.40% 1.764ms 146.991us 806.749us 18.72% 867.581us 72.298us 12
|
| 4112 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 806.749us 18.72% 806.749us 67.229us 12
|
| 4113 |
+
Activity Buffer Request 23.82% 1.429ms 23.82% 1.429ms 1.429ms 60.832us 1.41% 60.832us 60.832us 1
|
| 4114 |
+
aten::transpose 0.82% 49.363us 1.11% 66.863us 2.786us 0.000us 0.00% 0.000us 0.000us 24
|
| 4115 |
+
aten::as_strided 0.29% 17.500us 0.29% 17.500us 0.729us 0.000us 0.00% 0.000us 0.000us 24
|
| 4116 |
+
aten::empty_like 0.33% 20.081us 1.37% 82.424us 5.495us 0.000us 0.00% 0.000us 0.000us 15
|
| 4117 |
+
aten::empty 1.26% 75.593us 1.26% 75.593us 3.150us 0.000us 0.00% 0.000us 0.000us 24
|
| 4118 |
+
cudaLaunchKernel 4.60% 275.759us 4.60% 275.759us 18.384us 0.000us 0.00% 0.000us 0.000us 15
|
| 4119 |
+
aten::empty_strided 0.25% 15.251us 0.25% 15.251us 5.084us 0.000us 0.00% 0.000us 0.000us 3
|
| 4120 |
+
cudaDeviceGetAttribute 0.03% 1.740us 0.03% 1.740us 0.290us 0.000us 0.00% 0.000us 0.000us 6
|
| 4121 |
+
cudaFuncSetAttribute 0.06% 3.680us 0.06% 3.680us 1.227us 0.000us 0.00% 0.000us 0.000us 3
|
| 4122 |
+
cudaDeviceSynchronize 60.07% 3.604ms 60.07% 3.604ms 3.604ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4123 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4124 |
+
Self CPU time total: 5.999ms
|
| 4125 |
+
Self CUDA time total: 4.309ms
|
| 4126 |
|
| 4127 |
|
| 4128 |
|
|
|
|
| 4132 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4133 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4134 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4135 |
+
torch_flash_ma 3.83% 232.270us 37.82% 2.296ms 2.296ms 0.000us 0.00% 4.474ms 4.474ms 1
|
| 4136 |
+
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.423ms 100.25% 4.423ms 4.423ms 1
|
| 4137 |
+
aten::scaled_dot_product_attention 0.41% 24.850us 2.85% 172.746us 57.582us 0.000us 0.00% 3.595ms 1.198ms 3
|
| 4138 |
+
aten::_scaled_dot_product_flash_attention 0.30% 18.250us 2.44% 147.896us 49.299us 0.000us 0.00% 3.595ms 1.198ms 3
|
| 4139 |
+
aten::_flash_attention_forward 0.54% 32.692us 1.77% 107.224us 35.741us 3.595ms 81.48% 3.595ms 1.198ms 3
|
| 4140 |
+
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.595ms 81.48% 3.595ms 1.198ms 3
|
| 4141 |
+
aten::contiguous 0.14% 8.610us 30.41% 1.846ms 153.859us 0.000us 0.00% 878.139us 73.178us 12
|
| 4142 |
+
aten::clone 0.45% 27.368us 30.27% 1.838ms 153.142us 0.000us 0.00% 878.139us 73.178us 12
|
| 4143 |
+
aten::copy_ 1.35% 81.917us 28.83% 1.750ms 145.831us 817.083us 18.52% 878.139us 73.178us 12
|
| 4144 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 817.083us 18.52% 817.083us 68.090us 12
|
| 4145 |
+
Activity Buffer Request 23.72% 1.440ms 23.72% 1.440ms 1.440ms 61.056us 1.38% 61.056us 61.056us 1
|
| 4146 |
+
aten::transpose 0.82% 50.064us 1.10% 66.792us 2.783us 0.000us 0.00% 0.000us 0.000us 24
|
| 4147 |
+
aten::as_strided 0.28% 16.728us 0.28% 16.728us 0.697us 0.000us 0.00% 0.000us 0.000us 24
|
| 4148 |
+
aten::empty_like 0.32% 19.431us 1.31% 79.591us 5.306us 0.000us 0.00% 0.000us 0.000us 15
|
| 4149 |
+
aten::empty 1.21% 73.220us 1.21% 73.220us 3.051us 0.000us 0.00% 0.000us 0.000us 24
|
| 4150 |
+
cudaLaunchKernel 4.12% 249.950us 4.12% 249.950us 16.663us 0.000us 0.00% 0.000us 0.000us 15
|
| 4151 |
+
aten::empty_strided 0.24% 14.270us 0.24% 14.270us 4.757us 0.000us 0.00% 0.000us 0.000us 3
|
| 4152 |
+
cudaDeviceGetAttribute 0.03% 1.680us 0.03% 1.680us 0.280us 0.000us 0.00% 0.000us 0.000us 6
|
| 4153 |
+
cudaFuncSetAttribute 0.07% 4.380us 0.07% 4.380us 1.460us 0.000us 0.00% 0.000us 0.000us 3
|
| 4154 |
+
cudaDeviceSynchronize 62.18% 3.775ms 62.18% 3.775ms 3.775ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4155 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4156 |
+
Self CPU time total: 6.071ms
|
| 4157 |
+
Self CUDA time total: 4.413ms
|
| 4158 |
|
| 4159 |
|
| 4160 |
impl wl p50(ms) ok
|
| 4161 |
torch_flash_ma cuda_attn_L128_bfloat16 1.22 True
|
| 4162 |
torch_flash_ma cuda_attn_L256_bfloat16 1.27 True
|
| 4163 |
+
torch_flash_ma cuda_attn_L320_bfloat16 1.28 True
|
| 4164 |
+
torch_flash_ma cuda_attn_L384_bfloat16 1.31 True
|
| 4165 |
+
torch_flash_ma cuda_attn_L448_bfloat16 1.47 True
|
| 4166 |
+
torch_flash_ma cuda_attn_L512_bfloat16 1.50 True
|
| 4167 |
</pre></div>
|
| 4168 |
<div class="cell-artifacts">
|
| 4169 |
<h4>Artifacts:</h4>
|
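The PROFILE TRACE tables above follow torch.profiler's key_averages() layout. The actual harness lives in kernels-benchmark-tools and is not shown here; the following is only a sketch of the standard recipe that produces this kind of table, with the "torch_flash_ma" label and the three calls per op taken from the output above:

import torch
from torch.profiler import ProfilerActivity, profile, record_function

q = torch.randn(1, 512, 24, 128, device="cuda", dtype=torch.bfloat16)
k, v = torch.randn_like(q), torch.randn_like(q)

with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
    with record_function("torch_flash_ma"):
        for _ in range(3):  # "# of Calls" is 3 for each op in the tables above
            # transpose to (batch, heads, seq, dim) as SDPA expects;
            # .contiguous() is what produces the aten::clone/aten::copy_ rows
            torch.nn.functional.scaled_dot_product_attention(
                q.transpose(1, 2).contiguous(),
                k.transpose(1, 2).contiguous(),
                v.transpose(1, 2).contiguous(),
            )
    torch.cuda.synchronize()  # shows up as cudaDeviceSynchronize

print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=20))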
flash_attn/impls/hf_kernels_flash_attn.html
CHANGED

@@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
- Cell: benchmark |
@@ -3926,21 +3926,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L128_bfloat16
- (old trace rows truncated in extraction)
@@ -3950,21 +3950,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L256_bfloat16
- (old trace rows truncated in extraction)
@@ -3974,21 +3974,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L320_bfloat16
- (old trace rows truncated in extraction)
@@ -3998,21 +3998,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L384_bfloat16
- (old trace rows truncated in extraction)
@@ -4022,21 +4022,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L448_bfloat16
- (old trace rows truncated in extraction)
@@ -4046,88 +4046,41 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L512_bfloat16
- (old trace rows truncated in extraction)
- hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.
- hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.
- hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.
- hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.
- hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.
- hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.
- Downloading hf-xet (3.2MiB)
- Downloading nvidia-nvjitlink-cu12 (37.4MiB)
- Downloading networkx (1.9MiB)
- Downloading nvidia-cusparselt-cu12 (273.9MiB)
- Downloading nvidia-nccl-cu12 (307.4MiB)
- Downloading kiwisolver (1.4MiB)
- Downloading pillow (6.7MiB)
- Downloading nvidia-curand-cu12 (60.7MiB)
- Downloading nvidia-cublas-cu12 (566.8MiB)
- Downloading sympy (6.0MiB)
- Downloading setuptools (1.1MiB)
- Downloading matplotlib (8.3MiB)
- Downloading numpy (16.2MiB)
- Downloading triton (148.3MiB)
- Downloading nvidia-cudnn-cu12 (674.0MiB)
- Downloading fonttools (4.7MiB)
- Downloading nvidia-cusparse-cu12 (274.9MiB)
- Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
- Downloading nvidia-cufile-cu12 (1.1MiB)
- Downloading nvidia-cusolver-cu12 (255.1MiB)
- Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
- Downloading nvidia-cufft-cu12 (184.2MiB)
- Downloading torch (846.9MiB)
- Built kernels-benchmark-tools @ file:///__w/kernels-benchmarks/kernels-benchmarks/tools
- Installed 52 packages in 223ms
- Fetching 20 files:
- Fetching 20 files:

| 3871 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3872 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3873 |
</span> |
|
| 3874 |
+
Cell: benchmark | 6.08s
|
| 3875 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3876 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3877 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3926 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3927 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3928 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3929 |
+
hf_kernels_flash_attn 3.64% 160.058us 41.50% 1.823ms 1.823ms 0.000us 0.00% 3.744ms 3.744ms 1
|
| 3930 |
+
_flash_attn_9e27194::fwd 1.78% 78.347us 37.86% 1.663ms 554.208us 2.792ms 100.00% 3.744ms 1.248ms 3
|
| 3931 |
+
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.794ms 100.05% 2.794ms 2.794ms 1
|
| 3932 |
+
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.792ms 100.00% 2.792ms 930.800us 3
|
| 3933 |
+
Activity Buffer Request 33.00% 1.449ms 33.00% 1.449ms 1.449ms 951.685us 34.08% 951.685us 951.685us 1
|
| 3934 |
+
cudaDeviceGetAttribute 0.13% 5.638us 0.13% 5.638us 0.376us 0.000us 0.00% 0.000us 0.000us 15
|
| 3935 |
+
aten::empty_like 0.40% 17.551us 1.19% 52.122us 17.374us 0.000us 0.00% 0.000us 0.000us 3
|
| 3936 |
+
aten::empty_strided 0.79% 34.571us 0.79% 34.571us 11.524us 0.000us 0.00% 0.000us 0.000us 3
|
| 3937 |
+
aten::empty 0.57% 24.890us 0.57% 24.890us 2.766us 0.000us 0.00% 0.000us 0.000us 9
|
| 3938 |
+
cudaFuncSetAttribute 0.28% 12.210us 0.28% 12.210us 4.070us 0.000us 0.00% 0.000us 0.000us 3
|
| 3939 |
+
cudaLaunchKernel 0.92% 40.292us 0.92% 40.292us 13.431us 0.000us 0.00% 0.000us 0.000us 3
|
| 3940 |
+
cudaDeviceSynchronize 58.50% 2.569ms 58.50% 2.569ms 2.569ms 0.000us 0.00% 0.000us 0.000us 1
|
| 3941 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3942 |
+
Self CPU time total: 4.392ms
|
| 3943 |
+
Self CUDA time total: 2.792ms
|
| 3944 |
|
| 3945 |
|
| 3946 |
|
|
|
|
| 3950 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3951 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3952 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3953 |
+
hf_kernels_flash_attn 2.22% 99.144us 37.48% 1.673ms 1.673ms 0.000us 0.00% 3.949ms 3.949ms 1
|
| 3954 |
+
_flash_attn_9e27194::fwd 1.20% 53.462us 35.26% 1.574ms 524.654us 2.953ms 100.00% 3.949ms 1.316ms 3
|
| 3955 |
+
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.955ms 100.05% 2.955ms 2.955ms 1
|
| 3956 |
+
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.953ms 100.00% 2.953ms 984.436us 3
|
| 3957 |
+
Activity Buffer Request 32.23% 1.439ms 32.23% 1.439ms 1.439ms 995.807us 33.72% 995.807us 995.807us 1
|
| 3958 |
+
cudaDeviceGetAttribute 0.10% 4.621us 0.10% 4.621us 0.308us 0.000us 0.00% 0.000us 0.000us 15
|
| 3959 |
+
aten::empty_like 0.17% 7.710us 0.56% 24.861us 8.287us 0.000us 0.00% 0.000us 0.000us 3
|
| 3960 |
+
aten::empty_strided 0.38% 17.151us 0.38% 17.151us 5.717us 0.000us 0.00% 0.000us 0.000us 3
|
| 3961 |
+
aten::empty 0.47% 21.122us 0.47% 21.122us 2.347us 0.000us 0.00% 0.000us 0.000us 9
|
| 3962 |
+
cudaFuncSetAttribute 0.08% 3.791us 0.08% 3.791us 1.264us 0.000us 0.00% 0.000us 0.000us 3
|
| 3963 |
+
cudaLaunchKernel 0.61% 27.380us 0.61% 27.380us 9.127us 0.000us 0.00% 0.000us 0.000us 3
|
| 3964 |
+
cudaDeviceSynchronize 62.52% 2.791ms 62.52% 2.791ms 2.791ms 0.000us 0.00% 0.000us 0.000us 1
|
| 3965 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3966 |
+
Self CPU time total: 4.464ms
|
| 3967 |
+
Self CUDA time total: 2.953ms
|
| 3968 |
|
| 3969 |
|
| 3970 |
|
|
|
|
| 3974 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3975 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3976 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3977 |
+
hf_kernels_flash_attn 2.58% 116.955us 37.54% 1.702ms 1.702ms 0.000us 0.00% 4.041ms 4.041ms 1
|
| 3978 |
+
_flash_attn_9e27194::fwd 1.53% 69.255us 34.96% 1.585ms 528.314us 3.010ms 100.00% 4.041ms 1.347ms 3
|
| 3979 |
+
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.012ms 100.05% 3.012ms 3.012ms 1
|
| 3980 |
+
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.010ms 100.00% 3.010ms 1.003ms 3
|
| 3981 |
+
Activity Buffer Request 31.53% 1.430ms 31.53% 1.430ms 1.430ms 1.031ms 34.26% 1.031ms 1.031ms 1
|
| 3982 |
+
cudaDeviceGetAttribute 0.10% 4.450us 0.10% 4.450us 0.297us 0.000us 0.00% 0.000us 0.000us 15
|
| 3983 |
+
aten::empty_like 0.18% 8.151us 0.57% 25.801us 8.600us 0.000us 0.00% 0.000us 0.000us 3
|
| 3984 |
+
aten::empty_strided 0.39% 17.650us 0.39% 17.650us 5.883us 0.000us 0.00% 0.000us 0.000us 3
|
| 3985 |
+
aten::empty 0.48% 21.771us 0.48% 21.771us 2.419us 0.000us 0.00% 0.000us 0.000us 9
|
| 3986 |
+
cudaFuncSetAttribute 0.10% 4.360us 0.10% 4.360us 1.453us 0.000us 0.00% 0.000us 0.000us 3
|
| 3987 |
+
cudaLaunchKernel 0.66% 29.790us 0.66% 29.790us 9.930us 0.000us 0.00% 0.000us 0.000us 3
|
| 3988 |
+
cudaDeviceSynchronize 62.46% 2.832ms 62.46% 2.832ms 2.832ms 0.000us 0.00% 0.000us 0.000us 1
|
| 3989 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3990 |
+
Self CPU time total: 4.534ms
|
| 3991 |
+
Self CUDA time total: 3.010ms
|
| 3992 |
|
| 3993 |
|
| 3994 |
|
|
|
|
| 3998 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3999 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4000 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4001 |
+
hf_kernels_flash_attn 2.39% 114.805us 40.03% 1.925ms 1.925ms 0.000us 0.00% 4.094ms 4.094ms 1
|
| 4002 |
+
_flash_attn_9e27194::fwd 1.09% 52.653us 37.65% 1.810ms 603.407us 3.063ms 100.00% 4.094ms 1.365ms 3
|
| 4003 |
+
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.065ms 100.05% 3.065ms 3.065ms 1
|
| 4004 |
+
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.063ms 100.00% 3.063ms 1.021ms 3
|
| 4005 |
+
Activity Buffer Request 29.78% 1.432ms 29.78% 1.432ms 1.432ms 1.031ms 33.65% 1.031ms 1.031ms 1
|
| 4006 |
+
cudaDeviceGetAttribute 0.10% 4.861us 0.10% 4.861us 0.324us 0.000us 0.00% 0.000us 0.000us 15
|
| 4007 |
+
aten::empty_like 0.16% 7.720us 0.55% 26.331us 8.777us 0.000us 0.00% 0.000us 0.000us 3
|
| 4008 |
+
aten::empty_strided 0.39% 18.611us 0.39% 18.611us 6.204us 0.000us 0.00% 0.000us 0.000us 3
|
| 4009 |
+
aten::empty 0.45% 21.731us 0.45% 21.731us 2.415us 0.000us 0.00% 0.000us 0.000us 9
|
| 4010 |
+
cudaFuncSetAttribute 0.08% 3.728us 0.08% 3.728us 1.243us 0.000us 0.00% 0.000us 0.000us 3
|
| 4011 |
+
cudaLaunchKernel 5.59% 268.862us 5.59% 268.862us 89.621us 0.000us 0.00% 0.000us 0.000us 3
|
| 4012 |
+
cudaDeviceSynchronize 59.97% 2.884ms 59.97% 2.884ms 2.884ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4013 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4014 |
+
Self CPU time total: 4.809ms
|
| 4015 |
+
Self CUDA time total: 3.063ms
|
| 4016 |
|
| 4017 |
|
| 4018 |
|
|
|
|
| 4022 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4023 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4024 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4025 |
+
hf_kernels_flash_attn 2.13% 113.755us 35.84% 1.918ms 1.918ms 0.000us 0.00% 4.786ms 4.786ms 1
|
| 4026 |
+
_flash_attn_9e27194::fwd 1.02% 54.483us 33.71% 1.804ms 601.364us 3.588ms 100.00% 4.786ms 1.595ms 3
|
| 4027 |
+
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.590ms 100.04% 3.590ms 3.590ms 1
|
| 4028 |
+
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.588ms 100.00% 3.588ms 1.196ms 3
|
| 4029 |
+
Activity Buffer Request 26.99% 1.445ms 26.99% 1.445ms 1.445ms 1.198ms 33.38% 1.198ms 1.198ms 1
|
| 4030 |
+
cudaDeviceGetAttribute 0.08% 4.270us 0.08% 4.270us 0.285us 0.000us 0.00% 0.000us 0.000us 15
|
| 4031 |
+
aten::empty_like 0.15% 8.039us 0.48% 25.640us 8.547us 0.000us 0.00% 0.000us 0.000us 3
|
| 4032 |
+
aten::empty_strided 0.33% 17.601us 0.33% 17.601us 5.867us 0.000us 0.00% 0.000us 0.000us 3
|
| 4033 |
+
aten::empty 0.40% 21.582us 0.40% 21.582us 2.398us 0.000us 0.00% 0.000us 0.000us 9
|
| 4034 |
+
cudaFuncSetAttribute 0.07% 3.700us 0.07% 3.700us 1.233us 0.000us 0.00% 0.000us 0.000us 3
|
| 4035 |
+
cudaLaunchKernel 4.67% 249.891us 4.67% 249.891us 83.297us 0.000us 0.00% 0.000us 0.000us 3
|
| 4036 |
+
cudaDeviceSynchronize 64.16% 3.434ms 64.16% 3.434ms 3.434ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4037 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4038 |
+
Self CPU time total: 5.351ms
|
| 4039 |
+
Self CUDA time total: 3.588ms
|
| 4040 |
|
| 4041 |
|
| 4042 |
|
|
|
|
| 4046 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4047 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4048 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4049 |
+
hf_kernels_flash_attn 2.08% 111.044us 35.25% 1.879ms 1.879ms 0.000us 0.00% 4.816ms 4.816ms 1
|
| 4050 |
+
_flash_attn_9e27194::fwd 0.99% 52.834us 33.17% 1.768ms 589.427us 3.606ms 100.00% 4.816ms 1.605ms 3
|
| 4051 |
+
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.607ms 100.05% 3.607ms 3.607ms 1
|
| 4052 |
+
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.606ms 100.00% 3.606ms 1.202ms 3
|
| 4053 |
+
Activity Buffer Request 26.56% 1.416ms 26.56% 1.416ms 1.416ms 1.210ms 33.55% 1.210ms 1.210ms 1
|
| 4054 |
+
cudaDeviceGetAttribute 0.08% 4.460us 0.08% 4.460us 0.297us 0.000us 0.00% 0.000us 0.000us 15
|
| 4055 |
+
aten::empty_like 0.14% 7.500us 0.49% 26.051us 8.684us 0.000us 0.00% 0.000us 0.000us 3
|
| 4056 |
+
aten::empty_strided 0.35% 18.551us 0.35% 18.551us 6.184us 0.000us 0.00% 0.000us 0.000us 3
|
| 4057 |
+
aten::empty 0.41% 21.960us 0.41% 21.960us 2.440us 0.000us 0.00% 0.000us 0.000us 9
|
| 4058 |
+
cudaFuncSetAttribute 0.08% 4.009us 0.08% 4.009us 1.336us 0.000us 0.00% 0.000us 0.000us 3
|
| 4059 |
+
cudaLaunchKernel 4.55% 242.792us 4.55% 242.792us 80.931us 0.000us 0.00% 0.000us 0.000us 3
|
| 4060 |
+
cudaDeviceSynchronize 64.75% 3.452ms 64.75% 3.452ms 3.452ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4061 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4062 |
+
Self CPU time total: 5.332ms
|
| 4063 |
+
Self CUDA time total: 3.606ms
|
| 4064 |
|
| 4065 |
|
| 4066 |
impl wl p50(ms) ok
|
| 4067 |
+
hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.96 True
|
| 4068 |
+
hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.01 True
|
| 4069 |
+
hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.06 True
|
| 4070 |
+
hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.05 True
|
| 4071 |
+
hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.22 True
|
| 4072 |
+
hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.21 True
|
| 4073 |
</pre></div>
|
| 4074 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 4075 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4076 |
<div class="uv-logs-content" style="display: none;">
|
| 4077 |
+
Installed 15 packages in 13ms
|
| 4078 |
</div>
|
| 4079 |
</div>
|
| 4080 |
<div class="cell-stderr">Fetching 20 files: 0%| | 0/20 [00:00<?, ?it/s]
|
| 4081 |
+
Fetching 20 files: 5%|▌ | 1/20 [00:00<00:04, 4.26it/s]
|
| 4082 |
+
Fetching 20 files: 10%|█ | 2/20 [00:01<00:17, 1.03it/s]
|
| 4083 |
+
Fetching 20 files: 100%|██████████| 20/20 [00:01<00:00, 11.64it/s]</div>
|
| 4084 |
<div class="cell-artifacts">
|
| 4085 |
<h4>Artifacts:</h4>
|
| 4086 |
<a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
|
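The p50(ms) figures in the summary table above come from repeated timed runs of each attention implementation. As a minimal sketch (not the repository's actual benchmark harness), medians like these can be measured with CUDA events after a short warmup; `run_attention` below is a hypothetical stand-in for whichever implementation is under test.

    import statistics
    import torch

    def bench_p50(fn, *args, warmup: int = 2, reps: int = 5) -> float:
        # Warm up so compilation / cache effects are excluded from timing.
        for _ in range(warmup):
            fn(*args)
        torch.cuda.synchronize()
        times_ms = []
        for _ in range(reps):
            start = torch.cuda.Event(enable_timing=True)
            end = torch.cuda.Event(enable_timing=True)
            start.record()
            fn(*args)
            end.record()
            torch.cuda.synchronize()
            times_ms.append(start.elapsed_time(end))  # elapsed time in milliseconds
        # Report the median (p50) of the collected repetitions.
        return statistics.median(times_ms)
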
flash_attn/impls/hf_kernels_flash_attn3.html
CHANGED
|
@@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3871 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3872 |
<span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3873 |
</span> |
|
| 3874 |
-
Cell: benchmark | 5.
|
| 3875 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3876 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3877 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3925,19 +3925,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L128_bfloat16
|
|
| 3925 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3926 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3927 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3928 |
-
hf_kernels_flash_attn3 3.
|
| 3929 |
-
FlashAttnFunc
|
| 3930 |
-
_flash_attn3_48fe103_dirty::fwd 1.
|
| 3931 |
-
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3932 |
-
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3933 |
-
Activity Buffer Request 33.
|
| 3934 |
-
aten::empty 1.
|
| 3935 |
-
cudaFuncSetAttribute 0.
|
| 3936 |
-
cudaLaunchKernel 1.
|
| 3937 |
-
cudaDeviceSynchronize 55.
|
| 3938 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3939 |
-
Self CPU time total: 4.
|
| 3940 |
-
Self CUDA time total: 2.
|
| 3941 |
|
| 3942 |
|
| 3943 |
|
|
@@ -3947,19 +3947,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L256_bfloat16
|
|
| 3947 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3948 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3949 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3950 |
-
hf_kernels_flash_attn3
|
| 3951 |
-
FlashAttnFunc 2.
|
| 3952 |
-
_flash_attn3_48fe103_dirty::fwd 1.23%
|
| 3953 |
-
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3954 |
-
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3955 |
-
Activity Buffer Request
|
| 3956 |
-
aten::empty 0.
|
| 3957 |
-
cudaFuncSetAttribute 0.
|
| 3958 |
-
cudaLaunchKernel 0.
|
| 3959 |
-
cudaDeviceSynchronize
|
| 3960 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3961 |
-
Self CPU time total: 4.
|
| 3962 |
-
Self CUDA time total: 2.
|
| 3963 |
|
| 3964 |
|
| 3965 |
|
|
@@ -3969,19 +3969,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L320_bfloat16
|
|
| 3969 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3970 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3971 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3972 |
-
hf_kernels_flash_attn3 2.
|
| 3973 |
-
FlashAttnFunc 2.
|
| 3974 |
-
_flash_attn3_48fe103_dirty::fwd 1.
|
| 3975 |
-
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3976 |
-
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3977 |
-
Activity Buffer Request 32.
|
| 3978 |
-
aten::empty 0.
|
| 3979 |
-
cudaFuncSetAttribute 0.
|
| 3980 |
-
cudaLaunchKernel 0.
|
| 3981 |
-
cudaDeviceSynchronize 60.
|
| 3982 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3983 |
-
Self CPU time total: 4.
|
| 3984 |
-
Self CUDA time total: 2.
|
| 3985 |
|
| 3986 |
|
| 3987 |
|
|
@@ -3991,19 +3991,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L384_bfloat16
|
|
| 3991 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3992 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3993 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3994 |
-
hf_kernels_flash_attn3 2.
|
| 3995 |
-
FlashAttnFunc
|
| 3996 |
-
_flash_attn3_48fe103_dirty::fwd 1.
|
| 3997 |
-
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 3998 |
-
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 3999 |
-
Activity Buffer Request
|
| 4000 |
-
aten::empty 0.
|
| 4001 |
-
cudaFuncSetAttribute 0.
|
| 4002 |
-
cudaLaunchKernel 5.
|
| 4003 |
-
cudaDeviceSynchronize
|
| 4004 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4005 |
-
Self CPU time total: 4.
|
| 4006 |
-
Self CUDA time total:
|
| 4007 |
|
| 4008 |
|
| 4009 |
|
|
@@ -4013,19 +4013,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L448_bfloat16
|
|
| 4013 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4014 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4015 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4016 |
-
hf_kernels_flash_attn3 2.45%
|
| 4017 |
-
FlashAttnFunc 1.
|
| 4018 |
-
_flash_attn3_48fe103_dirty::fwd
|
| 4019 |
-
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4020 |
-
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4021 |
-
Activity Buffer Request 27.
|
| 4022 |
-
aten::empty 0.
|
| 4023 |
-
cudaFuncSetAttribute 0.
|
| 4024 |
-
cudaLaunchKernel 3.
|
| 4025 |
-
cudaDeviceSynchronize 62.
|
| 4026 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4027 |
-
Self CPU time total: 5.
|
| 4028 |
-
Self CUDA time total: 3.
|
| 4029 |
|
| 4030 |
|
| 4031 |
|
|
@@ -4035,33 +4035,33 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L512_bfloat16
|
|
| 4035 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4036 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4037 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4038 |
-
hf_kernels_flash_attn3 2.
|
| 4039 |
-
FlashAttnFunc 1.
|
| 4040 |
-
_flash_attn3_48fe103_dirty::fwd 1.
|
| 4041 |
-
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4042 |
-
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4043 |
-
Activity Buffer Request 27.
|
| 4044 |
-
aten::empty 0.
|
| 4045 |
-
cudaFuncSetAttribute 0.10% 5.
|
| 4046 |
-
cudaLaunchKernel 3.
|
| 4047 |
-
cudaDeviceSynchronize 63.
|
| 4048 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4049 |
-
Self CPU time total: 5.
|
| 4050 |
-
Self CUDA time total: 3.
|
| 4051 |
|
| 4052 |
|
| 4053 |
impl wl p50(ms) ok
|
| 4054 |
-
hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.
|
| 4055 |
-
hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.
|
| 4056 |
-
hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.
|
| 4057 |
-
hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.
|
| 4058 |
-
hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.
|
| 4059 |
hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.18 True
|
| 4060 |
</pre></div>
|
| 4061 |
<div class="cell-stderr">
|
| 4062 |
Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s]
|
| 4063 |
-
Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.
|
| 4064 |
-
Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.
|
| 4065 |
</div>
|
| 4066 |
<div class="cell-artifacts">
|
| 4067 |
<h4>Artifacts:</h4>
|
|
|
|
| 3871 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3872 |
<span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3873 |
</span> |
|
| 3874 |
+
Cell: benchmark | 5.68s
|
| 3875 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3876 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3877 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3925 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3926 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3927 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3928 |
+
hf_kernels_flash_attn3 3.89% 167.076us 44.49% 1.911ms 1.911ms 0.000us 0.00% 3.576ms 3.576ms 1
|
| 3929 |
+
FlashAttnFunc 3.00% 128.934us 40.60% 1.744ms 581.290us 0.000us 0.00% 3.576ms 1.192ms 3
|
| 3930 |
+
_flash_attn3_48fe103_dirty::fwd 1.82% 78.184us 37.60% 1.615ms 538.312us 2.688ms 100.00% 3.576ms 1.192ms 3
|
| 3931 |
+
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.690ms 100.05% 2.690ms 2.690ms 1
|
| 3932 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.688ms 100.00% 2.688ms 896.117us 3
|
| 3933 |
+
Activity Buffer Request 33.29% 1.430ms 33.29% 1.430ms 1.430ms 887.327us 33.01% 887.327us 887.327us 1
|
| 3934 |
+
aten::empty 1.08% 46.281us 1.08% 46.281us 7.714us 0.000us 0.00% 0.000us 0.000us 6
|
| 3935 |
+
cudaFuncSetAttribute 0.37% 15.900us 0.37% 15.900us 5.300us 0.000us 0.00% 0.000us 0.000us 3
|
| 3936 |
+
cudaLaunchKernel 1.04% 44.671us 1.04% 44.671us 14.890us 0.000us 0.00% 0.000us 0.000us 3
|
| 3937 |
+
cudaDeviceSynchronize 55.51% 2.384ms 55.51% 2.384ms 2.384ms 0.000us 0.00% 0.000us 0.000us 1
|
| 3938 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3939 |
+
Self CPU time total: 4.295ms
|
| 3940 |
+
Self CUDA time total: 2.688ms
|
| 3941 |
|
| 3942 |
|
| 3943 |
|
|
|
|
| 3947 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3948 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3949 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3950 |
+
hf_kernels_flash_attn3 3.06% 130.754us 41.10% 1.758ms 1.758ms 0.000us 0.00% 3.668ms 3.668ms 1
|
| 3951 |
+
FlashAttnFunc 2.23% 95.572us 38.05% 1.627ms 542.455us 0.000us 0.00% 3.668ms 1.223ms 3
|
| 3952 |
+
_flash_attn3_48fe103_dirty::fwd 1.23% 52.754us 35.81% 1.532ms 510.598us 2.747ms 100.00% 3.668ms 1.223ms 3
|
| 3953 |
+
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.748ms 100.05% 2.748ms 2.748ms 1
|
| 3954 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.747ms 100.00% 2.747ms 915.501us 3
|
| 3955 |
+
Activity Buffer Request 33.10% 1.416ms 33.10% 1.416ms 1.416ms 921.272us 33.54% 921.272us 921.272us 1
|
| 3956 |
+
aten::empty 0.63% 26.890us 0.63% 26.890us 4.482us 0.000us 0.00% 0.000us 0.000us 6
|
| 3957 |
+
cudaFuncSetAttribute 0.12% 4.970us 0.12% 4.970us 1.657us 0.000us 0.00% 0.000us 0.000us 3
|
| 3958 |
+
cudaLaunchKernel 0.73% 31.351us 0.73% 31.351us 10.450us 0.000us 0.00% 0.000us 0.000us 3
|
| 3959 |
+
cudaDeviceSynchronize 58.90% 2.519ms 58.90% 2.519ms 2.519ms 0.000us 0.00% 0.000us 0.000us 1
|
| 3960 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3961 |
+
Self CPU time total: 4.277ms
|
| 3962 |
+
Self CUDA time total: 2.747ms
|
| 3963 |
|
| 3964 |
|
| 3965 |
|
|
|
|
| 3969 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3970 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3971 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3972 |
+
hf_kernels_flash_attn3 2.33% 101.653us 39.53% 1.727ms 1.727ms 0.000us 0.00% 3.829ms 3.829ms 1
|
| 3973 |
+
FlashAttnFunc 2.05% 89.593us 37.20% 1.625ms 541.619us 0.000us 0.00% 3.829ms 1.276ms 3
|
| 3974 |
+
_flash_attn3_48fe103_dirty::fwd 1.17% 51.051us 35.15% 1.535ms 511.754us 2.856ms 100.00% 3.829ms 1.276ms 3
|
| 3975 |
+
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.858ms 100.06% 2.858ms 2.858ms 1
|
| 3976 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.856ms 100.00% 2.856ms 952.136us 3
|
| 3977 |
+
Activity Buffer Request 32.54% 1.421ms 32.54% 1.421ms 1.421ms 972.574us 34.05% 972.574us 972.574us 1
|
| 3978 |
+
aten::empty 0.62% 27.231us 0.62% 27.231us 4.538us 0.000us 0.00% 0.000us 0.000us 6
|
| 3979 |
+
cudaFuncSetAttribute 0.12% 5.411us 0.12% 5.411us 1.804us 0.000us 0.00% 0.000us 0.000us 3
|
| 3980 |
+
cudaLaunchKernel 0.69% 30.341us 0.69% 30.341us 10.114us 0.000us 0.00% 0.000us 0.000us 3
|
| 3981 |
+
cudaDeviceSynchronize 60.47% 2.642ms 60.47% 2.642ms 2.642ms 0.000us 0.00% 0.000us 0.000us 1
|
| 3982 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3983 |
+
Self CPU time total: 4.368ms
|
| 3984 |
+
Self CUDA time total: 2.856ms
|
| 3985 |
|
| 3986 |
|
| 3987 |
|
|
|
|
| 3991 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3992 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3993 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3994 |
+
hf_kernels_flash_attn3 2.61% 122.474us 42.62% 2.001ms 2.001ms 0.000us 0.00% 3.906ms 3.906ms 1
|
| 3995 |
+
FlashAttnFunc 1.99% 93.683us 40.01% 1.879ms 626.332us 0.000us 0.00% 3.906ms 1.302ms 3
|
| 3996 |
+
_flash_attn3_48fe103_dirty::fwd 1.17% 54.872us 38.02% 1.785ms 595.104us 2.915ms 100.00% 3.906ms 1.302ms 3
|
| 3997 |
+
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.917ms 100.05% 2.917ms 2.917ms 1
|
| 3998 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.915ms 100.00% 2.915ms 971.727us 3
|
| 3999 |
+
Activity Buffer Request 31.11% 1.461ms 31.11% 1.461ms 1.461ms 991.129us 34.00% 991.129us 991.129us 1
|
| 4000 |
+
aten::empty 0.59% 27.622us 0.59% 27.622us 4.604us 0.000us 0.00% 0.000us 0.000us 6
|
| 4001 |
+
cudaFuncSetAttribute 0.12% 5.820us 0.12% 5.820us 1.940us 0.000us 0.00% 0.000us 0.000us 3
|
| 4002 |
+
cudaLaunchKernel 5.03% 236.178us 5.03% 236.178us 78.726us 0.000us 0.00% 0.000us 0.000us 3
|
| 4003 |
+
cudaDeviceSynchronize 57.38% 2.695ms 57.38% 2.695ms 2.695ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4004 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4005 |
+
Self CPU time total: 4.696ms
|
| 4006 |
+
Self CUDA time total: 2.915ms
|
| 4007 |
|
| 4008 |
|
| 4009 |
|
|
|
|
| 4013 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4014 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4015 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4016 |
+
hf_kernels_flash_attn3 2.45% 124.235us 37.18% 1.882ms 1.882ms 0.000us 0.00% 4.537ms 4.537ms 1
|
| 4017 |
+
FlashAttnFunc 1.83% 92.522us 34.73% 1.758ms 585.897us 0.000us 0.00% 4.537ms 1.512ms 3
|
| 4018 |
+
_flash_attn3_48fe103_dirty::fwd 1.03% 52.313us 32.90% 1.665ms 555.056us 3.398ms 100.00% 4.537ms 1.512ms 3
|
| 4019 |
+
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.399ms 100.05% 3.399ms 3.399ms 1
|
| 4020 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.398ms 100.00% 3.398ms 1.133ms 3
|
| 4021 |
+
Activity Buffer Request 27.82% 1.408ms 27.82% 1.408ms 1.408ms 1.139ms 33.52% 1.139ms 1.139ms 1
|
| 4022 |
+
aten::empty 0.54% 27.441us 0.54% 27.441us 4.573us 0.000us 0.00% 0.000us 0.000us 6
|
| 4023 |
+
cudaFuncSetAttribute 0.12% 5.839us 0.12% 5.839us 1.946us 0.000us 0.00% 0.000us 0.000us 3
|
| 4024 |
+
cudaLaunchKernel 3.39% 171.646us 3.39% 171.646us 57.215us 0.000us 0.00% 0.000us 0.000us 3
|
| 4025 |
+
cudaDeviceSynchronize 62.82% 3.179ms 62.82% 3.179ms 3.179ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4026 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4027 |
+
Self CPU time total: 5.061ms
|
| 4028 |
+
Self CUDA time total: 3.398ms
|
| 4029 |
|
| 4030 |
|
| 4031 |
|
|
|
|
| 4035 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4036 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4037 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4038 |
+
hf_kernels_flash_attn3 2.74% 138.223us 36.95% 1.864ms 1.864ms 0.000us 0.00% 4.557ms 4.557ms 1
|
| 4039 |
+
FlashAttnFunc 1.84% 92.725us 34.21% 1.726ms 575.197us 0.000us 0.00% 4.557ms 1.519ms 3
|
| 4040 |
+
_flash_attn3_48fe103_dirty::fwd 1.03% 52.171us 32.37% 1.633ms 544.289us 3.424ms 100.00% 4.557ms 1.519ms 3
|
| 4041 |
+
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.425ms 100.04% 3.425ms 3.425ms 1
|
| 4042 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.424ms 100.00% 3.424ms 1.141ms 3
|
| 4043 |
+
Activity Buffer Request 27.34% 1.379ms 27.34% 1.379ms 1.379ms 1.133ms 33.10% 1.133ms 1.133ms 1
|
| 4044 |
+
aten::empty 0.57% 28.661us 0.57% 28.661us 4.777us 0.000us 0.00% 0.000us 0.000us 6
|
| 4045 |
+
cudaFuncSetAttribute 0.10% 5.240us 0.10% 5.240us 1.747us 0.000us 0.00% 0.000us 0.000us 3
|
| 4046 |
+
cudaLaunchKernel 3.33% 167.776us 3.33% 167.776us 55.925us 0.000us 0.00% 0.000us 0.000us 3
|
| 4047 |
+
cudaDeviceSynchronize 63.05% 3.181ms 63.05% 3.181ms 3.181ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4048 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4049 |
+
Self CPU time total: 5.045ms
|
| 4050 |
+
Self CUDA time total: 3.424ms
|
| 4051 |
|
| 4052 |
|
| 4053 |
impl wl p50(ms) ok
|
| 4054 |
+
hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.92 True
|
| 4055 |
+
hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.96 True
|
| 4056 |
+
hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.02 True
|
| 4057 |
+
hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.02 True
|
| 4058 |
+
hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.18 True
|
| 4059 |
hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.18 True
|
| 4060 |
</pre></div>
|
| 4061 |
<div class="cell-stderr">
|
| 4062 |
Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s]
|
| 4063 |
+
Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.27it/s]
|
| 4064 |
+
Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.55it/s]
|
| 4065 |
</div>
|
| 4066 |
<div class="cell-artifacts">
|
| 4067 |
<h4>Artifacts:</h4>
|
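The per-operator tables above (Self CPU %, Self CUDA, # of Calls, and so on) are standard torch.profiler key-averages output. A minimal sketch of capturing a comparable table, assuming a callable `attn_fn(q, k, v)` for the kernel being profiled:

    import torch
    from torch.profiler import profile, ProfilerActivity

    def profile_attention(attn_fn, q, k, v, reps: int = 3) -> str:
        # Record both CPU and CUDA activity over a few repetitions.
        with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
            for _ in range(reps):
                attn_fn(q, k, v)
            torch.cuda.synchronize()
        # Aggregate by operator name and sort by total CUDA time, as in the traces above.
        return prof.key_averages().table(sort_by="cuda_time_total", row_limit=15)
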
flash_attn/impls/mem_efficient_attention.html
CHANGED
|
@@ -3869,9 +3869,9 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3869 |
<span class="collapse-indicators">
|
| 3870 |
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
|
| 3871 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3872 |
-
<span id="uv-indicator-benchmark" style="cursor:
|
| 3873 |
</span> |
|
| 3874 |
-
Cell: benchmark |
|
| 3875 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3876 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3877 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3924,28 +3924,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L128_bfloat16
|
|
| 3924 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3925 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3926 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3927 |
-
torch_mem_eff 4.
|
| 3928 |
-
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.
|
| 3929 |
-
aten::scaled_dot_product_attention 0.
|
| 3930 |
-
aten::_scaled_dot_product_efficient_attention 0.35%
|
| 3931 |
-
aten::_efficient_attention_forward 0.
|
| 3932 |
-
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 4.
|
| 3933 |
-
aten::contiguous 0.
|
| 3934 |
-
aten::clone 0.
|
| 3935 |
-
aten::copy_ 1.
|
| 3936 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 3937 |
-
Activity Buffer Request 20.
|
| 3938 |
-
aten::transpose
|
| 3939 |
-
aten::as_strided 0.
|
| 3940 |
-
aten::empty_like 0.
|
| 3941 |
-
aten::empty 1.
|
| 3942 |
-
cudaLaunchKernel 1.
|
| 3943 |
-
cudaStreamIsCapturing 0.04%
|
| 3944 |
-
cudaFuncSetAttribute 0.
|
| 3945 |
-
cudaDeviceSynchronize 67.
|
| 3946 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3947 |
-
Self CPU time total: 7.
|
| 3948 |
-
Self CUDA time total: 5.
|
| 3949 |
|
| 3950 |
|
| 3951 |
|
|
@@ -3955,28 +3955,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L256_bfloat16
|
|
| 3955 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3956 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3957 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3958 |
-
torch_mem_eff 3.
|
| 3959 |
-
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.
|
| 3960 |
-
aten::scaled_dot_product_attention 0.
|
| 3961 |
-
aten::_scaled_dot_product_efficient_attention 0.26% 19.
|
| 3962 |
-
aten::_efficient_attention_forward 0.
|
| 3963 |
-
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.
|
| 3964 |
-
aten::contiguous 0.
|
| 3965 |
-
aten::clone 0.31% 23.
|
| 3966 |
-
aten::copy_ 0.
|
| 3967 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 3968 |
-
Activity Buffer Request 19.
|
| 3969 |
-
aten::transpose 0.
|
| 3970 |
-
aten::as_strided 0.22% 16.
|
| 3971 |
-
aten::empty_like 0.
|
| 3972 |
-
aten::empty 0.
|
| 3973 |
-
cudaLaunchKernel 1.
|
| 3974 |
-
cudaStreamIsCapturing 0.03% 2.
|
| 3975 |
-
cudaFuncSetAttribute 0.
|
| 3976 |
-
cudaDeviceSynchronize
|
| 3977 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3978 |
-
Self CPU time total: 7.
|
| 3979 |
-
Self CUDA time total: 5.
|
| 3980 |
|
| 3981 |
|
| 3982 |
|
|
@@ -3986,28 +3986,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L320_bfloat16
|
|
| 3986 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3987 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3988 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3989 |
-
torch_mem_eff 3.
|
| 3990 |
-
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 3991 |
-
aten::scaled_dot_product_attention 0.24% 18.220us 1.
|
| 3992 |
-
aten::_scaled_dot_product_efficient_attention 0.24% 18.
|
| 3993 |
-
aten::_efficient_attention_forward 0.
|
| 3994 |
-
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.
|
| 3995 |
-
aten::contiguous 0.
|
| 3996 |
-
aten::clone 0.
|
| 3997 |
-
aten::copy_ 0.
|
| 3998 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 3999 |
-
Activity Buffer Request 18.
|
| 4000 |
-
aten::transpose 0.
|
| 4001 |
-
aten::as_strided 0.21% 16.
|
| 4002 |
-
aten::empty_like 0.15% 11.
|
| 4003 |
-
aten::empty 0.
|
| 4004 |
-
cudaLaunchKernel 1.
|
| 4005 |
-
cudaStreamIsCapturing 0.03% 2.
|
| 4006 |
-
cudaFuncSetAttribute 0.04% 3.
|
| 4007 |
-
cudaDeviceSynchronize
|
| 4008 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4009 |
-
Self CPU time total: 7.
|
| 4010 |
-
Self CUDA time total: 6.
|
| 4011 |
|
| 4012 |
|
| 4013 |
|
|
@@ -4017,28 +4017,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L384_bfloat16
|
|
| 4017 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4018 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4019 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4020 |
-
torch_mem_eff
|
| 4021 |
-
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 4022 |
-
aten::scaled_dot_product_attention 0.
|
| 4023 |
-
aten::_scaled_dot_product_efficient_attention 0.
|
| 4024 |
-
aten::_efficient_attention_forward 0.
|
| 4025 |
-
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.
|
| 4026 |
-
aten::contiguous 0.
|
| 4027 |
-
aten::clone 0.
|
| 4028 |
-
aten::copy_
|
| 4029 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4030 |
-
Activity Buffer Request
|
| 4031 |
-
aten::transpose 0.
|
| 4032 |
-
aten::as_strided 0.
|
| 4033 |
-
aten::empty_like 0.
|
| 4034 |
-
aten::empty
|
| 4035 |
-
cudaLaunchKernel 4.
|
| 4036 |
-
cudaStreamIsCapturing 0.
|
| 4037 |
-
cudaFuncSetAttribute 0.
|
| 4038 |
-
cudaDeviceSynchronize
|
| 4039 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4040 |
-
Self CPU time total:
|
| 4041 |
-
Self CUDA time total: 6.
|
| 4042 |
|
| 4043 |
|
| 4044 |
|
|
@@ -4048,28 +4048,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L448_bfloat16
|
|
| 4048 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4049 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4050 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4051 |
-
torch_mem_eff
|
| 4052 |
-
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 4053 |
-
aten::scaled_dot_product_attention 0.
|
| 4054 |
-
aten::_scaled_dot_product_efficient_attention 0.23%
|
| 4055 |
-
aten::_efficient_attention_forward 0.
|
| 4056 |
-
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.
|
| 4057 |
-
aten::contiguous 0.09% 7.
|
| 4058 |
-
aten::clone 0.
|
| 4059 |
-
aten::copy_ 0.
|
| 4060 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4061 |
-
Activity Buffer Request
|
| 4062 |
-
aten::transpose 0.61%
|
| 4063 |
-
aten::as_strided 0.
|
| 4064 |
-
aten::empty_like 0.14% 11.
|
| 4065 |
-
aten::empty 0.
|
| 4066 |
-
cudaLaunchKernel 3.
|
| 4067 |
-
cudaStreamIsCapturing 0.03% 2.
|
| 4068 |
-
cudaFuncSetAttribute 0.
|
| 4069 |
-
cudaDeviceSynchronize 71.
|
| 4070 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4071 |
-
Self CPU time total:
|
| 4072 |
-
Self CUDA time total: 6.
|
| 4073 |
|
| 4074 |
|
| 4075 |
|
|
@@ -4079,38 +4079,90 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L512_bfloat16
|
|
| 4079 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4080 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4081 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4082 |
-
torch_mem_eff
|
| 4083 |
-
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 4084 |
-
aten::scaled_dot_product_attention 0.
|
| 4085 |
-
aten::_scaled_dot_product_efficient_attention 0.
|
| 4086 |
-
aten::_efficient_attention_forward 0.
|
| 4087 |
-
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4088 |
-
aten::contiguous 0.09% 7.
|
| 4089 |
-
aten::clone 0.
|
| 4090 |
-
aten::copy_ 0.
|
| 4091 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4092 |
-
Activity Buffer Request
|
| 4093 |
-
aten::transpose 0.
|
| 4094 |
-
aten::as_strided 0.
|
| 4095 |
-
aten::empty_like 0.
|
| 4096 |
-
aten::empty 0.
|
| 4097 |
-
cudaLaunchKernel
|
| 4098 |
-
cudaStreamIsCapturing 0.03% 2.
|
| 4099 |
-
cudaFuncSetAttribute 0.
|
| 4100 |
-
cudaDeviceSynchronize
|
| 4101 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4102 |
-
Self CPU time total: 8.
|
| 4103 |
-
Self CUDA time total: 6.
|
| 4104 |
|
| 4105 |
|
| 4106 |
impl wl p50(ms) ok
|
| 4107 |
-
torch_mem_eff cuda_attn_L128_bfloat16 1.
|
| 4108 |
-
torch_mem_eff cuda_attn_L256_bfloat16 1.
|
| 4109 |
-
torch_mem_eff cuda_attn_L320_bfloat16 2.
|
| 4110 |
-
torch_mem_eff cuda_attn_L384_bfloat16 2.
|
| 4111 |
-
torch_mem_eff cuda_attn_L448_bfloat16 2.
|
| 4112 |
-
torch_mem_eff cuda_attn_L512_bfloat16 2.
|
| 4113 |
</pre></div>
|
| 4114 |
<div class="cell-artifacts">
|
| 4115 |
<h4>Artifacts:</h4>
|
| 4116 |
<a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
|
|
|
|
| 3869 |
<span class="collapse-indicators">
|
| 3870 |
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
|
| 3871 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3872 |
+
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3873 |
</span> |
|
| 3874 |
+
Cell: benchmark | 32.68s
|
| 3875 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3876 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3877 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3924 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3925 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3926 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3927 |
+
torch_mem_eff 4.77% 340.490us 32.91% 2.350ms 2.350ms 0.000us 0.00% 5.530ms 5.530ms 1
|
| 3928 |
+
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.523ms 100.81% 5.523ms 5.523ms 1
|
| 3929 |
+
aten::scaled_dot_product_attention 0.44% 31.421us 2.67% 190.938us 63.646us 0.000us 0.00% 4.861ms 1.620ms 3
|
| 3930 |
+
aten::_scaled_dot_product_efficient_attention 0.35% 24.771us 2.23% 159.517us 53.172us 0.000us 0.00% 4.861ms 1.620ms 3
|
| 3931 |
+
aten::_efficient_attention_forward 0.51% 36.163us 1.50% 107.413us 35.804us 4.861ms 88.73% 4.861ms 1.620ms 3
|
| 3932 |
+
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 4.861ms 88.73% 4.861ms 1.620ms 3
|
| 3933 |
+
aten::contiguous 0.17% 12.232us 24.52% 1.751ms 194.525us 0.000us 0.00% 668.128us 74.236us 9
|
| 3934 |
+
aten::clone 0.48% 34.579us 24.35% 1.738ms 193.165us 0.000us 0.00% 668.128us 74.236us 9
|
| 3935 |
+
aten::copy_ 1.16% 82.494us 22.79% 1.628ms 180.845us 617.312us 11.27% 668.128us 74.236us 9
|
| 3936 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 617.312us 11.27% 617.312us 68.590us 9
|
| 3937 |
+
Activity Buffer Request 20.35% 1.453ms 20.35% 1.453ms 1.453ms 50.816us 0.93% 50.816us 50.816us 1
|
| 3938 |
+
aten::transpose 1.00% 71.754us 1.33% 95.065us 3.961us 0.000us 0.00% 0.000us 0.000us 24
|
| 3939 |
+
aten::as_strided 0.33% 23.311us 0.33% 23.311us 0.971us 0.000us 0.00% 0.000us 0.000us 24
|
| 3940 |
+
aten::empty_like 0.27% 19.481us 1.07% 76.301us 8.478us 0.000us 0.00% 0.000us 0.000us 9
|
| 3941 |
+
aten::empty 1.26% 89.759us 1.26% 89.759us 4.274us 0.000us 0.00% 0.000us 0.000us 21
|
| 3942 |
+
cudaLaunchKernel 1.62% 115.656us 1.62% 115.656us 9.638us 0.000us 0.00% 0.000us 0.000us 12
|
| 3943 |
+
cudaStreamIsCapturing 0.04% 2.980us 0.04% 2.980us 0.993us 0.000us 0.00% 0.000us 0.000us 3
|
| 3944 |
+
cudaFuncSetAttribute 0.16% 11.490us 0.16% 11.490us 3.830us 0.000us 0.00% 0.000us 0.000us 3
|
| 3945 |
+
cudaDeviceSynchronize 67.09% 4.790ms 67.09% 4.790ms 4.790ms 0.000us 0.00% 0.000us 0.000us 1
|
| 3946 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3947 |
+
Self CPU time total: 7.140ms
|
| 3948 |
+
Self CUDA time total: 5.479ms
|
| 3949 |
|
| 3950 |
|
| 3951 |
|
|
|
|
| 3955 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3956 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3957 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3958 |
+
torch_mem_eff 3.38% 251.986us 27.98% 2.086ms 2.086ms 0.000us 0.00% 6.014ms 6.014ms 1
|
| 3959 |
+
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.969ms 100.15% 5.969ms 5.969ms 1
|
| 3960 |
+
aten::scaled_dot_product_attention 0.27% 19.962us 1.97% 146.646us 48.882us 0.000us 0.00% 5.323ms 1.774ms 3
|
| 3961 |
+
aten::_scaled_dot_product_efficient_attention 0.26% 19.141us 1.70% 126.684us 42.228us 0.000us 0.00% 5.323ms 1.774ms 3
|
| 3962 |
+
aten::_efficient_attention_forward 0.39% 29.281us 1.12% 83.514us 27.838us 5.323ms 89.32% 5.323ms 1.774ms 3
|
| 3963 |
+
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.323ms 89.32% 5.323ms 1.774ms 3
|
| 3964 |
+
aten::contiguous 0.10% 7.510us 22.05% 1.644ms 182.655us 0.000us 0.00% 690.909us 76.768us 9
|
| 3965 |
+
aten::clone 0.31% 23.251us 21.95% 1.636ms 181.821us 0.000us 0.00% 690.909us 76.768us 9
|
| 3966 |
+
aten::copy_ 0.91% 68.131us 20.95% 1.562ms 173.540us 636.478us 10.68% 690.909us 76.768us 9
|
| 3967 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 636.478us 10.68% 636.478us 70.720us 9
|
| 3968 |
+
Activity Buffer Request 19.09% 1.423ms 19.09% 1.423ms 1.423ms 54.431us 0.91% 54.431us 54.431us 1
|
| 3969 |
+
aten::transpose 0.68% 50.542us 0.90% 67.292us 2.804us 0.000us 0.00% 0.000us 0.000us 24
|
| 3970 |
+
aten::as_strided 0.22% 16.750us 0.22% 16.750us 0.698us 0.000us 0.00% 0.000us 0.000us 24
|
| 3971 |
+
aten::empty_like 0.17% 12.371us 0.69% 51.272us 5.697us 0.000us 0.00% 0.000us 0.000us 9
|
| 3972 |
+
aten::empty 0.87% 64.771us 0.87% 64.771us 3.084us 0.000us 0.00% 0.000us 0.000us 21
|
| 3973 |
+
cudaLaunchKernel 1.25% 93.466us 1.25% 93.466us 7.789us 0.000us 0.00% 0.000us 0.000us 12
|
| 3974 |
+
cudaStreamIsCapturing 0.03% 2.400us 0.03% 2.400us 0.800us 0.000us 0.00% 0.000us 0.000us 3
|
| 3975 |
+
cudaFuncSetAttribute 0.05% 3.371us 0.05% 3.371us 1.124us 0.000us 0.00% 0.000us 0.000us 3
|
| 3976 |
+
cudaDeviceSynchronize 72.02% 5.368ms 72.02% 5.368ms 5.368ms 0.000us 0.00% 0.000us 0.000us 1
|
| 3977 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3978 |
+
Self CPU time total: 7.454ms
|
| 3979 |
+
Self CUDA time total: 5.959ms
|
| 3980 |
|
| 3981 |
|
| 3982 |
|
|
|
|
| 3986 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3987 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3988 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3989 |
+
torch_mem_eff 3.08% 235.490us 27.25% 2.083ms 2.083ms 0.000us 0.00% 6.182ms 6.182ms 1
|
| 3990 |
+
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.132ms 100.15% 6.132ms 6.132ms 1
|
| 3991 |
+
aten::scaled_dot_product_attention 0.24% 18.220us 1.86% 142.046us 47.349us 0.000us 0.00% 5.466ms 1.822ms 3
|
| 3992 |
+
aten::_scaled_dot_product_efficient_attention 0.24% 18.131us 1.62% 123.826us 41.275us 0.000us 0.00% 5.466ms 1.822ms 3
|
| 3993 |
+
aten::_efficient_attention_forward 0.37% 27.940us 1.08% 82.291us 27.430us 5.466ms 89.28% 5.466ms 1.822ms 3
|
| 3994 |
+
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.466ms 89.28% 5.466ms 1.822ms 3
|
| 3995 |
+
aten::contiguous 0.10% 7.272us 21.47% 1.642ms 182.409us 0.000us 0.00% 715.197us 79.466us 9
|
| 3996 |
+
aten::clone 0.29% 22.290us 21.38% 1.634ms 181.601us 0.000us 0.00% 715.197us 79.466us 9
|
| 3997 |
+
aten::copy_ 0.83% 63.251us 20.39% 1.559ms 173.182us 656.318us 10.72% 715.197us 79.466us 9
|
| 3998 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 656.318us 10.72% 656.318us 72.924us 9
|
| 3999 |
+
Activity Buffer Request 18.70% 1.430ms 18.70% 1.430ms 1.430ms 58.879us 0.96% 58.879us 58.879us 1
|
| 4000 |
+
aten::transpose 0.93% 71.209us 1.15% 87.625us 3.651us 0.000us 0.00% 0.000us 0.000us 24
|
| 4001 |
+
aten::as_strided 0.21% 16.416us 0.21% 16.416us 0.684us 0.000us 0.00% 0.000us 0.000us 24
|
| 4002 |
+
aten::empty_like 0.15% 11.741us 0.70% 53.481us 5.942us 0.000us 0.00% 0.000us 0.000us 9
|
| 4003 |
+
aten::empty 0.89% 67.840us 0.89% 67.840us 3.230us 0.000us 0.00% 0.000us 0.000us 21
|
| 4004 |
+
cudaLaunchKernel 1.15% 88.022us 1.15% 88.022us 7.335us 0.000us 0.00% 0.000us 0.000us 12
|
| 4005 |
+
cudaStreamIsCapturing 0.03% 2.651us 0.03% 2.651us 0.884us 0.000us 0.00% 0.000us 0.000us 3
|
| 4006 |
+
cudaFuncSetAttribute 0.04% 3.370us 0.04% 3.370us 1.123us 0.000us 0.00% 0.000us 0.000us 3
|
| 4007 |
+
cudaDeviceSynchronize 72.75% 5.562ms 72.75% 5.562ms 5.562ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4008 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4009 |
+
Self CPU time total: 7.646ms
|
| 4010 |
+
Self CUDA time total: 6.123ms
|
| 4011 |
|
| 4012 |
|
| 4013 |
|
|
|
|
| 4017 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4018 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4019 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4020 |
+
torch_mem_eff 2.84% 224.838us 29.78% 2.354ms 2.354ms 0.000us 0.00% 6.170ms 6.170ms 1
|
| 4021 |
+
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.121ms 100.15% 6.121ms 6.121ms 1
|
| 4022 |
+
aten::scaled_dot_product_attention 0.24% 18.891us 1.82% 143.646us 47.882us 0.000us 0.00% 5.458ms 1.819ms 3
|
| 4023 |
+
aten::_scaled_dot_product_efficient_attention 0.24% 19.093us 1.58% 124.755us 41.585us 0.000us 0.00% 5.458ms 1.819ms 3
|
| 4024 |
+
aten::_efficient_attention_forward 0.36% 28.140us 1.04% 82.213us 27.404us 5.458ms 89.30% 5.458ms 1.819ms 3
|
| 4025 |
+
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.458ms 89.30% 5.458ms 1.819ms 3
|
| 4026 |
+
aten::contiguous 0.10% 7.739us 24.57% 1.942ms 215.806us 0.000us 0.00% 711.998us 79.111us 9
|
| 4027 |
+
aten::clone 0.31% 24.450us 24.47% 1.935ms 214.946us 0.000us 0.00% 711.998us 79.111us 9
|
| 4028 |
+
aten::copy_ 0.86% 68.064us 23.51% 1.859ms 206.523us 653.982us 10.70% 711.998us 79.111us 9
|
| 4029 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 653.982us 10.70% 653.982us 72.665us 9
|
| 4030 |
+
Activity Buffer Request 18.84% 1.489ms 18.84% 1.489ms 1.489ms 58.016us 0.95% 58.016us 58.016us 1
|
| 4031 |
+
aten::transpose 0.62% 49.288us 0.84% 66.489us 2.770us 0.000us 0.00% 0.000us 0.000us 24
|
| 4032 |
+
aten::as_strided 0.22% 17.201us 0.22% 17.201us 0.717us 0.000us 0.00% 0.000us 0.000us 24
|
| 4033 |
+
aten::empty_like 0.15% 12.041us 0.65% 51.362us 5.707us 0.000us 0.00% 0.000us 0.000us 9
|
| 4034 |
+
aten::empty 0.83% 65.351us 0.83% 65.351us 3.112us 0.000us 0.00% 0.000us 0.000us 21
|
| 4035 |
+
cudaLaunchKernel 4.09% 323.234us 4.09% 323.234us 26.936us 0.000us 0.00% 0.000us 0.000us 12
|
| 4036 |
+
cudaStreamIsCapturing 0.03% 2.670us 0.03% 2.670us 0.890us 0.000us 0.00% 0.000us 0.000us 3
|
| 4037 |
+
cudaFuncSetAttribute 0.04% 3.430us 0.04% 3.430us 1.143us 0.000us 0.00% 0.000us 0.000us 3
|
| 4038 |
+
cudaDeviceSynchronize 70.22% 5.551ms 70.22% 5.551ms 5.551ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4039 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4040 |
+
Self CPU time total: 7.905ms
|
| 4041 |
+
Self CUDA time total: 6.112ms
|
| 4042 |
|
| 4043 |
|
| 4044 |
|
|
|
|
| 4048 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4049 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4050 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4051 |
+
torch_mem_eff 2.78% 220.799us 28.42% 2.258ms 2.258ms 0.000us 0.00% 6.296ms 6.296ms 1
|
| 4052 |
+
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.245ms 100.15% 6.245ms 6.245ms 1
|
| 4053 |
+
aten::scaled_dot_product_attention 0.24% 19.311us 1.79% 142.116us 47.372us 0.000us 0.00% 5.574ms 1.858ms 3
|
| 4054 |
+
aten::_scaled_dot_product_efficient_attention 0.23% 17.909us 1.55% 122.805us 40.935us 0.000us 0.00% 5.574ms 1.858ms 3
|
| 4055 |
+
aten::_efficient_attention_forward 0.36% 28.682us 1.03% 82.073us 27.358us 5.574ms 89.39% 5.574ms 1.858ms 3
|
| 4056 |
+
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.574ms 89.39% 5.574ms 1.858ms 3
|
| 4057 |
+
aten::contiguous 0.09% 7.009us 23.32% 1.852ms 205.811us 0.000us 0.00% 721.599us 80.178us 9
|
| 4058 |
+
aten::clone 0.28% 22.450us 23.23% 1.845ms 205.033us 0.000us 0.00% 721.599us 80.178us 9
|
| 4059 |
+
aten::copy_ 0.87% 68.713us 22.33% 1.774ms 197.096us 661.695us 10.61% 721.599us 80.178us 9
|
| 4060 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 661.695us 10.61% 661.695us 73.522us 9
|
| 4061 |
+
Activity Buffer Request 17.91% 1.422ms 17.91% 1.422ms 1.422ms 59.904us 0.96% 59.904us 59.904us 1
|
| 4062 |
+
aten::transpose 0.61% 48.435us 0.82% 65.304us 2.721us 0.000us 0.00% 0.000us 0.000us 24
|
| 4063 |
+
aten::as_strided 0.21% 16.869us 0.21% 16.869us 0.703us 0.000us 0.00% 0.000us 0.000us 24
|
| 4064 |
+
aten::empty_like 0.14% 11.511us 0.62% 48.982us 5.442us 0.000us 0.00% 0.000us 0.000us 9
|
| 4065 |
+
aten::empty 0.78% 61.691us 0.78% 61.691us 2.938us 0.000us 0.00% 0.000us 0.000us 21
|
| 4066 |
+
cudaLaunchKernel 3.85% 305.580us 3.85% 305.580us 25.465us 0.000us 0.00% 0.000us 0.000us 12
|
| 4067 |
+
cudaStreamIsCapturing 0.03% 2.440us 0.03% 2.440us 0.813us 0.000us 0.00% 0.000us 0.000us 3
|
| 4068 |
+
cudaFuncSetAttribute 0.05% 3.920us 0.05% 3.920us 1.307us 0.000us 0.00% 0.000us 0.000us 3
|
| 4069 |
+
cudaDeviceSynchronize 71.58% 5.685ms 71.58% 5.685ms 5.685ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4070 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4071 |
+
Self CPU time total: 7.943ms
|
| 4072 |
+
Self CUDA time total: 6.236ms
|
| 4073 |
|
| 4074 |
|
| 4075 |
|
|
|
|
| 4079 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4080 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4081 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4082 |
+
torch_mem_eff 3.27% 267.711us 29.30% 2.401ms 2.401ms 0.000us 0.00% 6.459ms 6.459ms 1
|
| 4083 |
+
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.406ms 100.13% 6.406ms 6.406ms 1
|
| 4084 |
+
aten::scaled_dot_product_attention 0.24% 19.643us 1.85% 151.176us 50.392us 0.000us 0.00% 5.726ms 1.909ms 3
|
| 4085 |
+
aten::_scaled_dot_product_efficient_attention 0.26% 20.920us 1.61% 131.533us 43.844us 0.000us 0.00% 5.726ms 1.909ms 3
|
| 4086 |
+
aten::_efficient_attention_forward 0.37% 30.563us 1.03% 84.603us 28.201us 5.726ms 89.50% 5.726ms 1.909ms 3
|
| 4087 |
+
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.726ms 89.50% 5.726ms 1.909ms 3
|
| 4088 |
+
aten::contiguous 0.09% 7.670us 23.58% 1.932ms 214.647us 0.000us 0.00% 733.247us 81.472us 9
|
| 4089 |
+
aten::clone 0.31% 25.042us 23.48% 1.924ms 213.795us 0.000us 0.00% 733.247us 81.472us 9
|
| 4090 |
+
aten::copy_ 0.88% 72.162us 22.52% 1.845ms 205.052us 671.711us 10.50% 733.247us 81.472us 9
|
| 4091 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 671.711us 10.50% 671.711us 74.635us 9
|
| 4092 |
+
Activity Buffer Request 17.78% 1.456ms 17.78% 1.456ms 1.456ms 61.536us 0.96% 61.536us 61.536us 1
|
| 4093 |
+
aten::transpose 0.71% 58.110us 0.93% 75.842us 3.160us 0.000us 0.00% 0.000us 0.000us 24
|
| 4094 |
+
aten::as_strided 0.22% 17.732us 0.22% 17.732us 0.739us 0.000us 0.00% 0.000us 0.000us 24
|
| 4095 |
+
aten::empty_like 0.15% 12.319us 0.65% 53.641us 5.960us 0.000us 0.00% 0.000us 0.000us 9
|
| 4096 |
+
aten::empty 0.81% 66.513us 0.81% 66.513us 3.167us 0.000us 0.00% 0.000us 0.000us 21
|
| 4097 |
+
cudaLaunchKernel 4.14% 339.159us 4.14% 339.159us 28.263us 0.000us 0.00% 0.000us 0.000us 12
|
| 4098 |
+
cudaStreamIsCapturing 0.03% 2.379us 0.03% 2.379us 0.793us 0.000us 0.00% 0.000us 0.000us 3
|
| 4099 |
+
cudaFuncSetAttribute 0.05% 4.230us 0.05% 4.230us 1.410us 0.000us 0.00% 0.000us 0.000us 3
|
| 4100 |
+
cudaDeviceSynchronize 70.70% 5.793ms 70.70% 5.793ms 5.793ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4101 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4102 |
+
Self CPU time total: 8.193ms
|
| 4103 |
+
Self CUDA time total: 6.398ms
|
| 4104 |
|
| 4105 |
|
| 4106 |
impl wl p50(ms) ok
|
| 4107 |
+
torch_mem_eff cuda_attn_L128_bfloat16 1.86 True
|
| 4108 |
+
torch_mem_eff cuda_attn_L256_bfloat16 1.97 True
|
| 4109 |
+
torch_mem_eff cuda_attn_L320_bfloat16 2.04 True
|
| 4110 |
+
torch_mem_eff cuda_attn_L384_bfloat16 2.06 True
|
| 4111 |
+
torch_mem_eff cuda_attn_L448_bfloat16 2.03 True
|
| 4112 |
+
torch_mem_eff cuda_attn_L512_bfloat16 2.19 True
|
| 4113 |
</pre></div>
+<div class="uv-install-logs" id="uv-logs-benchmark">
+<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
+<div class="uv-logs-content" style="display: none;">
+Building kernels-benchmark-tools @ file:///__w/kernels-benchmarks/kernels-benchmarks/tools
+Downloading networkx (1.9MiB)
+Downloading matplotlib (8.3MiB)
+Downloading nvidia-cufft-cu12 (184.2MiB)
+Downloading sympy (6.0MiB)
+Downloading nvidia-cusolver-cu12 (255.1MiB)
+Downloading nvidia-cublas-cu12 (566.8MiB)
+Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
+Downloading numpy (16.2MiB)
+Downloading nvidia-nvjitlink-cu12 (37.4MiB)
+Downloading setuptools (1.1MiB)
+Downloading nvidia-cudnn-cu12 (674.0MiB)
+Downloading nvidia-curand-cu12 (60.7MiB)
+Downloading nvidia-nccl-cu12 (307.4MiB)
+Downloading kiwisolver (1.4MiB)
+Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
+Downloading fonttools (4.7MiB)
+Downloading nvidia-cusparse-cu12 (274.9MiB)
+Downloading pillow (6.7MiB)
+Downloading nvidia-cusparselt-cu12 (273.9MiB)
+Downloading nvidia-cufile-cu12 (1.1MiB)
+Downloading triton (148.3MiB)
+Downloading torch (846.9MiB)
+Downloading nvidia-cufile-cu12
+Downloading kiwisolver
+Downloading setuptools
+Downloading fonttools
+Downloading networkx
+Downloading pillow
+Built kernels-benchmark-tools @ file:///__w/kernels-benchmarks/kernels-benchmarks/tools
+Downloading nvidia-cuda-cupti-cu12
+Downloading matplotlib
+Downloading numpy
+Downloading sympy
+Downloading nvidia-nvjitlink-cu12
+Downloading nvidia-curand-cu12
+Downloading nvidia-cuda-nvrtc-cu12
+Downloading triton
+Downloading nvidia-cufft-cu12
+Downloading nvidia-cusolver-cu12
+Downloading nvidia-cusparse-cu12
+Downloading nvidia-cusparselt-cu12
+Downloading nvidia-nccl-cu12
+Downloading nvidia-cublas-cu12
+Downloading nvidia-cudnn-cu12
+Downloading torch
+Installed 37 packages in 216ms
+</div>
+</div>
 <div class="cell-artifacts">
 <h4>Artifacts:</h4>
 <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
flash_attn/impls/sage_attention.html CHANGED
@@ -3869,15 +3869,15 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <span class="collapse-indicators">
 <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
-<span id="uv-indicator-benchmark"
+<span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> |
-Cell: benchmark | 4.37s
+Cell: benchmark | 4.22s
 | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 <a href="https://github.com/huggingface/kernels-uvnotes/blob/main/flash_attn/impls/sage_attention.md" target="_blank" class="github-btn">GitHub</a>
 </div>
-<div id="code-benchmark" class="cell-code" data-lines="
+<div id="code-benchmark" class="cell-code" data-lines="32">
 <div class="code-wrap">
 <div class="highlight"><pre><span></span><span class="c1"># /// script</span>
 <span class="c1"># requires-python = ">=3.10"</span>
@@ -3886,7 +3886,6 @@ Cell: benchmark | 4.37s
 <span class="c1"># "torch==2.8.0",</span>
 <span class="c1"># "kernels",</span>
 <span class="c1"># "kernels-benchmark-tools",</span>
-<span class="c1"># "sageattention",</span>
 <span class="c1"># ]</span>
 <span class="c1">#</span>
 <span class="c1"># [tool.uv.sources]</span>
@@ -3921,28 +3920,23 @@ Cell: benchmark | 4.37s
 <div class="cell-stdout"><pre class="stdout-text">Running attention benchmark on cuda with 6 workloads.
 impl wl p50(ms) ok
 sage_int8_fp16 cuda_attn_L128_bfloat16 FAIL False
-Error: module '
+Error: module 'sage_attention_fd11035eb4318b27' has no attribute 'fwd'
 sage_int8_fp16 cuda_attn_L256_bfloat16 FAIL False
-Error: module '
+Error: module 'sage_attention_fd11035eb4318b27' has no attribute 'fwd'
 sage_int8_fp16 cuda_attn_L320_bfloat16 FAIL False
-Error: module '
+Error: module 'sage_attention_fd11035eb4318b27' has no attribute 'fwd'
 sage_int8_fp16 cuda_attn_L384_bfloat16 FAIL False
-Error: module '
+Error: module 'sage_attention_fd11035eb4318b27' has no attribute 'fwd'
 sage_int8_fp16 cuda_attn_L448_bfloat16 FAIL False
-Error: module '
+Error: module 'sage_attention_fd11035eb4318b27' has no attribute 'fwd'
 sage_int8_fp16 cuda_attn_L512_bfloat16 FAIL False
-Error: module '
+Error: module 'sage_attention_fd11035eb4318b27' has no attribute 'fwd'
 </pre></div>
-<div class="
-</div>
-<div class="cell-stderr">Fetching 11 files: 0%| | 0/11 [00:00<?, ?it/s]
-Fetching 11 files: 27%|██▋ | 3/11 [00:00<00:00, 14.92it/s]
-Fetching 11 files: 73%|███████▎ | 8/11 [00:00<00:00, 14.19it/s]
-Fetching 11 files: 100%|██████████| 11/11 [00:00<00:00, 19.60it/s]</div>
+<div class="cell-stderr">
+Fetching 11 files: 0%| | 0/11 [00:00<?, ?it/s]
+Fetching 11 files: 73%|███████▎ | 8/11 [00:00<00:00, 13.92it/s]
+Fetching 11 files: 100%|██████████| 11/11 [00:00<00:00, 19.13it/s]
+</div>
 </div>
 <div class="cell-artifacts">
 <h4>Artifacts:</h4>
 <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
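Every workload above fails the same way: the loaded kernel module does not export the "fwd" entry point the benchmark calls. A hedged sketch of the kind of guard that turns this into a clean skip instead of six FAIL rows (the module name is taken from the log, "fwd" is the attribute the harness expects; everything else is illustrative, not code from this repo):

import importlib

def resolve_fwd(module_name: str):
    # Return module.fwd if the extension exposes it, else None.
    try:
        mod = importlib.import_module(module_name)
    except ImportError:
        return None
    return getattr(mod, "fwd", None)  # avoids "has no attribute 'fwd'" at call time

fwd = resolve_fwd("sage_attention_fd11035eb4318b27")  # name from the log above
if fwd is None:
    print("sage_int8_fp16: fwd kernel unavailable -- skipping workloads")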
flash_attn/impls/xformers.html CHANGED
@@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
 <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
 </span> |
-Cell: benchmark | 5.
+Cell: benchmark | 5.02s
 | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3923,21 +3923,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L128_bfloat16
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
 Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-xformers_meff
-xformers_flash3::flash_fwd 4.
-flash_attn_3::fwd 1.
-xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.
-void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
-Activity Buffer Request 31.
-aten::empty 0.
-cudaFuncSetAttribute 0.
-cudaLaunchKernel 0.
-aten::reshape 0.
-aten::view 0.
-cudaDeviceSynchronize
+xformers_meff 9.93% 451.937us 49.71% 2.262ms 2.262ms 0.000us 0.00% 3.695ms 3.695ms 1
+xformers_flash3::flash_fwd 4.26% 193.656us 38.96% 1.773ms 590.904us 0.000us 0.00% 3.695ms 1.232ms 3
+flash_attn_3::fwd 1.62% 73.841us 34.71% 1.579ms 526.352us 2.795ms 100.00% 3.695ms 1.232ms 3
+xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.797ms 100.05% 2.797ms 2.797ms 1
+void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.795ms 100.00% 2.795ms 931.773us 3
+Activity Buffer Request 31.17% 1.418ms 31.17% 1.418ms 1.418ms 899.421us 32.18% 899.421us 899.421us 1
+aten::empty 0.76% 34.741us 0.76% 34.741us 5.790us 0.000us 0.00% 0.000us 0.000us 6
+cudaFuncSetAttribute 0.30% 13.732us 0.30% 13.732us 4.577us 0.000us 0.00% 0.000us 0.000us 3
+cudaLaunchKernel 0.85% 38.662us 0.85% 38.662us 12.887us 0.000us 0.00% 0.000us 0.000us 3
+aten::reshape 0.35% 15.860us 0.82% 37.181us 6.197us 0.000us 0.00% 0.000us 0.000us 6
+aten::view 0.47% 21.321us 0.47% 21.321us 3.553us 0.000us 0.00% 0.000us 0.000us 6
+cudaDeviceSynchronize 50.29% 2.288ms 50.29% 2.288ms 2.288ms 0.000us 0.00% 0.000us 0.000us 1
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 4.
-Self CUDA time total: 2.
+Self CPU time total: 4.550ms
+Self CUDA time total: 2.795ms
@@ -3947,21 +3947,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L256_bfloat16
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
 Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-xformers_meff
-xformers_flash3::flash_fwd 3.
-flash_attn_3::fwd 1.
-xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.
-void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
-Activity Buffer Request
-aten::empty 0.
-cudaFuncSetAttribute 0.
-cudaLaunchKernel 0.
-aten::reshape 0.20% 8.951us 0.
-aten::view 0.
-cudaDeviceSynchronize
+xformers_meff 6.95% 312.321us 44.96% 2.021ms 2.021ms 0.000us 0.00% 3.832ms 3.832ms 1
+xformers_flash3::flash_fwd 3.14% 141.315us 37.51% 1.686ms 561.970us 0.000us 0.00% 3.832ms 1.277ms 3
+flash_attn_3::fwd 1.18% 53.030us 34.37% 1.545ms 514.865us 2.890ms 100.00% 3.832ms 1.277ms 3
+xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.892ms 100.05% 2.892ms 2.892ms 1
+void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.890ms 100.00% 2.890ms 963.329us 3
+Activity Buffer Request 31.64% 1.422ms 31.64% 1.422ms 1.422ms 942.465us 32.61% 942.465us 942.465us 1
+aten::empty 0.68% 30.660us 0.68% 30.660us 5.110us 0.000us 0.00% 0.000us 0.000us 6
+cudaFuncSetAttribute 0.12% 5.592us 0.12% 5.592us 1.864us 0.000us 0.00% 0.000us 0.000us 3
+cudaLaunchKernel 0.74% 33.432us 0.74% 33.432us 11.144us 0.000us 0.00% 0.000us 0.000us 3
+aten::reshape 0.20% 8.951us 0.50% 22.691us 3.782us 0.000us 0.00% 0.000us 0.000us 6
+aten::view 0.31% 13.740us 0.31% 13.740us 2.290us 0.000us 0.00% 0.000us 0.000us 6
+cudaDeviceSynchronize 55.04% 2.474ms 55.04% 2.474ms 2.474ms 0.000us 0.00% 0.000us 0.000us 1
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 4.
-Self CUDA time total: 2.
+Self CPU time total: 4.495ms
+Self CUDA time total: 2.890ms
@@ -3971,21 +3971,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L320_bfloat16
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
 Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-xformers_meff 6.
-xformers_flash3::flash_fwd 3.
-flash_attn_3::fwd 1.
-xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.
-void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
-Activity Buffer Request
-aten::empty 0.
-cudaFuncSetAttribute 0.
-cudaLaunchKernel 0.73%
-aten::reshape 0.
-aten::view 0.31%
-cudaDeviceSynchronize
+xformers_meff 6.65% 298.008us 44.73% 2.006ms 2.006ms 0.000us 0.00% 3.867ms 3.867ms 1
+xformers_flash3::flash_fwd 3.15% 141.235us 37.58% 1.685ms 561.690us 0.000us 0.00% 3.867ms 1.289ms 3
+flash_attn_3::fwd 1.18% 53.120us 34.43% 1.544ms 514.611us 2.888ms 100.00% 3.867ms 1.289ms 3
+xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.890ms 100.06% 2.890ms 2.890ms 1
+void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.888ms 100.00% 2.888ms 962.683us 3
+Activity Buffer Request 31.72% 1.422ms 31.72% 1.422ms 1.422ms 978.939us 33.90% 978.939us 978.939us 1
+aten::empty 0.67% 30.192us 0.67% 30.192us 5.032us 0.000us 0.00% 0.000us 0.000us 6
+cudaFuncSetAttribute 0.12% 5.491us 0.12% 5.491us 1.830us 0.000us 0.00% 0.000us 0.000us 3
+cudaLaunchKernel 0.73% 32.901us 0.73% 32.901us 10.967us 0.000us 0.00% 0.000us 0.000us 3
+aten::reshape 0.20% 8.773us 0.50% 22.603us 3.767us 0.000us 0.00% 0.000us 0.000us 6
+aten::view 0.31% 13.830us 0.31% 13.830us 2.305us 0.000us 0.00% 0.000us 0.000us 6
+cudaDeviceSynchronize 55.27% 2.478ms 55.27% 2.478ms 2.478ms 0.000us 0.00% 0.000us 0.000us 1
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 4.
-Self CUDA time total: 2.
+Self CPU time total: 4.484ms
+Self CUDA time total: 2.888ms
@@ -3995,21 +3995,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L384_bfloat16
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
 Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-xformers_meff 6.
-xformers_flash3::flash_fwd
-flash_attn_3::fwd 1.
-xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.
-void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
-Activity Buffer Request
-aten::empty 0.
-cudaFuncSetAttribute 0.
-cudaLaunchKernel
-aten::reshape 0.
-aten::view 0.
-cudaDeviceSynchronize 53.
+xformers_meff 6.31% 299.042us 46.56% 2.205ms 2.205ms 0.000us 0.00% 3.936ms 3.936ms 1
+xformers_flash3::flash_fwd 2.97% 140.784us 39.75% 1.883ms 627.609us 0.000us 0.00% 3.936ms 1.312ms 3
+flash_attn_3::fwd 1.10% 52.191us 36.78% 1.742ms 580.681us 2.941ms 100.00% 3.936ms 1.312ms 3
+xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.943ms 100.05% 2.943ms 2.943ms 1
+void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.941ms 100.00% 2.941ms 980.445us 3
+Activity Buffer Request 30.11% 1.426ms 30.11% 1.426ms 1.426ms 994.973us 33.83% 994.973us 994.973us 1
+aten::empty 0.64% 30.333us 0.64% 30.333us 5.055us 0.000us 0.00% 0.000us 0.000us 6
+cudaFuncSetAttribute 0.11% 5.440us 0.11% 5.440us 1.813us 0.000us 0.00% 0.000us 0.000us 3
+cudaLaunchKernel 4.81% 227.898us 4.81% 227.898us 75.966us 0.000us 0.00% 0.000us 0.000us 3
+aten::reshape 0.19% 8.769us 0.49% 23.220us 3.870us 0.000us 0.00% 0.000us 0.000us 6
+aten::view 0.31% 14.451us 0.31% 14.451us 2.409us 0.000us 0.00% 0.000us 0.000us 6
+cudaDeviceSynchronize 53.44% 2.531ms 53.44% 2.531ms 2.531ms 0.000us 0.00% 0.000us 0.000us 1
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 4.
-Self CUDA time total: 2.
+Self CPU time total: 4.736ms
+Self CUDA time total: 2.941ms
@@ -4019,21 +4019,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L448_bfloat16
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
 Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-xformers_meff 5.
-xformers_flash3::flash_fwd 2.
-flash_attn_3::fwd
-xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.
-void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
-Activity Buffer Request 27.
-aten::empty 0.
-cudaFuncSetAttribute 0.
-cudaLaunchKernel 3.
-aten::reshape 0.
-aten::view 0.
-cudaDeviceSynchronize
+xformers_meff 5.82% 299.962us 41.73% 2.152ms 2.152ms 0.000us 0.00% 4.566ms 4.566ms 1
+xformers_flash3::flash_fwd 2.76% 142.114us 35.47% 1.829ms 609.751us 0.000us 0.00% 4.566ms 1.522ms 3
+flash_attn_3::fwd 1.04% 53.631us 32.71% 1.687ms 562.380us 3.419ms 100.00% 4.566ms 1.522ms 3
+xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.420ms 100.05% 3.420ms 3.420ms 1
+void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.419ms 100.00% 3.419ms 1.140ms 3
+Activity Buffer Request 27.56% 1.422ms 27.56% 1.422ms 1.422ms 1.148ms 33.58% 1.148ms 1.148ms 1
+aten::empty 0.60% 31.172us 0.60% 31.172us 5.195us 0.000us 0.00% 0.000us 0.000us 6
+cudaFuncSetAttribute 0.11% 5.431us 0.11% 5.431us 1.810us 0.000us 0.00% 0.000us 0.000us 3
+cudaLaunchKernel 3.40% 175.366us 3.40% 175.366us 58.455us 0.000us 0.00% 0.000us 0.000us 3
+aten::reshape 0.17% 8.849us 0.45% 23.030us 3.838us 0.000us 0.00% 0.000us 0.000us 6
+aten::view 0.27% 14.181us 0.27% 14.181us 2.363us 0.000us 0.00% 0.000us 0.000us 6
+cudaDeviceSynchronize 58.27% 3.005ms 58.27% 3.005ms 3.005ms 0.000us 0.00% 0.000us 0.000us 1
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 5.
-Self CUDA time total: 3.
+Self CPU time total: 5.157ms
+Self CUDA time total: 3.419ms
@@ -4043,30 +4043,30 @@ PROFILE TRACE: xformers_meff | cuda_attn_L512_bfloat16
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
 Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-xformers_meff 5.
-xformers_flash3::flash_fwd 2.75%
-flash_attn_3::fwd 1.04% 53.
-xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.
-void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
-Activity Buffer Request 27.
-aten::empty 0.
-cudaFuncSetAttribute 0.
-cudaLaunchKernel 3.
-aten::reshape 0.
-aten::view 0.
-cudaDeviceSynchronize 58.
+xformers_meff 5.76% 295.800us 41.67% 2.139ms 2.139ms 0.000us 0.00% 4.557ms 4.557ms 1
+xformers_flash3::flash_fwd 2.75% 141.044us 35.47% 1.821ms 606.924us 0.000us 0.00% 4.557ms 1.519ms 3
+flash_attn_3::fwd 1.04% 53.523us 32.72% 1.680ms 559.910us 3.405ms 100.00% 4.557ms 1.519ms 3
+xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.407ms 100.05% 3.407ms 3.407ms 1
+void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.405ms 100.00% 3.405ms 1.135ms 3
+Activity Buffer Request 27.67% 1.420ms 27.67% 1.420ms 1.420ms 1.152ms 33.82% 1.152ms 1.152ms 1
+aten::empty 0.60% 30.610us 0.60% 30.610us 5.102us 0.000us 0.00% 0.000us 0.000us 6
+cudaFuncSetAttribute 0.12% 6.310us 0.12% 6.310us 2.103us 0.000us 0.00% 0.000us 0.000us 3
+cudaLaunchKernel 3.29% 168.946us 3.29% 168.946us 56.315us 0.000us 0.00% 0.000us 0.000us 3
+aten::reshape 0.17% 8.721us 0.44% 22.392us 3.732us 0.000us 0.00% 0.000us 0.000us 6
+aten::view 0.27% 13.671us 0.27% 13.671us 2.279us 0.000us 0.00% 0.000us 0.000us 6
+cudaDeviceSynchronize 58.33% 2.994ms 58.33% 2.994ms 2.994ms 0.000us 0.00% 0.000us 0.000us 1
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 5.
-Self CUDA time total: 3.
+Self CPU time total: 5.133ms
+Self CUDA time total: 3.405ms
 impl wl p50(ms) ok
-xformers_meff cuda_attn_L128_bfloat16
-xformers_meff cuda_attn_L256_bfloat16 1.
-xformers_meff cuda_attn_L320_bfloat16 1.
-xformers_meff cuda_attn_L384_bfloat16 1.
-xformers_meff cuda_attn_L448_bfloat16 1.
-xformers_meff cuda_attn_L512_bfloat16 1.
+xformers_meff cuda_attn_L128_bfloat16 0.98 True
+xformers_meff cuda_attn_L256_bfloat16 1.03 True
+xformers_meff cuda_attn_L320_bfloat16 1.08 True
+xformers_meff cuda_attn_L384_bfloat16 1.10 True
+xformers_meff cuda_attn_L448_bfloat16 1.23 True
+xformers_meff cuda_attn_L512_bfloat16 1.22 True
 </pre></div>
 <div class="uv-install-logs" id="uv-logs-benchmark">
 <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
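The PROFILE TRACE blocks above follow the layout of torch.profiler's key_averages().table(). A minimal sketch that produces a table in this shape (illustrative; the real harness lives in cells/benchmark.py, and the xformers_meff label is just a record_function tag here):

import torch
from torch.profiler import profile, record_function, ProfilerActivity

if torch.cuda.is_available():
    q = k = v = torch.randn(1, 8, 512, 64, device="cuda", dtype=torch.bfloat16)
    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
        with record_function("xformers_meff"):  # names the top-level row in the table
            torch.nn.functional.scaled_dot_product_attention(q, k, v)
        torch.cuda.synchronize()
    # Columns match the traces above: Self CPU %, Self CPU, CPU total %, ..., # of Calls
    print(prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=12))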
flash_attn/results/artifacts/combine/latency.svg CHANGED
Git LFS Details (old and new revisions of the LFS-tracked plot)
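latency.svg is the combined latency figure written by the combine step. A hedged sketch of how such a plot can be drawn from per-implementation p50 numbers (the two series shown are taken from the summary tables above; everything else is illustrative, not the repo's combine.py):

import matplotlib
matplotlib.use("Agg")  # headless backend, as on a CI runner
import matplotlib.pyplot as plt

seq_lens = [128, 256, 320, 384, 448, 512]
p50_ms = {
    "torch_mem_eff": [1.86, 1.97, 2.04, 2.06, 2.03, 2.19],
    "xformers_meff": [0.98, 1.03, 1.08, 1.10, 1.23, 1.22],
}
fig, ax = plt.subplots(figsize=(8, 4.5))
for impl, ys in p50_ms.items():
    ax.plot(seq_lens, ys, marker="o", label=impl)
ax.set_xlabel("sequence length")
ax.set_ylabel("p50 latency (ms)")
ax.legend()
fig.savefig("latency.svg")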
flash_attn/results/combined_results.html CHANGED
@@ -3872,7 +3872,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <rdf:RDF>
 <ns2:Work>
 <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
-<dc:date>2025-10-
+<dc:date>2025-10-28T14:09:17.505622</dc:date>
 <dc:format>image/svg+xml</dc:format>
 <dc:creator>
 <ns2:Agent>
@@ -3982,96 +3982,96 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 [embedded latency-plot SVG: y-axis grid paths, tick marks, and tick labels (1.0, 1.2, 1.4, 1.6, 1.8, 2.0, ...) regenerated for the new run's data; ylabel markup unchanged]
@@ -4079,73 +4079,73 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 [embedded latency-plot SVG: data-point positions updated for the torch-flash-ma, torch-mem-eff, xformers-meff, hf-kernels-flash-attn, and hf-kernels-flash-attn3 series]
@@ -4230,7 +4230,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
 <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
 </span> |
-Cell: combine | 4.
 | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
 <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
@@ -4315,7 +4315,7 @@ LOADING BENCHMARK DATA
 ✓ xFormers : /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/269846603898e0ee1872d7a8b40fca43ba558b2f3400f8a7bedb1ee79df7da58
 ✓ HF Kernels Flash Attn : /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/c1c92a22d205ca145ffb0083188c0f8eef512cfd6aa091b1e49d6329fbd08849
 ✓ HF Kernels Flash Attn3 : /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/8d741e4aa09c527ddf0f50ffa03a7e840559990c66178bfb9cf04bd97f3efd20
-✓ SageAttention : /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/
 ✓ Found Flash (PyTorch SDPA)
 Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/4b81c2b991fc4a0f70c4117e933abc4007fd7f3f55394d7778a4074adf29df04/attention.jsonl
@@ -4328,7 +4328,7 @@ LOADING BENCHMARK DATA
 ✓ Found HF Kernels Flash Attn3
 Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/8d741e4aa09c527ddf0f50ffa03a7e840559990c66178bfb9cf04bd97f3efd20/attention.jsonl
 ✓ Found SageAttention
-Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/
 ======================================================================
 Summary: 6 found, 0 skipped, 0 missing
@@ -4337,48 +4337,48 @@ Summary: 6 found, 0 skipped, 0 missing
 COMBINED BENCHMARK SUMMARY
 impl wl p50(ms) ok
-hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.
-hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.
-hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.
-hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.
-hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.
-hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.
-hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.
-hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.
-hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.
-hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.
-hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.
 hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.18 True
 sage_int8_fp16 cuda_attn_L128_bfloat16 FAIL False
-Error: module '
 sage_int8_fp16 cuda_attn_L256_bfloat16 FAIL False
-Error: module '
 sage_int8_fp16 cuda_attn_L320_bfloat16 FAIL False
-Error: module '
 sage_int8_fp16 cuda_attn_L384_bfloat16 FAIL False
-Error: module '
 sage_int8_fp16 cuda_attn_L448_bfloat16 FAIL False
-Error: module '
 sage_int8_fp16 cuda_attn_L512_bfloat16 FAIL False
-Error: module '
 torch_flash_ma cuda_attn_L128_bfloat16 1.22 True
 torch_flash_ma cuda_attn_L256_bfloat16 1.27 True
-torch_flash_ma cuda_attn_L320_bfloat16 1.
-torch_flash_ma cuda_attn_L384_bfloat16 1.
-torch_flash_ma cuda_attn_L448_bfloat16 1.
-torch_flash_ma cuda_attn_L512_bfloat16 1.
-torch_mem_eff cuda_attn_L128_bfloat16 1.
-torch_mem_eff cuda_attn_L256_bfloat16 1.
-torch_mem_eff cuda_attn_L320_bfloat16 2.
-torch_mem_eff cuda_attn_L384_bfloat16 2.
-torch_mem_eff cuda_attn_L448_bfloat16 2.
-torch_mem_eff cuda_attn_L512_bfloat16 2.
-xformers_meff cuda_attn_L128_bfloat16
-xformers_meff cuda_attn_L256_bfloat16 1.
-xformers_meff cuda_attn_L320_bfloat16 1.
-xformers_meff cuda_attn_L384_bfloat16 1.
-xformers_meff cuda_attn_L448_bfloat16 1.
-xformers_meff cuda_attn_L512_bfloat16 1.

 GENERATING COMBINED VISUALIZATION
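The combined summary is a merge of each implementation's attention.jsonl records. A hedged sketch of the kind of aggregation involved (the record field names are assumptions, not read from the repo's schema):

import json
from pathlib import Path

rows = []
for path in Path(".").glob("**/attention.jsonl"):  # e.g. the per-impl cache dirs above
    for line in path.read_text().splitlines():
        rec = json.loads(line)
        # "impl", "wl", "lat_ms", "ok" are assumed field names
        p50 = rec["lat_ms"]["p50"] if rec.get("ok") else None
        rows.append((rec["impl"], rec["wl"]["name"], p50, rec.get("ok", False)))

print(f"{'impl':<24} {'wl':<28} {'p50(ms)':>8} ok")
for impl, wl, p50, ok in sorted(rows):
    val = "FAIL" if p50 is None else f"{p50:.2f}"
    print(f"{impl:<24} {wl:<28} {val:>8} {ok}")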
@@ -4402,7 +4402,7 @@ Implementations included:
 <div class="uv-install-logs" id="uv-logs-combine">
 <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 <div class="uv-logs-content" style="display: none;">
-Installed 37 packages in 259ms
 </div>
 </div>
 <div class="cell-artifacts">
@@ -4415,7 +4415,7 @@ Installed 37 packages in 259ms
 <rdf:RDF>
 <ns2:Work>
 <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
-<dc:date>2025-10-
 <dc:format>image/svg+xml</dc:format>
 <dc:creator>
 <ns2:Agent>
@@ -4525,96 +4525,96 @@ Installed 37 packages in 259ms
 [second embedded copy of the latency plot: SVG y-axis grid paths, tick marks, and tick labels regenerated for the new run]
@@ -4622,73 +4622,73 @@ Installed 37 packages in 259ms
 [second embedded copy: SVG data-point positions updated for the torch-flash-ma, torch-mem-eff, xformers-meff, hf-kernels-flash-attn, and hf-kernels-flash-attn3 series]
|
|
|
|
[SVG latency-plot markup, updated lines 3872-4151: new <dc:date>2025-10-28T14:09:17.505622</dc:date>, refreshed y-axis grid lines and tick labels (1.0 through 2.2 in steps of 0.2), and new line paths plus circular data markers for the series torch-flash-ma, torch-mem-eff, xformers-meff, hf-kernels-flash-attn, and hf-kernels-flash-attn3. Within the visible hunks only the date stamp and generated coordinates change.]
|
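The hunks above only touch generated SVG markup for the latency plot. As a reference, a minimal sketch of how such a p50-latency plot could be regenerated from the benchmark JSONL records; the field names (impl, wl.name, lat_ms.p50, ok) come from records shown elsewhere in this commit, while matplotlib and the "attention.jsonl" path are assumptions, not the actual plotting cell:

import json
from collections import defaultdict
import matplotlib.pyplot as plt

# Collect p50 latency per implementation across workloads (record layout assumed from this commit's JSONL).
series = defaultdict(list)
with open("attention.jsonl") as f:
    for line in f:
        rec = json.loads(line)
        if rec.get("ok"):
            series[rec["impl"]].append((rec["wl"]["name"], rec["lat_ms"]["p50"]))

fig, ax = plt.subplots(figsize=(9, 5))
for impl, points in sorted(series.items()):
    labels, p50s = zip(*points)
    ax.plot(labels, p50s, marker="o", label=impl)  # one line per impl, like the series in the SVG
ax.set_ylabel("p50 latency (ms)")
ax.grid(axis="y", alpha=0.3)
ax.legend()
fig.savefig("latency.svg")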
|
|
|
| 4230 |   <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
| 4231 |   <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
| 4232 |   </span> |
| 4233 | + Cell: combine | 4.25s
| 4234 |   | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
| 4235 |   <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
| 4236 |   <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>

| 4315 |   ✓ xFormers : /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/269846603898e0ee1872d7a8b40fca43ba558b2f3400f8a7bedb1ee79df7da58
| 4316 |   ✓ HF Kernels Flash Attn : /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/c1c92a22d205ca145ffb0083188c0f8eef512cfd6aa091b1e49d6329fbd08849
| 4317 |   ✓ HF Kernels Flash Attn3 : /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/8d741e4aa09c527ddf0f50ffa03a7e840559990c66178bfb9cf04bd97f3efd20
| 4318 | + ✓ SageAttention : /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/f6be24aff45575cad8d1df490ac5fe9ec944103fb255665c71719ca2d7efea4e
| 4319 |
| 4320 |   ✓ Found Flash (PyTorch SDPA)
| 4321 |   Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/4b81c2b991fc4a0f70c4117e933abc4007fd7f3f55394d7778a4074adf29df04/attention.jsonl

| 4328 |   ✓ Found HF Kernels Flash Attn3
| 4329 |   Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/8d741e4aa09c527ddf0f50ffa03a7e840559990c66178bfb9cf04bd97f3efd20/attention.jsonl
| 4330 |   ✓ Found SageAttention
| 4331 | + Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/f6be24aff45575cad8d1df490ac5fe9ec944103fb255665c71719ca2d7efea4e/attention.jsonl
| 4332 |
| 4333 |   ======================================================================
| 4334 |   Summary: 6 found, 0 skipped, 0 missing
|
|
|
| 4337 |   COMBINED BENCHMARK SUMMARY
| 4338 |
| 4339 |   impl                       wl                         p50(ms)   ok
| 4340 | + hf_kernels_flash_attn      cuda_attn_L128_bfloat16       0.96   True
| 4341 | + hf_kernels_flash_attn      cuda_attn_L256_bfloat16       1.01   True
| 4342 | + hf_kernels_flash_attn      cuda_attn_L320_bfloat16       1.06   True
| 4343 | + hf_kernels_flash_attn      cuda_attn_L384_bfloat16       1.05   True
| 4344 | + hf_kernels_flash_attn      cuda_attn_L448_bfloat16       1.22   True
| 4345 | + hf_kernels_flash_attn      cuda_attn_L512_bfloat16       1.21   True
| 4346 | + hf_kernels_flash_attn3     cuda_attn_L128_bfloat16       0.92   True
| 4347 | + hf_kernels_flash_attn3     cuda_attn_L256_bfloat16       0.96   True
| 4348 | + hf_kernels_flash_attn3     cuda_attn_L320_bfloat16       1.02   True
| 4349 | + hf_kernels_flash_attn3     cuda_attn_L384_bfloat16       1.02   True
| 4350 | + hf_kernels_flash_attn3     cuda_attn_L448_bfloat16       1.18   True
| 4351 |   hf_kernels_flash_attn3     cuda_attn_L512_bfloat16       1.18   True
| 4352 |   sage_int8_fp16             cuda_attn_L128_bfloat16       FAIL   False
| 4353 | + Error: module 'sage_attention_fd11035eb4318b27' has no attribute 'fwd'
| 4354 |   sage_int8_fp16             cuda_attn_L256_bfloat16       FAIL   False
| 4355 | + Error: module 'sage_attention_fd11035eb4318b27' has no attribute 'fwd'
| 4356 |   sage_int8_fp16             cuda_attn_L320_bfloat16       FAIL   False
| 4357 | + Error: module 'sage_attention_fd11035eb4318b27' has no attribute 'fwd'
| 4358 |   sage_int8_fp16             cuda_attn_L384_bfloat16       FAIL   False
| 4359 | + Error: module 'sage_attention_fd11035eb4318b27' has no attribute 'fwd'
| 4360 |   sage_int8_fp16             cuda_attn_L448_bfloat16       FAIL   False
| 4361 | + Error: module 'sage_attention_fd11035eb4318b27' has no attribute 'fwd'
| 4362 |   sage_int8_fp16             cuda_attn_L512_bfloat16       FAIL   False
| 4363 | + Error: module 'sage_attention_fd11035eb4318b27' has no attribute 'fwd'
| 4364 |   torch_flash_ma             cuda_attn_L128_bfloat16       1.22   True
| 4365 |   torch_flash_ma             cuda_attn_L256_bfloat16       1.27   True
| 4366 | + torch_flash_ma             cuda_attn_L320_bfloat16       1.28   True
| 4367 | + torch_flash_ma             cuda_attn_L384_bfloat16       1.31   True
| 4368 | + torch_flash_ma             cuda_attn_L448_bfloat16       1.47   True
| 4369 | + torch_flash_ma             cuda_attn_L512_bfloat16       1.50   True
| 4370 | + torch_mem_eff              cuda_attn_L128_bfloat16       1.86   True
| 4371 | + torch_mem_eff              cuda_attn_L256_bfloat16       1.97   True
| 4372 | + torch_mem_eff              cuda_attn_L320_bfloat16       2.04   True
| 4373 | + torch_mem_eff              cuda_attn_L384_bfloat16       2.06   True
| 4374 | + torch_mem_eff              cuda_attn_L448_bfloat16       2.03   True
| 4375 | + torch_mem_eff              cuda_attn_L512_bfloat16       2.19   True
| 4376 | + xformers_meff              cuda_attn_L128_bfloat16       0.98   True
| 4377 | + xformers_meff              cuda_attn_L256_bfloat16       1.03   True
| 4378 | + xformers_meff              cuda_attn_L320_bfloat16       1.08   True
| 4379 | + xformers_meff              cuda_attn_L384_bfloat16       1.10   True
| 4380 | + xformers_meff              cuda_attn_L448_bfloat16       1.23   True
| 4381 | + xformers_meff              cuda_attn_L512_bfloat16       1.22   True
| 4382 |
| 4383 |   GENERATING COMBINED VISUALIZATION
| 4384 |
|
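The summary table above is assembled by the combine cell from each implementation's attention.jsonl records. Below is a minimal sketch of that aggregation, assuming the record layout visible in this commit (impl, wl.name, lat_ms.p50, ok, err) and the .uvnote/cache directory layout shown in the paths above; the real cells/combine.py may differ:

import glob
import json

rows = []
for path in sorted(glob.glob(".uvnote/cache/*/attention.jsonl")):  # cache layout assumed
    with open(path) as f:
        for line in f:
            rec = json.loads(line)
            ok = bool(rec.get("ok"))
            p50 = f"{rec['lat_ms']['p50']:.2f}" if ok else "FAIL"
            rows.append((rec["impl"], rec["wl"]["name"], p50, ok, rec.get("err")))

# Print one row per (impl, workload), with the error message under failing rows.
print(f"{'impl':<27}{'wl':<27}{'p50(ms)':>8}  ok")
for impl, wl, p50, ok, err in sorted(rows, key=lambda r: (r[0], r[1])):
    print(f"{impl:<27}{wl:<27}{p50:>8}  {ok}")
    if err:
        print(f"  Error: {err}")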
|
|
|
| 4402 |   <div class="uv-install-logs" id="uv-logs-combine">
| 4403 |   <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
| 4404 |   <div class="uv-logs-content" style="display: none;">
| 4405 | + Installed 37 packages in 187ms
| 4406 |   </div>
| 4407 |   </div>
| 4408 |   <div class="cell-artifacts">
|
|
|
[Second embedded copy of the same latency-plot SVG, updated lines 4415-4694: identical date-stamp, axis-tick, and series-coordinate changes as the block above for torch-flash-ma, torch-mem-eff, xformers-meff, hf-kernels-flash-attn, and hf-kernels-flash-attn3.]
|
index.html
CHANGED

@@ -80,8 +80,10 @@

| 80 |   <h1>Index of /</h1>
| 81 |   <ul>
| 82 |   <li><a href='activation/index.html' class='dir'>activation/</a></li>
| 83 |   <li><a href='flash_attn/index.html' class='dir'>flash_attn/</a></li>
| 84 |   <li><a href='layer_norm/index.html' class='dir'>layer_norm/</a></li>
| 85 |   </ul>
| 86 |   </body>
| 87 |   </html>

| 80 |   <h1>Index of /</h1>
| 81 |   <ul>
| 82 |   <li><a href='activation/index.html' class='dir'>activation/</a></li>
| 83 | + <li><a href='causal_conv1d/index.html' class='dir'>causal_conv1d/</a></li>
| 84 |   <li><a href='flash_attn/index.html' class='dir'>flash_attn/</a></li>
| 85 |   <li><a href='layer_norm/index.html' class='dir'>layer_norm/</a></li>
| 86 | + <li><a href='rotary/index.html' class='dir'>rotary/</a></li>
| 87 |   </ul>
| 88 |   </body>
| 89 |   </html>
layer_norm/impls/artifacts/benchmark/layer_norm.jsonl
CHANGED
|
@@ -1,48 +1,4 @@
|
|
| 1 |
-
{"ts": "2025-10-
|
| 2 |
-
{"ts": "2025-10-
|
| 3 |
-
{"ts": "2025-10-
|
| 4 |
-
{"ts": "2025-10-
|
| 5 |
-
{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S512_D1024", "batch": 1, "seq_len": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04395000019030704, "p50": 0.045061000037094345, "p90": 0.046920999920985196, "mean": 0.04563460001918429, "iqr": 0.0018609998733154498, "raw_times": [0.04718099989986513, 0.046920999920985196, 0.045060000047669746, 0.045061000037094345, 0.04395000019030704], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05090100012239418, "peak_bytes": 9441280, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015716552734375, "mse": 1.138448715209961e-05, "ref": "layer_norm_fp32"}, "err": null}
|
| 6 |
-
{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S512_D2048", "batch": 1, "seq_len": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04656000010072603, "p50": 0.046920999920985196, "p90": 0.04878100003224972, "mean": 0.04884479999418545, "iqr": 0.0020300001324358163, "raw_times": [0.04656000010072603, 0.046750999899813905, 0.04878100003224972, 0.0552110000171524, 0.046920999920985196], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0497399998948822, "peak_bytes": 18882560, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00154876708984375, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null}
|
| 7 |
-
{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S512_D4096", "batch": 1, "seq_len": 512, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04567099995256285, "p50": 0.04622100004780805, "p90": 0.04798100007974426, "mean": 0.047496800016233465, "iqr": 0.0018200000795332016, "raw_times": [0.04567099995256285, 0.0514500000008411, 0.04616100000021106, 0.04798100007974426, 0.04622100004780805], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04885000021204178, "peak_bytes": 37765120, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015716552734375, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null}
|
| 8 |
-
{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S512_D8192", "batch": 1, "seq_len": 512, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04487000001063279, "p50": 0.045961000068928115, "p90": 0.046200000042517786, "mean": 0.04860060003011313, "iqr": 0.000509000074089272, "raw_times": [0.06028100006005843, 0.04487000001063279, 0.045690999968428514, 0.045961000068928115, 0.046200000042517786], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05061100000602892, "peak_bytes": 75530240, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00157928466796875, "mse": 1.150369644165039e-05, "ref": "layer_norm_fp32"}, "err": null}
|
| 9 |
-
{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S1024_D1024", "batch": 1, "seq_len": 1024, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.043511000058060745, "p50": 0.046270999973785365, "p90": 0.04790999992110301, "mean": 0.047574600012012525, "iqr": 0.002919999815276242, "raw_times": [0.044990000105826766, 0.04790999992110301, 0.043511000058060745, 0.05519100000128674, 0.046270999973785365], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.048970999841913, "peak_bytes": 21008384, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00157928466796875, "mse": 1.138448715209961e-05, "ref": "layer_norm_fp32"}, "err": null}
|
| 10 |
-
{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S1024_D2048", "batch": 1, "seq_len": 1024, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.043170000026293565, "p50": 0.04767099994751334, "p90": 0.0476899999739544, "mean": 0.04691639996963204, "iqr": 0.0009390000741404947, "raw_times": [0.043170000026293565, 0.04930000000058499, 0.04767099994751334, 0.046750999899813905, 0.0476899999739544], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05084099984742352, "peak_bytes": 37756928, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015716552734375, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null}
|
| 11 |
-
{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S1024_D4096", "batch": 1, "seq_len": 1024, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.044720999994751764, "p50": 0.045860000000175205, "p90": 0.046411000084845, "mean": 0.04585680003401649, "iqr": 0.0012000000424450263, "raw_times": [0.044720999994751764, 0.04708100004791049, 0.046411000084845, 0.045860000000175205, 0.045211000042399974], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05302099998516496, "peak_bytes": 75513856, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0016021728515625, "mse": 1.1682510375976562e-05, "ref": "layer_norm_fp32"}, "err": null}
|
| 12 |
-
{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S1024_D8192", "batch": 1, "seq_len": 1024, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04476999993130448, "p50": 0.04614999988916679, "p90": 0.04633100002138235, "mean": 0.04639259996110923, "iqr": 0.00019000003703695256, "raw_times": [0.04476999993130448, 0.04614999988916679, 0.04857099997934711, 0.0461409999843454, 0.04633100002138235], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.047730999995110324, "peak_bytes": 151027712, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00156402587890625, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null}
|
| 13 |
-
{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S2048_D1024", "batch": 1, "seq_len": 2048, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04606099992088275, "p50": 0.046679999968546326, "p90": 0.04687099999500788, "mean": 0.0466285999664251, "iqr": 0.0006509999366244301, "raw_times": [0.04606099992088275, 0.047310999889305094, 0.04687099999500788, 0.046679999968546326, 0.04622000005838345], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.050389999842082034, "peak_bytes": 41979904, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00159454345703125, "mse": 1.150369644165039e-05, "ref": "layer_norm_fp32"}, "err": null}
|
| 14 |
-
{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S2048_D2048", "batch": 1, "seq_len": 2048, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04560100001071987, "p50": 0.04617999979927845, "p90": 0.04656999999497202, "mean": 0.0462445999346528, "iqr": 0.0007090000053722179, "raw_times": [0.04617999979927845, 0.045860999989599804, 0.04560100001071987, 0.04701099987869384, 0.04656999999497202], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.049061000026995316, "peak_bytes": 75505664, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00159454345703125, "mse": 1.1563301086425781e-05, "ref": "layer_norm_fp32"}, "err": null}
|
| 15 |
-
{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S2048_D4096", "batch": 1, "seq_len": 2048, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04474100001061743, "p50": 0.04615000011654047, "p90": 0.04696099995271652, "mean": 0.046176800060493406, "iqr": 0.0009599998520570807, "raw_times": [0.047031000121933175, 0.04474100001061743, 0.04600100010065944, 0.04615000011654047, 0.04696099995271652], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.051490000032572425, "peak_bytes": 151011328, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00156402587890625, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null}
|
| 16 |
-
{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S2048_D8192", "batch": 1, "seq_len": 2048, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.051341000016691396, "p50": 0.05155100006959401, "p90": 0.05226099983701715, "mean": 0.051880799992432, "iqr": 0.0007709998044447275, "raw_times": [0.051341000016691396, 0.051490000032572425, 0.05226099983701715, 0.05155100006959401, 0.05276100000628503], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.053531000048678834, "peak_bytes": 302022656, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00152587890625, "mse": 1.1146068572998047e-05, "ref": "layer_norm_fp32"}, "err": null}
|
| 17 |
-
{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S128_D1024", "batch": 4, "seq_len": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.044059999936507666, "p50": 0.04549100003714557, "p90": 0.045540999963122886, "mean": 0.04540859999906388, "iqr": 0.0004099999841855606, "raw_times": [0.04549100003714557, 0.044059999936507666, 0.04682000007960596, 0.045130999978937325, 0.045540999963122886], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.048860999868338695, "peak_bytes": 69242880, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015716552734375, "mse": 1.138448715209961e-05, "ref": "layer_norm_fp32"}, "err": null}
|
| 18 |
-
{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S128_D2048", "batch": 4, "seq_len": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04604099990501709, "p50": 0.04642099997909099, "p90": 0.04698099996858218, "mean": 0.05290099998092046, "iqr": 0.0009299999419454252, "raw_times": [0.07901100002527528, 0.04698099996858218, 0.04604099990501709, 0.04605100002663676, 0.04642099997909099], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.048481000021638465, "peak_bytes": 18882560, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00154876708984375, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null}
|
| 19 |
-
{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S128_D4096", "batch": 4, "seq_len": 128, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04422100005285756, "p50": 0.045961000068928115, "p90": 0.04607100004250242, "mean": 0.04557280003609776, "iqr": 0.0010700000530050602, "raw_times": [0.04500099998949736, 0.045961000068928115, 0.04422100005285756, 0.046610000026703347, 0.04607100004250242], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05000100009056041, "peak_bytes": 37765120, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015716552734375, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null}
|
| 20 |
-
{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S128_D8192", "batch": 4, "seq_len": 128, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.044550999973580474, "p50": 0.04615100010596507, "p90": 0.04661999992094934, "mean": 0.04619880000973353, "iqr": 0.0006089999260439072, "raw_times": [0.04661999992094934, 0.047661000053267344, 0.04615100010596507, 0.04601099999490543, 0.044550999973580474], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05021999982091074, "peak_bytes": 75530240, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00157928466796875, "mse": 1.150369644165039e-05, "ref": "layer_norm_fp32"}, "err": null}
|
| 21 |
-
{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S512_D1024", "batch": 4, "seq_len": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04479100016396842, "p50": 0.04570999999486958, "p90": 0.04578100015351083, "mean": 0.04546060008578934, "iqr": 0.0006410000423784368, "raw_times": [0.045881000005465467, 0.04578100015351083, 0.045140000111132395, 0.04479100016396842, 0.04570999999486958], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05074099999546888, "peak_bytes": 37752832, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00159454345703125, "mse": 1.150369644165039e-05, "ref": "layer_norm_fp32"}, "err": null}
|
| 22 |
-
{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S512_D2048", "batch": 4, "seq_len": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04388000002109038, "p50": 0.046260999852165696, "p90": 0.047070999926290824, "mean": 0.046070799999142764, "iqr": 0.0010899998414970469, "raw_times": [0.04716100011137314, 0.04598100008479378, 0.04388000002109038, 0.046260999852165696, 0.047070999926290824], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05007100003240339, "peak_bytes": 75505664, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00159454345703125, "mse": 1.1563301086425781e-05, "ref": "layer_norm_fp32"}, "err": null}
|
| 23 |
-
{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S512_D4096", "batch": 4, "seq_len": 512, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04435100004229753, "p50": 0.045130999978937325, "p90": 0.04698099996858218, "mean": 0.04562479998639901, "iqr": 0.0023600000531587284, "raw_times": [0.044620999915423454, 0.04698099996858218, 0.04704000002675457, 0.04435100004229753, 0.045130999978937325], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04849099991588446, "peak_bytes": 151011328, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00156402587890625, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null}
|
| 24 |
-
{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S512_D8192", "batch": 4, "seq_len": 512, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05119000002196117, "p50": 0.05123000005369249, "p90": 0.05150099991624302, "mean": 0.051574400004028575, "iqr": 0.00027999999474559445, "raw_times": [0.05122099992149742, 0.05150099991624302, 0.05123000005369249, 0.052730000106748776, 0.05119000002196117], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05633099999613478, "peak_bytes": 302022656, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00152587890625, "mse": 1.1146068572998047e-05, "ref": "layer_norm_fp32"}, "err": null}
|
| 25 |
-
{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S1024_D1024", "batch": 4, "seq_len": 1024, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04580100016937649, "p50": 0.04708999995273189, "p90": 0.04770099985762499, "mean": 0.05188039999666216, "iqr": 0.00096099984148168, "raw_times": [0.07206999998743413, 0.04674000001614331, 0.04580100016937649, 0.04708999995273189, 0.04770099985762499], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04944000011164462, "peak_bytes": 83922944, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015869140625, "mse": 1.150369644165039e-05, "ref": "layer_norm_fp32"}, "err": null}
|
| 26 |
-
{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S1024_D2048", "batch": 4, "seq_len": 1024, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04320099992582982, "p50": 0.04512100008469133, "p90": 0.04604099990501709, "mean": 0.04527500000222062, "iqr": 0.001329999804511317, "raw_times": [0.04320099992582982, 0.04604099990501709, 0.04471100010050577, 0.0473009999950591, 0.04512100008469133], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.051181000117139774, "peak_bytes": 151003136, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015716552734375, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null}
|
| 27 |
-
{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S1024_D4096", "batch": 4, "seq_len": 1024, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04984999986845651, "p50": 0.050290999979552, "p90": 0.050490999910834944, "mean": 0.050288599959458224, "iqr": 0.0005399999736255268, "raw_times": [0.04995099993720942, 0.050490999910834944, 0.050290999979552, 0.050860000101238256, 0.04984999986845651], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.052241000048525166, "peak_bytes": 302006272, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null}
|
| 28 |
-
{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S1024_D8192", "batch": 4, "seq_len": 1024, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2064129998871067, "p50": 0.2123330000358692, "p90": 0.218262999851504, "mean": 0.2148927999769512, "iqr": 0.010130999726243317, "raw_times": [0.20813200012526067, 0.218262999851504, 0.2123330000358692, 0.2064129998871067, 0.22932299998501549], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21481299995684822, "peak_bytes": 604012544, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00152587890625, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null}
|
| 29 |
-
{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S2048_D1024", "batch": 4, "seq_len": 2048, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04353000008450181, "p50": 0.04543000000012398, "p90": 0.04657099998439662, "mean": 0.04557060001388891, "iqr": 0.001390000079481979, "raw_times": [0.04543000000012398, 0.04518099990491464, 0.04714100009550748, 0.04657099998439662, 0.04353000008450181], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0490809998154873, "peak_bytes": 167809024, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015869140625, "mse": 1.150369644165039e-05, "ref": "layer_norm_fp32"}, "err": null}
|
| 30 |
-
{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S2048_D2048", "batch": 4, "seq_len": 2048, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.054420999958892935, "p50": 0.05506100001184677, "p90": 0.055460999874412664, "mean": 0.055042999929355574, "iqr": 0.0008699998943484388, "raw_times": [0.054420999958892935, 0.055460999874412664, 0.05568099982156127, 0.054590999980064225, 0.05506100001184677], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05802099985885434, "peak_bytes": 301998080, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00157928466796875, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null}
|
| 31 |
-
{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S2048_D4096", "batch": 4, "seq_len": 2048, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.20996300008846447, "p50": 0.2102230000673444, "p90": 0.21053299997220165, "mean": 0.21050080003988114, "iqr": 0.0004209998678561533, "raw_times": [0.20996300008846447, 0.21053299997220165, 0.2102230000673444, 0.2101120001043455, 0.2116729999670497], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21157300011509506, "peak_bytes": 603996160, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1265277862548828e-05, "ref": "layer_norm_fp32"}, "err": null}
|
| 32 |
-
{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S2048_D8192", "batch": 4, "seq_len": 2048, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.4341660001045966, "p50": 0.4372359999251785, "p90": 0.4383160000998032, "mean": 0.437980000015159, "iqr": 0.004120000085094944, "raw_times": [0.4383160000998032, 0.4372359999251785, 0.4341660001045966, 0.43419600001470826, 0.44598599993150856], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.44448700009525055, "peak_bytes": 1207992320, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00152587890625, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null}
|
| 33 |
-
{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S128_D1024", "batch": 16, "seq_len": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04514099987318332, "p50": 0.0465299999632407, "p90": 0.04655099996853096, "mean": 0.04629059999388119, "iqr": 0.0011309998626529705, "raw_times": [0.04514099987318332, 0.04655099996853096, 0.04781100005857297, 0.04542000010587799, 0.0465299999632407], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04811100006918423, "peak_bytes": 276860928, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00159454345703125, "mse": 1.150369644165039e-05, "ref": "layer_norm_fp32"}, "err": null}
|
| 34 |
-
{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S128_D2048", "batch": 16, "seq_len": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.043820999962917995, "p50": 0.045551000084742554, "p90": 0.04633000003195775, "mean": 0.04580079998959263, "iqr": 0.0007890000688348664, "raw_times": [0.043820999962917995, 0.04776099990522198, 0.045551000084742554, 0.045540999963122886, 0.04633000003195775], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05054000007476134, "peak_bytes": 75505664, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00159454345703125, "mse": 1.1563301086425781e-05, "ref": "layer_norm_fp32"}, "err": null}
|
| 35 |
-
{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S128_D4096", "batch": 16, "seq_len": 128, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04511099996307166, "p50": 0.04610100017998775, "p90": 0.04624100006367371, "mean": 0.04598500008796691, "iqr": 0.0004099999841855606, "raw_times": [0.04583100007948815, 0.04664100015361328, 0.04511099996307166, 0.04610100017998775, 0.04624100006367371], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04932000001645065, "peak_bytes": 151011328, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00156402587890625, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null}
|
| 36 |
-
{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S128_D8192", "batch": 16, "seq_len": 128, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05165100014892232, "p50": 0.052130999847577186, "p90": 0.05317099999047059, "mean": 0.05250480003269331, "iqr": 0.0012309999419812812, "raw_times": [0.052130999847577186, 0.05165100014892232, 0.053631000128007145, 0.05317099999047059, 0.05194000004848931], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.055000999964249786, "peak_bytes": 302022656, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00152587890625, "mse": 1.1146068572998047e-05, "ref": "layer_norm_fp32"}, "err": null}
|
| 37 |
-
{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S512_D1024", "batch": 16, "seq_len": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.045381000063571264, "p50": 0.045759999920846894, "p90": 0.04781100005857297, "mean": 0.04770240002471837, "iqr": 0.00238100005844899, "raw_times": [0.045759999920846894, 0.04781100005857297, 0.045381000063571264, 0.04543000000012398, 0.05413000008047675], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04919000002701068, "peak_bytes": 150999040, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015869140625, "mse": 1.150369644165039e-05, "ref": "layer_norm_fp32"}, "err": null}
|
| 38 |
-
{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S512_D2048", "batch": 16, "seq_len": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05421099990599032, "p50": 0.054861000080563826, "p90": 0.05564100001720362, "mean": 0.05508300000656163, "iqr": 0.0010100000054080738, "raw_times": [0.056071000017254846, 0.054861000080563826, 0.05564100001720362, 0.05421099990599032, 0.05463100001179555], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05805000000691507, "peak_bytes": 301998080, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00157928466796875, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null}
|
| 39 |
-
{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S512_D4096", "batch": 16, "seq_len": 512, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.20916299990858533, "p50": 0.21016300001974741, "p90": 0.21141399997759436, "mean": 0.21107719999235997, "iqr": 0.0015210000583465444, "raw_times": [0.21141399997759436, 0.2147530001366249, 0.2098929999192478, 0.21016300001974741, 0.20916299990858533], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21191299993006396, "peak_bytes": 603996160, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1265277862548828e-05, "ref": "layer_norm_fp32"}, "err": null}
|
| 40 |
-
{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S512_D8192", "batch": 16, "seq_len": 512, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.43155599996680394, "p50": 0.43475600000419945, "p90": 0.4373360000045068, "mean": 0.43558200000006764, "iqr": 0.003800000058618025, "raw_times": [0.43475600000419945, 0.44072600007893925, 0.4373360000045068, 0.43353599994588876, 0.43155599996680394], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.44892699997944874, "peak_bytes": 1207992320, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00152587890625, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null}
|
| 41 |
-
{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S1024_D1024", "batch": 16, "seq_len": 1024, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0483500000427739, "p50": 0.049099999841928366, "p90": 0.04950099992129253, "mean": 0.050544599935165024, "iqr": 0.0011199999789823778, "raw_times": [0.048380999942310154, 0.04950099992129253, 0.05739099992752017, 0.049099999841928366, 0.0483500000427739], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05153099982635467, "peak_bytes": 335581184, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015869140625, "mse": 1.150369644165039e-05, "ref": "layer_norm_fp32"}, "err": null}
|
| 42 |
-
{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S1024_D2048", "batch": 16, "seq_len": 1024, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2181429999836837, "p50": 0.2215729998624738, "p90": 0.2217329999893991, "mean": 0.22086119997766218, "iqr": 0.003440000000409782, "raw_times": [0.2181429999836837, 0.2217329999893991, 0.21829299998898932, 0.2215729998624738, 0.22456400006376498], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22583300005862839, "peak_bytes": 603987968, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015716552734375, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null}
|
| 43 |
-
{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S1024_D4096", "batch": 16, "seq_len": 1024, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.43596600016826415, "p50": 0.4398270000365301, "p90": 0.4409260000102222, "mean": 0.4390922000766295, "iqr": 0.003549999973984086, "raw_times": [0.4398270000365301, 0.4409260000102222, 0.4413660001318931, 0.43596600016826415, 0.4373760000362381], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.44040700004188693, "peak_bytes": 1207975936, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null}
|
| 44 |
-
{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S1024_D8192", "batch": 16, "seq_len": 1024, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.8329219999723136, "p50": 0.8419220000632777, "p90": 0.8434520000264456, "mean": 0.84072780000497, "iqr": 0.002130000211764127, "raw_times": [0.8329219999723136, 0.8419220000632777, 0.8440210001481319, 0.8434520000264456, 0.8413219998146815], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.8442119999472197, "peak_bytes": 2415951872, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00151824951171875, "mse": 1.1146068572998047e-05, "ref": "layer_norm_fp32"}, "err": null}
|
| 45 |
-
{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D1024", "batch": 16, "seq_len": 2048, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21230300012575753, "p50": 0.2135429999725602, "p90": 0.2142630000889767, "mean": 0.21426700000120036, "iqr": 0.0008800002433417831, "raw_times": [0.21230300012575753, 0.2133829998456349, 0.2135429999725602, 0.2142630000889767, 0.21784299997307244], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22175300000526477, "peak_bytes": 671125504, "ok": false, "absmax": 0.0625, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.0625, "mae": 0.0015869140625, "mse": 1.1563301086425781e-05, "ref": "layer_norm_fp32"}, "err": null}
|
| 46 |
-
{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D2048", "batch": 16, "seq_len": 2048, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.4536460000963416, "p50": 0.45670700001210207, "p90": 0.4569770001126017, "mean": 0.45669080004699936, "iqr": 0.00113999999484804, "raw_times": [0.4536460000963416, 0.4569770001126017, 0.45583700011775363, 0.45670700001210207, 0.4602869998961978], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.4546860000118613, "peak_bytes": 1207967744, "ok": false, "absmax": 0.0625, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.0625, "mae": 0.0015716552734375, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null}
|
| 47 |
-
{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D4096", "batch": 16, "seq_len": 2048, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.8352710001418018, "p50": 0.8370320001631626, "p90": 0.8388319999994565, "mean": 0.8375798000997747, "iqr": 0.0019899998733308166, "raw_times": [0.8352710001418018, 0.8368420001261256, 0.8399220000683272, 0.8370320001631626, 0.8388319999994565], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.849921000053655, "peak_bytes": 2415935488, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null}
|
| 48 |
-
{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D8192", "batch": 16, "seq_len": 2048, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.6451530000267667, "p50": 1.6546740000649152, "p90": 1.6553830000702874, "mean": 1.6516054000476288, "iqr": 0.008870000101524056, "raw_times": [1.6553830000702874, 1.6465129999687633, 1.6563040001074114, 1.6546740000649152, 1.6451530000267667], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.655194000022675, "peak_bytes": 4831870976, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1086463928222656e-05, "ref": "layer_norm_fp32"}, "err": null}
|
|
|
|
| 1 |
+
{"ts": "2025-10-28T14:08:59Z", "run": "c74bca3f7fd14f779c98e3d8b69c0098", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D4096", "batch": 16, "seq_len": 2048, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.8268990000033227, "p50": 0.8360890000176369, "p90": 0.8378790000733716, "mean": 0.8358750000070359, "iqr": 0.002010000116570154, "raw_times": [0.8426389999840467, 0.8268990000033227, 0.8378790000733716, 0.8360890000176369, 0.8358689999568014], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.8452999999235544, "peak_bytes": 2415935488, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null}
|
| 2 |
+
{"ts": "2025-10-28T14:09:00Z", "run": "c74bca3f7fd14f779c98e3d8b69c0098", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D8192", "batch": 16, "seq_len": 2048, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.6477070000746608, "p50": 1.6516379999984565, "p90": 1.6565669999408783, "mean": 1.6533151999965412, "iqr": 0.006360999918797461, "raw_times": [1.6565669999408783, 1.6516379999984565, 1.6477070000746608, 1.6604579999466296, 1.6502060000220808], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.6544470000781075, "peak_bytes": 4831870976, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1086463928222656e-05, "ref": "layer_norm_ref"}, "err": null}
|
| 3 |
+
{"ts": "2025-10-28T14:09:00Z", "run": "c74bca3f7fd14f779c98e3d8b69c0098", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D4096", "batch": 16, "seq_len": 4096, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.6441269999631913, "p50": 1.6532669999378413, "p90": 1.6534970000066096, "mean": 1.6500411999913922, "iqr": 0.009149999982582813, "raw_times": [1.6441269999631913, 1.6534970000066096, 1.6532669999378413, 1.6443470000240268, 1.654968000025292], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.6665570000213847, "peak_bytes": 4831854592, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null}
|
| 4 |
+
{"ts": "2025-10-28T14:09:00Z", "run": "c74bca3f7fd14f779c98e3d8b69c0098", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D8192", "batch": 16, "seq_len": 4096, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 3.251962999911484, "p50": 3.270412999995642, "p90": 3.2735430000911947, "mean": 3.2660931999998866, "iqr": 0.01632000009976764, "raw_times": [3.2735430000911947, 3.251962999911484, 3.257222999991427, 3.277324000009685, 3.270412999995642], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.2640430000583365, "peak_bytes": 9663709184, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1026859283447266e-05, "ref": "layer_norm_ref"}, "err": null}
|
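The added records above follow the same JSONL schema as the other benchmark artifacts in this commit: one object per workload with impl, wl, env, lat_ms percentiles, peak_bytes, and an ok correctness flag. As a minimal sketch (the file path is an assumption for illustration, not a cell from this upload), they can be read back and summarized like this:

import json

# Sketch: load benchmark records in the JSONL schema shown above and print the
# median latency per workload. The path below is illustrative only.
path = "layer_norm/impls/artifacts/benchmark/layer_norm.jsonl"
with open(path) as f:
    records = [json.loads(line) for line in f if line.strip()]

for rec in records:
    impl = rec["impl"]            # e.g. "hf_kernels_layer_norm"
    wl = rec["wl"]["name"]        # e.g. "LN_B16_S2048_D4096"
    p50 = rec["lat_ms"]["p50"]    # median latency in milliseconds
    ok = rec["ok"]                # correctness check against the fp32 reference
    print(f"{impl:24s} {wl:20s} {p50:8.2f} {ok}")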
layer_norm/impls/hf_kernels_layer_norm.html
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
layer_norm/impls/torch_layer_norm.html
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
layer_norm/results/artifacts/combine/latency.svg
CHANGED
|
|
Git LFS Details
|
|
|
Git LFS Details
|
layer_norm/results/combined_results.html
CHANGED
|
@@ -3867,12 +3867,12 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3867 |
<h2>Combined Summary and Visualization</h2>
|
| 3868 |
<div class="artifact-preview">
|
| 3869 |
<?xml version='1.0' encoding='utf-8'?>
|
| 3870 |
-
<svg xmlns="http://www.w3.org/2000/svg" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:ns2="http://creativecommons.org/ns#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" width="
|
| 3871 |
<metadata>
|
| 3872 |
<rdf:RDF>
|
| 3873 |
<ns2:Work>
|
| 3874 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 3875 |
-
<dc:date>2025-10-
|
| 3876 |
<dc:format>image/svg+xml</dc:format>
|
| 3877 |
<dc:creator>
|
| 3878 |
<ns2:Agent>
|
|
@@ -3887,9 +3887,214 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3887 |
</defs>
|
| 3888 |
<g id="figure--latency" class="figure">
|
| 3889 |
<g id="patch_1">
|
| 3890 |
-
<path d="M 0
|
|
| 3891 |
</g>
|
| 3892 |
</g>
|
|
| 3893 |
</svg>
|
| 3894 |
</div>
|
| 3895 |
|
|
@@ -3900,7 +4105,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3900 |
<span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
|
| 3901 |
<span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3902 |
</span> |
|
| 3903 |
-
Cell: combine | 4.
|
| 3904 |
| <button class="run-btn" onclick="runCell('combine')">▶ run</button>
|
| 3905 |
<button class="copy-btn" onclick="copyCell('combine')">Copy</button>
|
| 3906 |
<a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3987,107 +4192,20 @@ Summary: 2 found, 0 skipped, 0 missing
|
|
| 3987 |
COMBINED BENCHMARK SUMMARY
|
| 3988 |
|
| 3989 |
impl wl p50(ms) ok
|
| 3990 |
-
hf_kernels_layer_norm
|
| 3991 |
-
hf_kernels_layer_norm
|
| 3992 |
-
hf_kernels_layer_norm
|
| 3993 |
-
hf_kernels_layer_norm
|
| 3994 |
-
|
| 3995 |
-
|
| 3996 |
-
|
| 3997 |
-
|
| 3998 |
-
hf_kernels_layer_norm LN_B16_S2048_D1024 0.21 False
|
| 3999 |
-
hf_kernels_layer_norm LN_B16_S2048_D2048 0.46 False
|
| 4000 |
-
hf_kernels_layer_norm LN_B16_S2048_D4096 0.84 False
|
| 4001 |
-
hf_kernels_layer_norm LN_B16_S2048_D8192 1.65 False
|
| 4002 |
-
hf_kernels_layer_norm LN_B16_S512_D1024 0.05 False
|
| 4003 |
-
hf_kernels_layer_norm LN_B16_S512_D2048 0.05 False
|
| 4004 |
-
hf_kernels_layer_norm LN_B16_S512_D4096 0.21 False
|
| 4005 |
-
hf_kernels_layer_norm LN_B16_S512_D8192 0.43 False
|
| 4006 |
-
hf_kernels_layer_norm LN_B1_S1024_D1024 0.05 False
|
| 4007 |
-
hf_kernels_layer_norm LN_B1_S1024_D2048 0.05 False
|
| 4008 |
-
hf_kernels_layer_norm LN_B1_S1024_D4096 0.05 False
|
| 4009 |
-
hf_kernels_layer_norm LN_B1_S1024_D8192 0.05 False
|
| 4010 |
-
hf_kernels_layer_norm LN_B1_S128_D1024 0.04 False
|
| 4011 |
-
hf_kernels_layer_norm LN_B1_S128_D2048 0.05 False
|
| 4012 |
-
hf_kernels_layer_norm LN_B1_S128_D4096 0.05 False
|
| 4013 |
-
hf_kernels_layer_norm LN_B1_S128_D8192 0.05 False
|
| 4014 |
-
hf_kernels_layer_norm LN_B1_S2048_D1024 0.05 False
|
| 4015 |
-
hf_kernels_layer_norm LN_B1_S2048_D2048 0.05 False
|
| 4016 |
-
hf_kernels_layer_norm LN_B1_S2048_D4096 0.05 False
|
| 4017 |
-
hf_kernels_layer_norm LN_B1_S2048_D8192 0.05 False
|
| 4018 |
-
hf_kernels_layer_norm LN_B1_S512_D1024 0.05 False
|
| 4019 |
-
hf_kernels_layer_norm LN_B1_S512_D2048 0.05 False
|
| 4020 |
-
hf_kernels_layer_norm LN_B1_S512_D4096 0.05 False
|
| 4021 |
-
hf_kernels_layer_norm LN_B1_S512_D8192 0.05 False
|
| 4022 |
-
hf_kernels_layer_norm LN_B4_S1024_D1024 0.05 False
|
| 4023 |
-
hf_kernels_layer_norm LN_B4_S1024_D2048 0.05 False
|
| 4024 |
-
hf_kernels_layer_norm LN_B4_S1024_D4096 0.05 False
|
| 4025 |
-
hf_kernels_layer_norm LN_B4_S1024_D8192 0.21 False
|
| 4026 |
-
hf_kernels_layer_norm LN_B4_S128_D1024 0.05 False
|
| 4027 |
-
hf_kernels_layer_norm LN_B4_S128_D2048 0.05 False
|
| 4028 |
-
hf_kernels_layer_norm LN_B4_S128_D4096 0.05 False
|
| 4029 |
-
hf_kernels_layer_norm LN_B4_S128_D8192 0.05 False
|
| 4030 |
-
hf_kernels_layer_norm LN_B4_S2048_D1024 0.05 False
|
| 4031 |
-
hf_kernels_layer_norm LN_B4_S2048_D2048 0.06 False
|
| 4032 |
-
hf_kernels_layer_norm LN_B4_S2048_D4096 0.21 False
|
| 4033 |
-
hf_kernels_layer_norm LN_B4_S2048_D8192 0.44 False
|
| 4034 |
-
hf_kernels_layer_norm LN_B4_S512_D1024 0.05 False
|
| 4035 |
-
hf_kernels_layer_norm LN_B4_S512_D2048 0.05 False
|
| 4036 |
-
hf_kernels_layer_norm LN_B4_S512_D4096 0.05 False
|
| 4037 |
-
hf_kernels_layer_norm LN_B4_S512_D8192 0.05 False
|
| 4038 |
-
torch_layer_norm LN_B16_S1024_D1024 0.05 False
|
| 4039 |
-
torch_layer_norm LN_B16_S1024_D2048 0.21 False
|
| 4040 |
-
torch_layer_norm LN_B16_S1024_D4096 0.42 False
|
| 4041 |
-
torch_layer_norm LN_B16_S1024_D8192 0.85 False
|
| 4042 |
-
torch_layer_norm LN_B16_S128_D1024 0.03 False
|
| 4043 |
-
torch_layer_norm LN_B16_S128_D2048 0.03 False
|
| 4044 |
-
torch_layer_norm LN_B16_S128_D4096 0.04 False
|
| 4045 |
-
torch_layer_norm LN_B16_S128_D8192 0.05 False
|
| 4046 |
-
torch_layer_norm LN_B16_S2048_D1024 0.21 False
|
| 4047 |
-
torch_layer_norm LN_B16_S2048_D2048 0.42 False
|
| 4048 |
-
torch_layer_norm LN_B16_S2048_D4096 0.82 False
|
| 4049 |
-
torch_layer_norm LN_B16_S2048_D8192 1.68 False
|
| 4050 |
-
torch_layer_norm LN_B16_S512_D1024 0.04 False
|
| 4051 |
-
torch_layer_norm LN_B16_S512_D2048 0.05 False
|
| 4052 |
-
torch_layer_norm LN_B16_S512_D4096 0.21 False
|
| 4053 |
-
torch_layer_norm LN_B16_S512_D8192 0.43 False
|
| 4054 |
-
torch_layer_norm LN_B1_S1024_D1024 0.03 False
|
| 4055 |
-
torch_layer_norm LN_B1_S1024_D2048 0.03 False
|
| 4056 |
-
torch_layer_norm LN_B1_S1024_D4096 0.03 False
|
| 4057 |
-
torch_layer_norm LN_B1_S1024_D8192 0.04 False
|
| 4058 |
-
torch_layer_norm LN_B1_S128_D1024 0.02 False
|
| 4059 |
-
torch_layer_norm LN_B1_S128_D2048 0.03 False
|
| 4060 |
-
torch_layer_norm LN_B1_S128_D4096 0.03 False
|
| 4061 |
-
torch_layer_norm LN_B1_S128_D8192 0.03 False
|
| 4062 |
-
torch_layer_norm LN_B1_S2048_D1024 0.03 False
|
| 4063 |
-
torch_layer_norm LN_B1_S2048_D2048 0.03 False
|
| 4064 |
-
torch_layer_norm LN_B1_S2048_D4096 0.04 False
|
| 4065 |
-
torch_layer_norm LN_B1_S2048_D8192 0.05 False
|
| 4066 |
-
torch_layer_norm LN_B1_S512_D1024 0.03 False
|
| 4067 |
-
torch_layer_norm LN_B1_S512_D2048 0.03 False
|
| 4068 |
-
torch_layer_norm LN_B1_S512_D4096 0.03 False
|
| 4069 |
-
torch_layer_norm LN_B1_S512_D8192 0.03 False
|
| 4070 |
-
torch_layer_norm LN_B4_S1024_D1024 0.03 False
|
| 4071 |
-
torch_layer_norm LN_B4_S1024_D2048 0.04 False
|
| 4072 |
-
torch_layer_norm LN_B4_S1024_D4096 0.05 False
|
| 4073 |
-
torch_layer_norm LN_B4_S1024_D8192 0.20 False
|
| 4074 |
-
torch_layer_norm LN_B4_S128_D1024 0.03 False
|
| 4075 |
-
torch_layer_norm LN_B4_S128_D2048 0.03 False
|
| 4076 |
-
torch_layer_norm LN_B4_S128_D4096 0.03 False
|
| 4077 |
-
torch_layer_norm LN_B4_S128_D8192 0.03 False
|
| 4078 |
-
torch_layer_norm LN_B4_S2048_D1024 0.04 False
|
| 4079 |
-
torch_layer_norm LN_B4_S2048_D2048 0.05 False
|
| 4080 |
-
torch_layer_norm LN_B4_S2048_D4096 0.21 False
|
| 4081 |
-
torch_layer_norm LN_B4_S2048_D8192 0.44 False
|
| 4082 |
-
torch_layer_norm LN_B4_S512_D1024 0.03 False
|
| 4083 |
-
torch_layer_norm LN_B4_S512_D2048 0.03 False
|
| 4084 |
-
torch_layer_norm LN_B4_S512_D4096 0.04 False
|
| 4085 |
-
torch_layer_norm LN_B4_S512_D8192 0.05 False
|
| 4086 |
|
| 4087 |
GENERATING COMBINED VISUALIZATION
|
| 4088 |
|
| 4089 |
-
Loaded
|
| 4090 |
-
|
|
|
|
| 4091 |
✓ Visualization saved as latency.svg
|
| 4092 |
✓ SVG visualization ready!
|
| 4093 |
|
|
@@ -4101,7 +4219,7 @@ Implementations included:
|
|
| 4101 |
<div class="uv-install-logs" id="uv-logs-combine">
|
| 4102 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4103 |
<div class="uv-logs-content" style="display: none;">
|
| 4104 |
-
Installed 37 packages in
|
| 4105 |
</div>
|
| 4106 |
</div>
|
| 4107 |
<div class="cell-artifacts">
|
|
@@ -4109,12 +4227,12 @@ Installed 37 packages in 260ms
|
|
| 4109 |
<a href="artifacts/combine/latency.svg" class="artifact" target="_blank">latency.svg</a>
|
| 4110 |
<div class="artifact-preview">
|
| 4111 |
<?xml version='1.0' encoding='utf-8'?>
|
| 4112 |
-
<svg xmlns="http://www.w3.org/2000/svg" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:ns2="http://creativecommons.org/ns#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" width="
|
| 4113 |
<metadata>
|
| 4114 |
<rdf:RDF>
|
| 4115 |
<ns2:Work>
|
| 4116 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 4117 |
-
<dc:date>2025-10-
|
| 4118 |
<dc:format>image/svg+xml</dc:format>
|
| 4119 |
<dc:creator>
|
| 4120 |
<ns2:Agent>
|
|
@@ -4129,9 +4247,214 @@ Installed 37 packages in 260ms
|
|
| 4129 |
</defs>
|
| 4130 |
<g id="figure--latency" class="figure">
|
| 4131 |
<g id="patch_1">
|
| 4132 |
-
<path d="M 0
|
|
| 4133 |
</g>
|
| 4134 |
</g>
|
|
| 4135 |
</svg>
|
| 4136 |
</div>
|
| 4137 |
</div>
|
|
|
|
| 3867 |
<h2>Combined Summary and Visualization</h2>
|
| 3868 |
<div class="artifact-preview">
|
| 3869 |
<?xml version='1.0' encoding='utf-8'?>
|
| 3870 |
+
<svg xmlns="http://www.w3.org/2000/svg" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:ns2="http://creativecommons.org/ns#" xmlns:ns4="http://www.w3.org/1999/xlink" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" width="864pt" height="576pt" viewBox="0 0 864 576" version="1.1">
|
| 3871 |
<metadata>
|
| 3872 |
<rdf:RDF>
|
| 3873 |
<ns2:Work>
|
| 3874 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 3875 |
+
<dc:date>2025-10-28T14:09:21.825978</dc:date>
|
| 3876 |
<dc:format>image/svg+xml</dc:format>
|
| 3877 |
<dc:creator>
|
| 3878 |
<ns2:Agent>
|
|
|
|
| 3887 |
</defs>
|
| 3888 |
<g id="figure--latency" class="figure">
|
| 3889 |
<g id="patch_1">
|
| 3890 |
+
<path d="M 0 576 L 864 576 L 864 0 L 0 0 L 0 576 z " style="fill: none" />
|
| 3891 |
+
</g>
|
| 3892 |
+
<g id="axes--1" class="axes">
|
| 3893 |
+
<g id="patch_2">
|
| 3894 |
+
<path d="M 47.72 457.251932 L 840.20233 457.251932 L 840.20233 26.88 L 47.72 26.88 L 47.72 457.251932 z " style="fill: none" />
|
| 3895 |
+
</g>
|
| 3896 |
+
<g id="matplotlib.axis_1">
|
| 3897 |
+
<g id="xtick_1">
|
| 3898 |
+
<g id="grid-x--1" class="grid grid-x">
|
| 3899 |
+
<path d="M 83.741924 457.251932 L 83.741924 26.88 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 3900 |
+
</g>
|
| 3901 |
+
<g id="line2d_1">
|
| 3902 |
+
<defs>
|
| 3903 |
+
<path id="mafb3703e5b" d="M 0 0 L 0 3.5 " style="stroke: #000000; stroke-width: 0.8" />
|
| 3904 |
+
</defs>
|
| 3905 |
+
<g>
|
| 3906 |
+
<use ns4:href="#mafb3703e5b" x="83.741924" y="457.251932" style="stroke: #000000; stroke-width: 0.8" />
|
| 3907 |
+
</g>
|
| 3908 |
+
</g>
|
| 3909 |
+
<g id="text_1">
|
| 3910 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(45.726648 549.111375) rotate(-45)">LN_B16_S2048_D4096</text>
|
| 3911 |
+
</g>
|
| 3912 |
+
</g>
|
| 3913 |
+
<g id="xtick_2">
|
| 3914 |
+
<g id="grid-x--2" class="grid grid-x">
|
| 3915 |
+
<path d="M 323.888085 457.251932 L 323.888085 26.88 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 3916 |
+
</g>
|
| 3917 |
+
<g id="line2d_2">
|
| 3918 |
+
<g>
|
| 3919 |
+
<use ns4:href="#mafb3703e5b" x="323.888085" y="457.251932" style="stroke: #000000; stroke-width: 0.8" />
|
| 3920 |
+
</g>
|
| 3921 |
+
</g>
|
| 3922 |
+
<g id="text_2">
|
| 3923 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(285.872809 549.111375) rotate(-45)">LN_B16_S2048_D8192</text>
|
| 3924 |
+
</g>
|
| 3925 |
+
</g>
|
| 3926 |
+
<g id="xtick_3">
|
| 3927 |
+
<g id="grid-x--3" class="grid grid-x">
|
| 3928 |
+
<path d="M 564.034245 457.251932 L 564.034245 26.88 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 3929 |
+
</g>
|
| 3930 |
+
<g id="line2d_3">
|
| 3931 |
+
<g>
|
| 3932 |
+
<use ns4:href="#mafb3703e5b" x="564.034245" y="457.251932" style="stroke: #000000; stroke-width: 0.8" />
|
| 3933 |
+
</g>
|
| 3934 |
+
</g>
|
| 3935 |
+
<g id="text_3">
|
| 3936 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(526.018969 549.111375) rotate(-45)">LN_B16_S4096_D4096</text>
|
| 3937 |
+
</g>
|
| 3938 |
+
</g>
|
| 3939 |
+
<g id="xtick_4">
|
| 3940 |
+
<g id="grid-x--4" class="grid grid-x">
|
| 3941 |
+
<path d="M 804.180406 457.251932 L 804.180406 26.88 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 3942 |
+
</g>
|
| 3943 |
+
<g id="line2d_4">
|
| 3944 |
+
<g>
|
| 3945 |
+
<use ns4:href="#mafb3703e5b" x="804.180406" y="457.251932" style="stroke: #000000; stroke-width: 0.8" />
|
| 3946 |
+
</g>
|
| 3947 |
+
</g>
|
| 3948 |
+
<g id="text_4">
|
| 3949 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(766.16513 549.111375) rotate(-45)">LN_B16_S4096_D8192</text>
|
| 3950 |
+
</g>
|
| 3951 |
+
</g>
|
| 3952 |
+
<g id="label--x" class="xlabel">
|
| 3953 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="443.961165" y="562.377038" transform="rotate(-0 443.961165 562.377038)">Workload</text>
|
| 3954 |
+
</g>
|
| 3955 |
+
</g>
|
| 3956 |
+
<g id="matplotlib.axis_2">
|
| 3957 |
+
<g id="ytick_1">
|
| 3958 |
+
<g id="grid-y--2" class="grid grid-y">
|
| 3959 |
+
<path d="M 47.72 409.029804 L 840.20233 409.029804 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 3960 |
+
</g>
|
| 3961 |
+
<g id="line2d_5">
|
| 3962 |
+
<defs>
|
| 3963 |
+
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 3964 |
+
</defs>
|
| 3965 |
+
<g>
|
| 3966 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="409.029804" style="stroke: #000000; stroke-width: 0.8" />
|
| 3967 |
+
</g>
|
| 3968 |
+
</g>
|
| 3969 |
+
<g id="text_5">
|
| 3970 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="412.829023" transform="rotate(-0 40.72 412.829023)">1.0</text>
|
| 3971 |
+
</g>
|
| 3972 |
+
</g>
|
| 3973 |
+
<g id="ytick_2">
|
| 3974 |
+
<g id="grid-y--3" class="grid grid-y">
|
| 3975 |
+
<path d="M 47.72 331.290271 L 840.20233 331.290271 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 3976 |
+
</g>
|
| 3977 |
+
<g id="line2d_6">
|
| 3978 |
+
<g>
|
| 3979 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="331.290271" style="stroke: #000000; stroke-width: 0.8" />
|
| 3980 |
+
</g>
|
| 3981 |
+
</g>
|
| 3982 |
+
<g id="text_6">
|
| 3983 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="335.08949" transform="rotate(-0 40.72 335.08949)">1.5</text>
|
| 3984 |
+
</g>
|
| 3985 |
+
</g>
|
| 3986 |
+
<g id="ytick_3">
|
| 3987 |
+
<g id="grid-y--4" class="grid grid-y">
|
| 3988 |
+
<path d="M 47.72 253.550738 L 840.20233 253.550738 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 3989 |
+
</g>
|
| 3990 |
+
<g id="line2d_7">
|
| 3991 |
+
<g>
|
| 3992 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="253.550738" style="stroke: #000000; stroke-width: 0.8" />
|
| 3993 |
+
</g>
|
| 3994 |
+
</g>
|
| 3995 |
+
<g id="text_7">
|
| 3996 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="257.349957" transform="rotate(-0 40.72 257.349957)">2.0</text>
|
| 3997 |
+
</g>
|
| 3998 |
+
</g>
|
| 3999 |
+
<g id="ytick_4">
|
| 4000 |
+
<g id="grid-y--5" class="grid grid-y">
|
| 4001 |
+
<path d="M 47.72 175.811205 L 840.20233 175.811205 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4002 |
+
</g>
|
| 4003 |
+
<g id="line2d_8">
|
| 4004 |
+
<g>
|
| 4005 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="175.811205" style="stroke: #000000; stroke-width: 0.8" />
|
| 4006 |
+
</g>
|
| 4007 |
+
</g>
|
| 4008 |
+
<g id="text_8">
|
| 4009 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="179.610424" transform="rotate(-0 40.72 179.610424)">2.5</text>
|
| 4010 |
+
</g>
|
| 4011 |
+
</g>
|
| 4012 |
+
<g id="ytick_5">
|
| 4013 |
+
<g id="grid-y--6" class="grid grid-y">
|
| 4014 |
+
<path d="M 47.72 98.071672 L 840.20233 98.071672 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4015 |
+
</g>
|
| 4016 |
+
<g id="line2d_9">
|
| 4017 |
+
<g>
|
| 4018 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="98.071672" style="stroke: #000000; stroke-width: 0.8" />
|
| 4019 |
+
</g>
|
| 4020 |
+
</g>
|
| 4021 |
+
<g id="text_9">
|
| 4022 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="101.870891" transform="rotate(-0 40.72 101.870891)">3.0</text>
|
| 4023 |
+
</g>
|
| 4024 |
+
</g>
|
| 4025 |
+
<g id="label--y" class="ylabel">
|
| 4026 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="18.737188" y="242.065966" transform="rotate(-90 18.737188 242.065966)">Latency P50 (ms)</text>
|
| 4027 |
+
</g>
|
| 4028 |
+
</g>
|
| 4029 |
+
<g id="series--torch-layer-norm" class="series">
|
| 4030 |
+
<path d="M 83.741924 437.689571 L 323.888085 303.094453 L 564.034245 314.534914 L 804.180406 46.442361 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 4031 |
+
<defs>
|
| 4032 |
+
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4033 |
+
</defs>
|
| 4034 |
+
<g clip-path="url(#p2214f54723)">
|
| 4035 |
+
<use ns4:href="#md7efaf3aec" x="83.741924" y="437.689571" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4036 |
+
<use ns4:href="#md7efaf3aec" x="323.888085" y="303.094453" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4037 |
+
<use ns4:href="#md7efaf3aec" x="564.034245" y="314.534914" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4038 |
+
<use ns4:href="#md7efaf3aec" x="804.180406" y="46.442361" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4039 |
+
</g>
|
| 4040 |
+
</g>
|
| 4041 |
+
<g id="series--hf-kernels-layer-norm" class="series">
|
| 4042 |
+
<path d="M 83.741924 434.514533 L 323.888085 307.713737 L 564.034245 307.460461 L 804.180406 56.028111 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 4043 |
+
<defs>
|
| 4044 |
+
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4045 |
+
</defs>
|
| 4046 |
+
<g clip-path="url(#p2214f54723)">
|
| 4047 |
+
<use ns4:href="#m9b8c54d372" x="83.741924" y="434.514533" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4048 |
+
<use ns4:href="#m9b8c54d372" x="323.888085" y="307.713737" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4049 |
+
<use ns4:href="#m9b8c54d372" x="564.034245" y="307.460461" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4050 |
+
<use ns4:href="#m9b8c54d372" x="804.180406" y="56.028111" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4051 |
+
</g>
|
| 4052 |
+
</g>
|
| 4053 |
+
<g id="patch_3">
|
| 4054 |
+
<path d="M 47.72 457.251932 L 47.72 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
|
| 4055 |
+
</g>
|
| 4056 |
+
<g id="patch_4">
|
| 4057 |
+
<path d="M 840.20233 457.251932 L 840.20233 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
|
| 4058 |
+
</g>
|
| 4059 |
+
<g id="patch_5">
|
| 4060 |
+
<path d="M 47.72 457.251932 L 840.20233 457.251932 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
|
| 4061 |
+
</g>
|
| 4062 |
+
<g id="patch_6">
|
| 4063 |
+
<path d="M 47.72 26.88 L 840.20233 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
|
| 4064 |
+
</g>
|
| 4065 |
+
<g id="text_10">
|
| 4066 |
+
<text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="443.961165" y="20.88" transform="rotate(-0 443.961165 20.88)">Attention Implementation Latency</text>
|
| 4067 |
+
</g>
|
| 4068 |
+
<g id="legend" class="legend">
|
| 4069 |
+
<g id="patch_7">
|
| 4070 |
+
<path d="M 54.72 64.7925 L 198.795 64.7925 Q 200.795 64.7925 200.795 62.7925 L 200.795 33.88 Q 200.795 31.88 198.795 31.88 L 54.72 31.88 Q 52.72 31.88 52.72 33.88 L 52.72 62.7925 Q 52.72 64.7925 54.72 64.7925 L 54.72 64.7925 z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
|
| 4071 |
+
</g>
|
| 4072 |
+
<g id="line2d_10">
|
| 4073 |
+
<path d="M 56.72 39.978438 L 66.72 39.978438 L 76.72 39.978438 " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 4074 |
+
<g>
|
| 4075 |
+
<use ns4:href="#md7efaf3aec" x="66.72" y="39.978438" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4076 |
+
</g>
|
| 4077 |
+
</g>
|
| 4078 |
+
<g id="legend-label--torch-layer-norm" class="legend">
|
| 4079 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="84.72" y="43.478438" transform="rotate(-0 84.72 43.478438)">torch_layer_norm</text>
|
| 4080 |
+
</g>
|
| 4081 |
+
<g id="line2d_11">
|
| 4082 |
+
<path d="M 56.72 54.934687 L 66.72 54.934687 L 76.72 54.934687 " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 4083 |
+
<g>
|
| 4084 |
+
<use ns4:href="#m9b8c54d372" x="66.72" y="54.934687" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4085 |
+
</g>
|
| 4086 |
+
</g>
|
| 4087 |
+
<g id="legend-label--hf-kernels-layer-norm" class="legend">
|
| 4088 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="84.72" y="58.434687" transform="rotate(-0 84.72 58.434687)">hf_kernels_layer_norm</text>
|
| 4089 |
+
</g>
|
| 4090 |
+
</g>
|
| 4091 |
</g>
|
| 4092 |
</g>
|
| 4093 |
+
<defs>
|
| 4094 |
+
<clipPath id="p2214f54723">
|
| 4095 |
+
<rect x="47.72" y="26.88" width="792.48233" height="430.371932" />
|
| 4096 |
+
</clipPath>
|
| 4097 |
+
</defs>
|
| 4098 |
</svg>
|
| 4099 |
</div>
|
| 4100 |
|
|
|
|
(cell header: ▼ output / ▶ uv-logs toggles; Cell: combine | 4.25s; ▶ run, Copy, and Raw buttons)
COMBINED BENCHMARK SUMMARY

impl                   wl                  p50(ms)  ok
hf_kernels_layer_norm  LN_B16_S2048_D4096  0.84     True
hf_kernels_layer_norm  LN_B16_S2048_D8192  1.65     True
hf_kernels_layer_norm  LN_B16_S4096_D4096  1.65     True
hf_kernels_layer_norm  LN_B16_S4096_D8192  3.27     True
torch_layer_norm       LN_B16_S2048_D4096  0.82     True
torch_layer_norm       LN_B16_S2048_D8192  1.68     True
torch_layer_norm       LN_B16_S4096_D4096  1.61     True
torch_layer_norm       LN_B16_S4096_D8192  3.33     True
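Read side by side, the two implementations land within roughly 2-3% of each other on every workload. A small sketch (p50 values hard-coded from the table above) that makes the ratio explicit:

# p50 latencies in ms, copied from the summary table above.
p50 = {
    "LN_B16_S2048_D4096": {"hf_kernels": 0.84, "torch": 0.82},
    "LN_B16_S2048_D8192": {"hf_kernels": 1.65, "torch": 1.68},
    "LN_B16_S4096_D4096": {"hf_kernels": 1.65, "torch": 1.61},
    "LN_B16_S4096_D8192": {"hf_kernels": 3.27, "torch": 3.33},
}
for wl, v in p50.items():
    print(f"{wl}: hf/torch = {v['hf_kernels'] / v['torch']:.3f}")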
GENERATING COMBINED VISUALIZATION

Loaded 8 records
✓ Visualization saved as latency.svg
Saved latency.png
✓ Visualization saved as latency.svg
✓ SVG visualization ready!
▶ UV Install Logs
Installed 37 packages in 219ms

Artifacts: latency.svg
(artifact preview omitted: inline copy of the latency.svg chart — a matplotlib SVG dated 2025-10-28T14:09:21 with x-axis "Workload" over the four LN_* workloads, y-axis "Latency P50 (ms)", series torch_layer_norm and hf_kernels_layer_norm, and the title "Attention Implementation Latency")
rotary/impls/artifacts/benchmark/rotary.jsonl
ADDED
@@ -0,0 +1,24 @@
{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H8_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.1724160000549091, "p50": 0.17308600001797458, "p90": 0.1756759999125279, "mean": 0.1760500000045795, "iqr": 0.0032199998258874984, "raw_times": [0.17245600008664042, 0.1756759999125279, 0.1724160000549091, 0.17308600001797458, 0.18661599995084543], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.17975699995531613, "peak_bytes": 1720320, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H8_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.22642799990535423, "p50": 0.2294280000114668, "p90": 0.23093799995876907, "mean": 0.23135619996992318, "iqr": 0.0026599999500831473, "raw_times": [0.23093799995876907, 0.22642799990535423, 0.22827800000868592, 0.2417089999653399, 0.2294280000114668], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.23494799995660287, "peak_bytes": 3440640, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H32_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21688800006813835, "p50": 0.21992799997860857, "p90": 0.2219079999576934, "mean": 0.22172000001319248, "iqr": 0.004439999884198187, "raw_times": [0.2174680000734952, 0.2219079999576934, 0.21688800006813835, 0.23240799998802686, 0.21992799997860857], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.225418000013633, "peak_bytes": 6832128, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H32_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21487700007583044, "p50": 0.21964699999443837, "p90": 0.22132800006602338, "mean": 0.21978760003094067, "iqr": 0.005100000066704524, "raw_times": [0.21487700007583044, 0.21622799999931885, 0.21964699999443837, 0.22132800006602338, 0.2268580000190923], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.24882799993974913, "peak_bytes": 13664256, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H8_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21332699998311, "p50": 0.21615699995436444, "p90": 0.21744800005762954, "mean": 0.21590960000139603, "iqr": 0.0025000000505315256, "raw_times": [0.21332699998311, 0.21744800005762954, 0.21494800000709802, 0.21766800000477815, 0.21615699995436444], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22001800005000405, "peak_bytes": 6881280, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H8_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21038799991401902, "p50": 0.21561700009442575, "p90": 0.21720800009461527, "mean": 0.22098599999935686, "iqr": 0.004100000182916119, "raw_times": [0.21038799991401902, 0.21720800009461527, 0.21561700009442575, 0.24860899998202513, 0.21310799991169915], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2178580000418151, "peak_bytes": 13762560, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H32_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21404700009952649, "p50": 0.21557699994900759, "p90": 0.2158679999411106, "mean": 0.2152116000161186, "iqr": 0.0011999999287581886, "raw_times": [0.2158679999411106, 0.21589800007859594, 0.21404700009952649, 0.21466800001235242, 0.21557699994900759], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21567799990407366, "peak_bytes": 27328512, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H32_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21701799994389148, "p50": 0.21822700000484474, "p90": 0.22002800005793688, "mean": 0.2237478000097326, "iqr": 0.002031000008173578, "raw_times": [0.22002800005793688, 0.2179970000497633, 0.2454689999922266, 0.21822700000484474, 0.21701799994389148], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22291799996310147, "peak_bytes": 54657024, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H8_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21712800003115262, "p50": 0.21885700004986575, "p90": 0.2196080000658185, "mean": 0.22401780001928273, "iqr": 0.001630000042496249, "raw_times": [0.21797800002332224, 0.2196080000658185, 0.24651799992625456, 0.21885700004986575, 0.21712800003115262], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2206780000051367, "peak_bytes": 27525120, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H8_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21462800009430794, "p50": 0.21782799990432977, "p90": 0.21795700001803198, "mean": 0.21911359999648994, "iqr": 0.0030300000162242213, "raw_times": [0.21462800009430794, 0.23022799996397225, 0.21782799990432977, 0.21492700000180776, 0.21795700001803198], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2186980000260519, "peak_bytes": 55050240, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H32_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21579799999926763, "p50": 0.21701699995446688, "p90": 0.22130799993647088, "mean": 0.2237457999626713, "iqr": 0.004450000005817856, "raw_times": [0.21701699995446688, 0.22130799993647088, 0.21579799999926763, 0.24774799999249808, 0.21685799993065302], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22235700009787251, "peak_bytes": 109314048, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H32_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.22434800007431477, "p50": 0.2248280000003433, "p90": 0.22490799995011912, "mean": 0.22479799997654482, "iqr": 0.00031000001854408765, "raw_times": [0.2248280000003433, 0.22490799995011912, 0.22459799993157503, 0.22434800007431477, 0.22530799992637185], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.23522799995134847, "peak_bytes": 218628096, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H8_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21574699997017888, "p50": 0.21802799994929956, "p90": 0.21904799996264046, "mean": 0.22033179998288688, "iqr": 0.0018999999156221747, "raw_times": [0.21714800004701829, 0.21802799994929956, 0.2316879999852972, 0.21904799996264046, 0.21574699997017888], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22168800001054478, "peak_bytes": 68698112, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H8_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21753800001533818, "p50": 0.21888800006308884, "p90": 0.22129700005280029, "mean": 0.22190180004599824, "iqr": 0.003358999947522534, "raw_times": [0.21753800001533818, 0.23384799999348616, 0.21793800010527775, 0.21888800006308884, 0.22129700005280029], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22266799999215436, "peak_bytes": 6848512, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H32_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2166670000178783, "p50": 0.21850699999959033, "p90": 0.21964699999443837, "mean": 0.21864339998955984, "iqr": 0.001419000000169035, "raw_times": [0.21850699999959033, 0.2166670000178783, 0.22016799994162284, 0.21822799999426934, 0.21964699999443837], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.23552799996195972, "peak_bytes": 13647872, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H32_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21624800001518452, "p50": 0.21773700007088337, "p90": 0.21802799994929956, "mean": 0.21774760000425886, "iqr": 0.0013409999155555852, "raw_times": [0.21668700003374397, 0.21773700007088337, 0.22003799995218287, 0.21624800001518452, 0.21802799994929956], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2226780000000872, "peak_bytes": 27295744, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H8_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21748699998624943, "p50": 0.22014700005001941, "p90": 0.22206799997093185, "mean": 0.22232159999475698, "iqr": 0.0019999999949504854, "raw_times": [0.22014700005001941, 0.23183799999060284, 0.22206799997093185, 0.21748699998624943, 0.22006799997598137], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22040800001832395, "peak_bytes": 13697024, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H8_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21601800006010308, "p50": 0.21957800004202, "p90": 0.22023799999715266, "mean": 0.2213318000030995, "iqr": 0.0024510000002919696, "raw_times": [0.23303799991936103, 0.21601800006010308, 0.2177869999968607, 0.21957800004202, 0.22023799999715266], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.220787999978711, "peak_bytes": 27394048, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H32_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21692799998618284, "p50": 0.22003699996275827, "p90": 0.2230679999684071, "mean": 0.222287800011145, "iqr": 0.0031599998919773498, "raw_times": [0.21692799998618284, 0.21990800007642974, 0.2314980000619471, 0.2230679999684071, 0.22003699996275827], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22102700006598752, "peak_bytes": 54591488, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H32_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2160679999860804, "p50": 0.21972700005790102, "p90": 0.22029800004474964, "mean": 0.21970960001453932, "iqr": 0.0024610000082248007, "raw_times": [0.2160679999860804, 0.2246179999474407, 0.22029800004474964, 0.21972700005790102, 0.21783700003652484], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22191799996562622, "peak_bytes": 109182976, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H8_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2172279999967941, "p50": 0.21847799996521644, "p90": 0.22105800007921061, "mean": 0.22193580000475777, "iqr": 0.0035110000453641987, "raw_times": [0.21847799996521644, 0.22105800007921061, 0.23536799994872126, 0.21754700003384642, 0.2172279999967941], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22206799997093185, "peak_bytes": 54788096, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H8_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21436800000174117, "p50": 0.21785799992812827, "p90": 0.2195579999124675, "mean": 0.2202379999744153, "iqr": 0.0030299999025373836, "raw_times": [0.21436800000174117, 0.21785799992812827, 0.2195579999124675, 0.2165280000099301, 0.2328780000198094], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.25353900002755836, "peak_bytes": 109576192, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H32_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.22968799999034673, "p50": 0.23015800002212927, "p90": 0.23064800006977748, "mean": 0.23369620002995362, "iqr": 0.0006600000688194996, "raw_times": [0.23015800002212927, 0.24799900006655662, 0.22968799999034673, 0.22998800000095798, 0.23064800006977748], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.23042800000894204, "peak_bytes": 218365952, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H32_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.6347319999804313, "p50": 0.6375930000785957, "p90": 0.639283000055002, "mean": 0.6376124000325945, "iqr": 0.003270999968663091, "raw_times": [0.6375930000785957, 0.636012000086339, 0.6404419999626043, 0.6347319999804313, 0.639283000055002], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.639422999938688, "peak_bytes": 436731904, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
rotary/impls/cells/benchmark.py
ADDED
@@ -0,0 +1,57 @@
# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "numpy",
#     "torch==2.8.0",
#     "kernels-benchmark-tools",
# ]
#
# [tool.uv.sources]
# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
# ///
import torch
import sys
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark


def apply_rotary_torch(x1, x2, cos, sin, conj=False):
    """Reference rotary implementation."""
    if not conj:
        out1 = x1 * cos - x2 * sin
        out2 = x1 * sin + x2 * cos
    else:
        out1 = x1 * cos + x2 * sin
        out2 = -x1 * sin + x2 * cos
    return out1, out2


def torch_rotary(query, key, cos, sin, conj=False):
    rotary_dim = cos.shape[-1]

    # Clone inputs to avoid modifying them
    q_out = query.clone()
    k_out = key.clone()

    # Apply rotation to query
    q1 = q_out[..., :rotary_dim]
    q2 = q_out[..., rotary_dim : 2 * rotary_dim]
    q_out_1, q_out_2 = apply_rotary_torch(q1, q2, cos, sin, conj)
    q_out[..., :rotary_dim] = q_out_1
    q_out[..., rotary_dim : 2 * rotary_dim] = q_out_2

    # Apply rotation to key
    k1 = k_out[..., :rotary_dim]
    k2 = k_out[..., rotary_dim : 2 * rotary_dim]
    k_out_1, k_out_2 = apply_rotary_torch(k1, k2, cos, sin, conj)
    k_out[..., :rotary_dim] = k_out_1
    k_out[..., rotary_dim : 2 * rotary_dim] = k_out_2

    return q_out, k_out


run_benchmark(
    kernel_type=KernelTypeEnum.ROTARY,
    impl_name="torch_eager",
    impl_tags={"family": "pytorch", "backend": "eager"},
    impl_func=torch_rotary,
)
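A minimal sketch of calling the reference implementation above on dummy tensors. The (batch, seqlen, heads, head_dim) layout and the cos/sin shapes here are assumptions for illustration; the real inputs are supplied by the kernels-benchmark-tools harness:

import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
B, S, H, D, R = 1, 128, 8, 64, 32          # mirrors the cuda_B1_S128_H8_D64_R32 workload
q = torch.randn(B, S, H, D, dtype=torch.bfloat16, device=device)
k = torch.randn(B, S, H, D, dtype=torch.bfloat16, device=device)

# One rotation angle per (position, rotary index); broadcasts over batch and heads.
pos = torch.arange(S, device=device, dtype=torch.float32)
inv_freq = 1.0 / (10000.0 ** (torch.arange(R, device=device, dtype=torch.float32) / R))
angles = torch.outer(pos, inv_freq)                 # (S, R)
cos = angles.cos().to(torch.bfloat16)[:, None, :]   # (S, 1, R)
sin = angles.sin().to(torch.bfloat16)[:, None, :]

q_rot, k_rot = torch_rotary(q, k, cos, sin)
print(q_rot.shape, k_rot.shape)                     # both (1, 128, 8, 64)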
rotary/impls/cells/nv.py
ADDED
@@ -0,0 +1,2 @@
import subprocess
print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
rotary/impls/hf_kernels_rotary.html
ADDED
The diff for this file is too large to render. See raw diff.
rotary/impls/index.html
ADDED
@@ -0,0 +1,89 @@
<!DOCTYPE html>
<html>
<head>
<meta charset='UTF-8'>
<meta name='viewport' content='width=device-width, initial-scale=1.0'>
<title>Index of /rotary/impls</title>
<style>
:root {
    --bg-primary: #0a0a0a;
    --bg-secondary: #121212;
    --bg-tertiary: #181818;
    --text-primary: #e0e0e0;
    --text-secondary: #888888;
    --text-link: #64b5f6;
    --border-primary: #2a2a2a;
}
body {
    font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif;
    background: var(--bg-primary);
    color: var(--text-primary);
    margin: 0;
    padding: 16px;
    max-width: 900px;
    margin: 0 auto;
}
.controls {
    display: flex;
    justify-content: flex-end;
    margin-bottom: 1rem;
}
.back-button {
    background: var(--bg-secondary);
    border: 1px solid var(--border-primary);
    padding: 8px 12px;
    border-radius: 4px;
    color: var(--text-secondary);
    cursor: pointer;
    font-size: 0.9rem;
    text-decoration: none;
    display: inline-block;
}
.back-button:hover {
    color: var(--text-primary);
    background: var(--bg-tertiary);
}
h1 {
    font-size: 1.5em;
    margin: 1rem 0;
    color: var(--text-primary);
    border-bottom: 1px solid var(--border-primary);
    padding-bottom: 0.5rem;
}
ul {
    list-style-type: none;
    padding: 0;
}
li {
    margin: 0;
    border-bottom: 1px solid var(--border-primary);
}
li:last-child {
    border-bottom: none;
}
a {
    display: block;
    padding: 0.75rem 0.5rem;
    text-decoration: none;
    color: var(--text-link);
    transition: background 0.2s ease;
}
a:hover {
    background: var(--bg-secondary);
}
.dir {
    font-weight: 500;
}
</style>
</head>
<body>
<div class='controls'>
    <a href='../index.html' class='back-button'>← back</a>
</div>
<h1>Index of /rotary/impls</h1>
<ul>
    <li><a href='hf_kernels_rotary.html' class='file'>hf_kernels_rotary.html</a></li>
    <li><a href='torch_rotary.html' class='file'>torch_rotary.html</a></li>
</ul>
</body>
</html>
rotary/impls/torch_rotary.html
ADDED
The diff for this file is too large to render. See raw diff.
rotary/index.html
ADDED
@@ -0,0 +1,89 @@
<!DOCTYPE html>
<html>
<head>
<meta charset='UTF-8'>
<meta name='viewport' content='width=device-width, initial-scale=1.0'>
<title>Index of /rotary</title>
<style>
/* identical style rules to rotary/impls/index.html above
   (:root, body, .controls, .back-button, h1, ul, li, a, .dir) */
</style>
</head>
<body>
<div class='controls'>
    <a href='../index.html' class='back-button'>← back</a>
</div>
<h1>Index of /rotary</h1>
<ul>
    <li><a href='impls/index.html' class='dir'>impls/</a></li>
    <li><a href='results/index.html' class='dir'>results/</a></li>
</ul>
</body>
</html>
rotary/results/artifacts/combine/latency.svg
ADDED
Git LFS Details
rotary/results/cells/combine.py
ADDED
@@ -0,0 +1,26 @@
# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "numpy",
#     "torch==2.8.0",
#     "kernels-benchmark-tools",
#     "matplotlib",
# ]
#
# [tool.uv.sources]
# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
# ///
from kernels_benchmark_tools.core.visuals import generate_combined_results

# Map display names to uvnote environment variables
cache_env_map = {
    "HF Kernels Rotary": "UVNOTE_FILE_HF_KERNELS_ROTARY_BENCHMARK",
    "PyTorch Rotary": "UVNOTE_FILE_TORCH_ROTARY_BENCHMARK",
}

# Generate combined results with visualization
generate_combined_results(
    cache_env_map=cache_env_map,
    output_filename="rotary.jsonl",
    svg_filename="latency.svg"
)
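The cache_env_map values are uvnote-provided environment variables that point at each implementation's cached benchmark output. A sketch of driving the cell by hand (the paths are purely illustrative; uvnote normally sets these variables itself):

import os, subprocess

# Illustrative only: point each UVNOTE_FILE_* variable at a cached benchmark output,
# then run the combine cell with uv so it can merge the rotary.jsonl records.
os.environ["UVNOTE_FILE_HF_KERNELS_ROTARY_BENCHMARK"] = "/path/to/cache/hf_kernels_rotary_benchmark"
os.environ["UVNOTE_FILE_TORCH_ROTARY_BENCHMARK"] = "/path/to/cache/torch_rotary_benchmark"
subprocess.run(["uv", "run", "cells/combine.py"], check=True)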
rotary/results/combined_results.html
ADDED
The diff for this file is too large to render. See raw diff.
rotary/results/index.html
ADDED
@@ -0,0 +1,88 @@
<!DOCTYPE html>
<html>
<head>
<meta charset='UTF-8'>
<meta name='viewport' content='width=device-width, initial-scale=1.0'>
<title>Index of /rotary/results</title>
<style>
/* identical style rules to rotary/impls/index.html above
   (:root, body, .controls, .back-button, h1, ul, li, a, .dir) */
</style>
</head>
<body>
<div class='controls'>
    <a href='../index.html' class='back-button'>← back</a>
</div>
<h1>Index of /rotary/results</h1>
<ul>
    <li><a href='combined_results.html' class='file'>combined_results.html</a></li>
</ul>
</body>
</html>