Upload folder using huggingface_hub
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- activation/impls/artifacts/benchmark/activation.jsonl +9 -9
- activation/impls/cells/benchmark.py +2 -2
- activation/impls/cells/sysinfo.py +14 -0
- activation/impls/hf_kernels_swiglu.html +96 -95
- activation/impls/index.html +1 -2
- activation/impls/torch_swiglu.html +120 -120
- activation/impls/torch_swiglu_darwin.html +0 -0
- activation/index.html +1 -1
- activation/results_darwin/artifacts/combine/latency.svg +3 -0
- activation/results_darwin/cells/combine.py +25 -0
- activation/results_darwin/combined_results.html +0 -0
- activation/results_darwin/index.html +88 -0
- activation/results_linux/artifacts/combine/latency.svg +2 -2
- activation/results_linux/combined_results.html +85 -111
- causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl +24 -24
- causal_conv1d/impls/hf_kernels_causal_conv1d.html +0 -0
- causal_conv1d/impls/torch_causal_conv1d.html +0 -0
- causal_conv1d/results/artifacts/combine/latency.svg +2 -2
- causal_conv1d/results/combined_results.html +131 -131
- deformable_detr/impls/artifacts/benchmark/deformable_detr.jsonl +4 -4
- deformable_detr/impls/cells/benchmark.py +18 -94
- deformable_detr/impls/hf_kernels_deformable_detr.html +78 -78
- deformable_detr/impls/torch_deformable_detr.html +103 -97
- deformable_detr/results/artifacts/combine/latency.svg +2 -2
- deformable_detr/results/combined_results.html +56 -56
- flash_attn/impls/artifacts/benchmark/attention.jsonl +6 -6
- flash_attn/impls/cells/benchmark.py +8 -9
- flash_attn/impls/flash_attention.html +140 -140
- flash_attn/impls/hf_kernels_flash_attn.html +93 -93
- flash_attn/impls/hf_kernels_flash_attn3.html +82 -83
- flash_attn/impls/mem_efficient_attention.html +134 -140
- flash_attn/impls/sage_attention.html +10 -12
- flash_attn/impls/xformers.html +90 -90
- flash_attn/results/artifacts/combine/latency.svg +2 -2
- flash_attn/results/combined_results.html +143 -143
- index.html +1 -1
- layer_norm/impls/artifacts/benchmark/layer_norm.jsonl +4 -4
- layer_norm/impls/hf_kernels_layer_norm.html +54 -57
- layer_norm/impls/torch_layer_norm.html +54 -100
- layer_norm/results/artifacts/combine/latency.svg +2 -2
- layer_norm/results/combined_results.html +52 -52
- openai_moe/impls/artifacts/benchmark/openai_moe.jsonl +8 -8
- openai_moe/impls/binned_torch.html +189 -189
- openai_moe/impls/gpt_oss_moe.html +191 -191
- openai_moe/results/artifacts/combine/latency.svg +2 -2
- openai_moe/results/combined_results.html +240 -188
- rotary/impls/artifacts/benchmark/rotary.jsonl +24 -24
- rotary/impls/hf_kernels_rotary.html +0 -0
- rotary/impls/torch_rotary.html +0 -0
- rotary/index.html +1 -1
activation/impls/artifacts/benchmark/activation.jsonl
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
-
{"ts": "2025-12-
|
| 2 |
-
{"ts": "2025-12-
|
| 3 |
-
{"ts": "2025-12-
|
| 4 |
-
{"ts": "2025-12-
|
| 5 |
-
{"ts": "2025-12-
|
| 6 |
-
{"ts": "2025-12-
|
| 7 |
-
{"ts": "2025-12-
|
| 8 |
-
{"ts": "2025-12-
|
| 9 |
-
{"ts": "2025-12-
|
|
|
|
| 1 |
+
{"ts": "2025-12-19T22:43:48Z", "run": "4f266502732344bb9d0a3cd9527f873a", "impl": "torch_eager_darwin", "tags": {"family": "pytorch", "backend": "eager", "platform": "darwin"}, "wl": {"name": "cpu_T128_D768", "num_tokens": 128, "hidden_dim": 768, "dtype": "float32", "device": "cpu", "seed": 0}, "env": {"torch": "2.8.0", "cuda": "", "gpu": "", "sm": "", "py": "3.11.14", "plat": "macOS-15.7.2-arm64-arm-64bit"}, "lat_ms": {"p10": 0.13450000000148066, "p50": 0.1411669999811238, "p90": 0.1532919999931437, "mean": 0.1477000000022599, "iqr": 0.017083999978240172, "raw_times": [0.13620800001490352, 0.1733330000206479, 0.1532919999931437, 0.1411669999811238, 0.13450000000148066], "has_warnings": true, "reps": 5, "warmup": 2}, "compile_ms": 0.1447500000040236, "peak_bytes": null, "ok": false, "absmax": 0.04913330078125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.04913330078125, "mae": 0.0008915023063309491, "mse": 4.496400833886582e-06, "ref": "swiglu_fp32"}, "err": null}
|
| 2 |
+
{"ts": "2025-12-19T22:43:48Z", "run": "4f266502732344bb9d0a3cd9527f873a", "impl": "torch_eager_darwin", "tags": {"family": "pytorch", "backend": "eager", "platform": "darwin"}, "wl": {"name": "cpu_T128_D1024", "num_tokens": 128, "hidden_dim": 1024, "dtype": "float32", "device": "cpu", "seed": 0}, "env": {"torch": "2.8.0", "cuda": "", "gpu": "", "sm": "", "py": "3.11.14", "plat": "macOS-15.7.2-arm64-arm-64bit"}, "lat_ms": {"p10": 0.1742909999506992, "p50": 0.17550000001165245, "p90": 0.17633400000249821, "mean": 0.17563320000135718, "iqr": 0.001000999986899842, "raw_times": [0.1742909999506992, 0.17633400000249821, 0.17533300001559837, 0.17670800002633769, 0.17550000001165245], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.731916999995974, "peak_bytes": null, "ok": false, "absmax": 0.06802082061767578, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.06802082061767578, "mae": 0.0008884685230441391, "mse": 4.475335117604118e-06, "ref": "swiglu_fp32"}, "err": null}
|
| 3 |
+
{"ts": "2025-12-19T22:43:48Z", "run": "4f266502732344bb9d0a3cd9527f873a", "impl": "torch_eager_darwin", "tags": {"family": "pytorch", "backend": "eager", "platform": "darwin"}, "wl": {"name": "cpu_T128_D2048", "num_tokens": 128, "hidden_dim": 2048, "dtype": "float32", "device": "cpu", "seed": 0}, "env": {"torch": "2.8.0", "cuda": "", "gpu": "", "sm": "", "py": "3.11.14", "plat": "macOS-15.7.2-arm64-arm-64bit"}, "lat_ms": {"p10": 0.35966699999789853, "p50": 0.3839590000325188, "p90": 0.4197920000024169, "mean": 0.3930668000066362, "iqr": 0.05745900000420079, "raw_times": [0.35966699999789853, 0.3623329999982161, 0.4395830000021306, 0.4197920000024169, 0.3839590000325188], "has_warnings": true, "reps": 5, "warmup": 2}, "compile_ms": 0.37070900003755014, "peak_bytes": null, "ok": false, "absmax": 0.07091712951660156, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.07091712951660156, "mae": 0.0008893357589840889, "mse": 4.469751274882583e-06, "ref": "swiglu_fp32"}, "err": null}
|
| 4 |
+
{"ts": "2025-12-19T22:43:48Z", "run": "4f266502732344bb9d0a3cd9527f873a", "impl": "torch_eager_darwin", "tags": {"family": "pytorch", "backend": "eager", "platform": "darwin"}, "wl": {"name": "cpu_T256_D768", "num_tokens": 256, "hidden_dim": 768, "dtype": "float32", "device": "cpu", "seed": 0}, "env": {"torch": "2.8.0", "cuda": "", "gpu": "", "sm": "", "py": "3.11.14", "plat": "macOS-15.7.2-arm64-arm-64bit"}, "lat_ms": {"p10": 0.27337500000612636, "p50": 0.325791999955527, "p90": 0.3564579999988382, "mean": 0.5360415999916768, "iqr": 0.03887500002974775, "raw_times": [1.4070000000288019, 0.3564579999988382, 0.325791999955527, 0.27337500000612636, 0.31758299996909045], "has_warnings": true, "reps": 5, "warmup": 2}, "compile_ms": 1.2649170000145205, "peak_bytes": null, "ok": false, "absmax": 0.04913330078125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.04913330078125, "mae": 0.0008873133920133114, "mse": 4.3958548303635325e-06, "ref": "swiglu_fp32"}, "err": null}
|
| 5 |
+
{"ts": "2025-12-19T22:43:48Z", "run": "4f266502732344bb9d0a3cd9527f873a", "impl": "torch_eager_darwin", "tags": {"family": "pytorch", "backend": "eager", "platform": "darwin"}, "wl": {"name": "cpu_T256_D1024", "num_tokens": 256, "hidden_dim": 1024, "dtype": "float32", "device": "cpu", "seed": 0}, "env": {"torch": "2.8.0", "cuda": "", "gpu": "", "sm": "", "py": "3.11.14", "plat": "macOS-15.7.2-arm64-arm-64bit"}, "lat_ms": {"p10": 0.3514999999651991, "p50": 0.39737500003411697, "p90": 0.42058299999325754, "mean": 0.44304979999196803, "iqr": 0.05525000000261571, "raw_times": [0.42058299999325754, 0.39737500003411697, 0.6804579999766247, 0.3514999999651991, 0.36533299999064184], "has_warnings": true, "reps": 5, "warmup": 2}, "compile_ms": 1.976333000015984, "peak_bytes": null, "ok": false, "absmax": 0.06802082061767578, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.06802082061767578, "mae": 0.0008889895398169756, "mse": 4.431089109857567e-06, "ref": "swiglu_fp32"}, "err": null}
|
| 6 |
+
{"ts": "2025-12-19T22:43:48Z", "run": "4f266502732344bb9d0a3cd9527f873a", "impl": "torch_eager_darwin", "tags": {"family": "pytorch", "backend": "eager", "platform": "darwin"}, "wl": {"name": "cpu_T256_D2048", "num_tokens": 256, "hidden_dim": 2048, "dtype": "float32", "device": "cpu", "seed": 0}, "env": {"torch": "2.8.0", "cuda": "", "gpu": "", "sm": "", "py": "3.11.14", "plat": "macOS-15.7.2-arm64-arm-64bit"}, "lat_ms": {"p10": 0.9706249999794636, "p50": 0.9802499999977954, "p90": 3.842000000020107, "mean": 2.413258199999291, "iqr": 2.863209000054212, "raw_times": [3.842000000020107, 5.294625000033193, 0.9802499999977954, 0.978790999965895, 0.9706249999794636], "has_warnings": true, "reps": 5, "warmup": 2}, "compile_ms": 2.3860840000224925, "peak_bytes": null, "ok": false, "absmax": 0.08395957946777344, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.08395957946777344, "mae": 0.0008889408782124519, "mse": 4.476671620068373e-06, "ref": "swiglu_fp32"}, "err": null}
|
| 7 |
+
{"ts": "2025-12-19T22:43:48Z", "run": "4f266502732344bb9d0a3cd9527f873a", "impl": "torch_eager_darwin", "tags": {"family": "pytorch", "backend": "eager", "platform": "darwin"}, "wl": {"name": "cpu_T512_D768", "num_tokens": 512, "hidden_dim": 768, "dtype": "float32", "device": "cpu", "seed": 0}, "env": {"torch": "2.8.0", "cuda": "", "gpu": "", "sm": "", "py": "3.11.14", "plat": "macOS-15.7.2-arm64-arm-64bit"}, "lat_ms": {"p10": 0.6639999999720203, "p50": 0.8687079999845082, "p90": 1.1298749999468782, "mean": 0.9603583999933107, "iqr": 0.2749159999098083, "raw_times": [0.8549590000370699, 1.284250000026077, 1.1298749999468782, 0.6639999999720203, 0.8687079999845082], "has_warnings": true, "reps": 5, "warmup": 2}, "compile_ms": 0.7134589999964192, "peak_bytes": null, "ok": false, "absmax": 0.05687236785888672, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.05687236785888672, "mae": 0.0008884922135621309, "mse": 4.399109002406476e-06, "ref": "swiglu_fp32"}, "err": null}
|
| 8 |
+
{"ts": "2025-12-19T22:43:48Z", "run": "4f266502732344bb9d0a3cd9527f873a", "impl": "torch_eager_darwin", "tags": {"family": "pytorch", "backend": "eager", "platform": "darwin"}, "wl": {"name": "cpu_T512_D1024", "num_tokens": 512, "hidden_dim": 1024, "dtype": "float32", "device": "cpu", "seed": 0}, "env": {"torch": "2.8.0", "cuda": "", "gpu": "", "sm": "", "py": "3.11.14", "plat": "macOS-15.7.2-arm64-arm-64bit"}, "lat_ms": {"p10": 1.141958000005161, "p50": 1.6311670000277445, "p90": 1.6544580000186215, "mean": 1.7749248000086482, "iqr": 0.366167000038331, "raw_times": [1.6544580000186215, 1.2882909999802905, 3.1587500000114233, 1.6311670000277445, 1.141958000005161], "has_warnings": true, "reps": 5, "warmup": 2}, "compile_ms": 2.0730410000169286, "peak_bytes": null, "ok": false, "absmax": 0.06802082061767578, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.06802082061767578, "mae": 0.0008890957687981427, "mse": 4.448749677976593e-06, "ref": "swiglu_fp32"}, "err": null}
|
| 9 |
+
{"ts": "2025-12-19T22:43:49Z", "run": "4f266502732344bb9d0a3cd9527f873a", "impl": "torch_eager_darwin", "tags": {"family": "pytorch", "backend": "eager", "platform": "darwin"}, "wl": {"name": "cpu_T512_D2048", "num_tokens": 512, "hidden_dim": 2048, "dtype": "float32", "device": "cpu", "seed": 0}, "env": {"torch": "2.8.0", "cuda": "", "gpu": "", "sm": "", "py": "3.11.14", "plat": "macOS-15.7.2-arm64-arm-64bit"}, "lat_ms": {"p10": 2.664708999986942, "p50": 3.365374999987125, "p90": 3.6645420000240847, "mean": 3.5541085999966526, "iqr": 0.8831670000404301, "raw_times": [2.664708999986942, 3.6645420000240847, 3.365374999987125, 5.2945420000014565, 2.7813749999836546], "has_warnings": true, "reps": 5, "warmup": 2}, "compile_ms": 17.938291999996636, "peak_bytes": null, "ok": false, "absmax": 0.09098148345947266, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.09098148345947266, "mae": 0.0008892239420674741, "mse": 4.500504473980982e-06, "ref": "swiglu_fp32"}, "err": null}
|
activation/impls/cells/benchmark.py
CHANGED
|
@@ -22,7 +22,7 @@ def swiglu_eager(x):
|
|
| 22 |
|
| 23 |
run_benchmark(
|
| 24 |
kernel_type=KernelTypeEnum.ACTIVATION,
|
| 25 |
-
impl_name="
|
| 26 |
-
impl_tags={"family":"
|
| 27 |
impl_func=swiglu_eager,
|
| 28 |
)
|
|
|
|
| 22 |
|
| 23 |
run_benchmark(
|
| 24 |
kernel_type=KernelTypeEnum.ACTIVATION,
|
| 25 |
+
impl_name="torch_eager_darwin",
|
| 26 |
+
impl_tags={"family":"pytorch", "backend":"eager", "platform": "darwin"},
|
| 27 |
impl_func=swiglu_eager,
|
| 28 |
)
|
activation/impls/cells/sysinfo.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# /// script
|
| 2 |
+
# requires-python = ">=3.10"
|
| 3 |
+
# dependencies = [
|
| 4 |
+
# "torch==2.8.0",
|
| 5 |
+
# ]
|
| 6 |
+
# ///
|
| 7 |
+
import platform
|
| 8 |
+
import subprocess
|
| 9 |
+
print(f"Platform: {platform.system()} {platform.machine()}")
|
| 10 |
+
print(f"Python: {platform.python_version()}")
|
| 11 |
+
# Check for MPS availability
|
| 12 |
+
import torch
|
| 13 |
+
print(f"PyTorch: {torch.__version__}")
|
| 14 |
+
print(f"MPS available: {torch.backends.mps.is_available()}")
|
activation/impls/hf_kernels_swiglu.html
CHANGED
|
@@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 3888 |
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3889 |
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3890 |
</span> |
|
| 3891 |
-
Cell: nv | 0.
|
| 3892 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3893 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3894 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3905,7 +3905,7 @@ Cell: nv | 0.29s
|
|
| 3905 |
</div>
|
| 3906 |
</div>
|
| 3907 |
<div id="output-nv" class="cell-output">
|
| 3908 |
-
<div class="cell-stdout"><pre class="stdout-text">Fri Dec 19
|
| 3909 |
+-----------------------------------------------------------------------------------------+
|
| 3910 |
| NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 |
|
| 3911 |
+-----------------------------------------+------------------------+----------------------+
|
|
@@ -3914,7 +3914,7 @@ Cell: nv | 0.29s
|
|
| 3914 |
| | | MIG M. |
|
| 3915 |
|=========================================+========================+======================|
|
| 3916 |
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
|
| 3917 |
-
| N/A
|
| 3918 |
| | | N/A |
|
| 3919 |
+-----------------------------------------+------------------------+----------------------+
|
| 3920 |
|
|
@@ -3938,7 +3938,7 @@ Cell: nv | 0.29s
|
|
| 3938 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3939 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3940 |
</span> |
|
| 3941 |
-
Cell: benchmark | 8.
|
| 3942 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3943 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3944 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3995,16 +3995,16 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D768
|
|
| 3995 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3996 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3997 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3998 |
-
hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 3999 |
-
hf_kernels_swiglu 8.
|
| 4000 |
-
_activation_23bf3fb::silu_and_mul 0.
|
| 4001 |
void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.128us 100.00% 4.128us 1.376us 3
|
| 4002 |
-
Activity Buffer Request 85.
|
| 4003 |
-
aten::empty 2.
|
| 4004 |
-
cudaLaunchKernel 2.
|
| 4005 |
-
cudaDeviceSynchronize 0.
|
| 4006 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4007 |
-
Self CPU time total: 2.
|
| 4008 |
Self CUDA time total: 4.128us
|
| 4009 |
|
| 4010 |
|
|
@@ -4015,17 +4015,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D1024
|
|
| 4015 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4016 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4017 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4018 |
-
hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4019 |
-
hf_kernels_swiglu 4.
|
| 4020 |
-
_activation_23bf3fb::silu_and_mul
|
| 4021 |
-
void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4022 |
-
Activity Buffer Request 91.
|
| 4023 |
-
aten::empty
|
| 4024 |
-
cudaLaunchKernel 1.
|
| 4025 |
-
cudaDeviceSynchronize 0.
|
| 4026 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4027 |
-
Self CPU time total: 1.
|
| 4028 |
-
Self CUDA time total: 3.
|
| 4029 |
|
| 4030 |
|
| 4031 |
|
|
@@ -4035,17 +4035,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D2048
|
|
| 4035 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4036 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4037 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4038 |
-
hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4039 |
-
hf_kernels_swiglu 4.
|
| 4040 |
-
_activation_23bf3fb::silu_and_mul
|
| 4041 |
-
void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.
|
| 4042 |
-
Activity Buffer Request 91.
|
| 4043 |
-
aten::empty
|
| 4044 |
-
cudaLaunchKernel 1.
|
| 4045 |
-
cudaDeviceSynchronize 0.
|
| 4046 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4047 |
-
Self CPU time total: 1.
|
| 4048 |
-
Self CUDA time total: 4.
|
| 4049 |
|
| 4050 |
|
| 4051 |
|
|
@@ -4055,17 +4055,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D768
|
|
| 4055 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4056 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4057 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4058 |
-
hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 66.
|
| 4059 |
-
hf_kernels_swiglu 4.
|
| 4060 |
-
_activation_23bf3fb::silu_and_mul 0.
|
| 4061 |
-
void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.
|
| 4062 |
-
Activity Buffer Request
|
| 4063 |
-
aten::empty 0.
|
| 4064 |
-
cudaLaunchKernel
|
| 4065 |
-
cudaDeviceSynchronize 0.
|
| 4066 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4067 |
-
Self CPU time total: 2.
|
| 4068 |
-
Self CUDA time total: 4.
|
| 4069 |
|
| 4070 |
|
| 4071 |
|
|
@@ -4075,17 +4075,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D1024
|
|
| 4075 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4076 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4077 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4078 |
-
hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4079 |
-
hf_kernels_swiglu 4.
|
| 4080 |
-
_activation_23bf3fb::silu_and_mul 0.98% 20.
|
| 4081 |
-
void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 5.
|
| 4082 |
-
Activity Buffer Request
|
| 4083 |
-
aten::empty 0.
|
| 4084 |
-
cudaLaunchKernel
|
| 4085 |
-
cudaDeviceSynchronize 0.
|
| 4086 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4087 |
-
Self CPU time total: 2.
|
| 4088 |
-
Self CUDA time total: 5.
|
| 4089 |
|
| 4090 |
|
| 4091 |
|
|
@@ -4095,17 +4095,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D2048
|
|
| 4095 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4096 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4097 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4098 |
-
hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4099 |
-
hf_kernels_swiglu
|
| 4100 |
-
_activation_23bf3fb::silu_and_mul
|
| 4101 |
-
void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 7.
|
| 4102 |
-
Activity Buffer Request
|
| 4103 |
-
aten::empty 3.
|
| 4104 |
-
cudaLaunchKernel
|
| 4105 |
-
cudaDeviceSynchronize
|
| 4106 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4107 |
-
Self CPU time total:
|
| 4108 |
-
Self CUDA time total: 7.
|
| 4109 |
|
| 4110 |
|
| 4111 |
|
|
@@ -4115,16 +4115,16 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D768
|
|
| 4115 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4116 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4117 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4118 |
-
hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4119 |
-
hf_kernels_swiglu
|
| 4120 |
-
_activation_23bf3fb::silu_and_mul
|
| 4121 |
void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 6.624us 100.00% 6.624us 2.208us 3
|
| 4122 |
-
Activity Buffer Request
|
| 4123 |
-
aten::empty
|
| 4124 |
-
cudaLaunchKernel 34.
|
| 4125 |
-
cudaDeviceSynchronize
|
| 4126 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4127 |
-
Self CPU time total:
|
| 4128 |
Self CUDA time total: 6.624us
|
| 4129 |
|
| 4130 |
|
|
@@ -4135,17 +4135,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D1024
|
|
| 4135 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4136 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4137 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4138 |
-
hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4139 |
-
hf_kernels_swiglu 4.
|
| 4140 |
-
_activation_23bf3fb::silu_and_mul
|
| 4141 |
-
void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 9.
|
| 4142 |
-
Activity Buffer Request
|
| 4143 |
-
aten::empty
|
| 4144 |
-
cudaLaunchKernel
|
| 4145 |
-
cudaDeviceSynchronize 0.24% 4.
|
| 4146 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4147 |
-
Self CPU time total:
|
| 4148 |
-
Self CUDA time total: 9.
|
| 4149 |
|
| 4150 |
|
| 4151 |
|
|
@@ -4155,17 +4155,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D2048
|
|
| 4155 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4156 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4157 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4158 |
-
hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4159 |
-
hf_kernels_swiglu
|
| 4160 |
-
_activation_23bf3fb::silu_and_mul
|
| 4161 |
-
void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 13.
|
| 4162 |
-
Activity Buffer Request
|
| 4163 |
-
aten::empty
|
| 4164 |
-
cudaLaunchKernel 35.
|
| 4165 |
-
cudaDeviceSynchronize
|
| 4166 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4167 |
-
Self CPU time total:
|
| 4168 |
-
Self CUDA time total: 13.
|
| 4169 |
|
| 4170 |
|
| 4171 |
impl wl p50(ms) ok
|
|
@@ -4182,13 +4182,14 @@ hf_kernels_swiglu cuda_T512_D768 0.03 True
|
|
| 4182 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 4183 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4184 |
<div class="uv-logs-content" style="display: none;">
|
| 4185 |
-
Installed 51 packages in
|
| 4186 |
</div>
|
| 4187 |
</div>
|
| 4188 |
-
<div class="cell-stderr">Fetching 7 files: 0%| | 0/7 [00:00<?, ?it/s]
|
| 4189 |
-
|
| 4190 |
-
Fetching 7 files:
|
| 4191 |
-
Fetching 7 files:
|
|
|
|
| 4192 |
<div class="cell-artifacts">
|
| 4193 |
<h4>Artifacts:</h4>
|
| 4194 |
<a href="artifacts/benchmark/activation.jsonl" class="artifact" target="_blank">activation.jsonl</a>
|
|
|
|
| 3888 |
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3889 |
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3890 |
</span> |
|
| 3891 |
+
Cell: nv | 0.25s
|
| 3892 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3893 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3894 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3905 |
</div>
|
| 3906 |
</div>
|
| 3907 |
<div id="output-nv" class="cell-output">
|
| 3908 |
+
<div class="cell-stdout"><pre class="stdout-text">Fri Dec 19 23:01:11 2025
|
| 3909 |
+-----------------------------------------------------------------------------------------+
|
| 3910 |
| NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 |
|
| 3911 |
+-----------------------------------------+------------------------+----------------------+
|
|
|
|
| 3914 |
| | | MIG M. |
|
| 3915 |
|=========================================+========================+======================|
|
| 3916 |
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
|
| 3917 |
+
| N/A 39C P0 82W / 350W | 0MiB / 46068MiB | 10% Default |
|
| 3918 |
| | | N/A |
|
| 3919 |
+-----------------------------------------+------------------------+----------------------+
|
| 3920 |
|
|
|
|
| 3938 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3939 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3940 |
</span> |
|
| 3941 |
+
Cell: benchmark | 8.49s
|
| 3942 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3943 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3944 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3995 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3996 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3997 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3998 |
+
hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 76.129us 1844.21% 76.129us 76.129us 1
|
| 3999 |
+
hf_kernels_swiglu 8.60% 174.603us 99.27% 2.015ms 2.015ms 0.000us 0.00% 5.568us 5.568us 1
|
| 4000 |
+
_activation_23bf3fb::silu_and_mul 0.97% 19.670us 88.54% 1.797ms 599.020us 4.128us 100.00% 5.568us 1.856us 3
|
| 4001 |
void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.128us 100.00% 4.128us 1.376us 3
|
| 4002 |
+
Activity Buffer Request 85.37% 1.733ms 85.37% 1.733ms 1.733ms 1.440us 34.88% 1.440us 1.440us 1
|
| 4003 |
+
aten::empty 2.13% 43.191us 2.13% 43.191us 14.397us 0.000us 0.00% 0.000us 0.000us 3
|
| 4004 |
+
cudaLaunchKernel 2.20% 44.752us 2.20% 44.752us 14.917us 0.000us 0.00% 0.000us 0.000us 3
|
| 4005 |
+
cudaDeviceSynchronize 0.73% 14.741us 0.73% 14.741us 14.741us 0.000us 0.00% 0.000us 0.000us 1
|
| 4006 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4007 |
+
Self CPU time total: 2.030ms
|
| 4008 |
Self CUDA time total: 4.128us
|
| 4009 |
|
| 4010 |
|
|
|
|
| 4015 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4016 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4017 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4018 |
+
hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 62.783us 1582.23% 62.783us 62.783us 1
|
| 4019 |
+
hf_kernels_swiglu 4.95% 92.601us 99.70% 1.863ms 1.863ms 0.000us 0.00% 5.312us 5.312us 1
|
| 4020 |
+
_activation_23bf3fb::silu_and_mul 1.25% 23.392us 93.77% 1.753ms 584.220us 3.968us 100.00% 5.312us 1.771us 3
|
| 4021 |
+
void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 3.968us 100.00% 3.968us 1.323us 3
|
| 4022 |
+
Activity Buffer Request 91.17% 1.704ms 91.17% 1.704ms 1.704ms 1.344us 33.87% 1.344us 1.344us 1
|
| 4023 |
+
aten::empty 0.97% 18.160us 0.97% 18.160us 6.053us 0.000us 0.00% 0.000us 0.000us 3
|
| 4024 |
+
cudaLaunchKernel 1.35% 25.221us 1.35% 25.221us 8.407us 0.000us 0.00% 0.000us 0.000us 3
|
| 4025 |
+
cudaDeviceSynchronize 0.30% 5.620us 0.30% 5.620us 5.620us 0.000us 0.00% 0.000us 0.000us 1
|
| 4026 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4027 |
+
Self CPU time total: 1.869ms
|
| 4028 |
+
Self CUDA time total: 3.968us
|
| 4029 |
|
| 4030 |
|
| 4031 |
|
|
|
|
| 4035 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4036 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4037 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4038 |
+
hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 61.887us 1264.03% 61.887us 61.887us 1
|
| 4039 |
+
hf_kernels_swiglu 4.90% 91.392us 99.70% 1.861ms 1.861ms 0.000us 0.00% 6.528us 6.528us 1
|
| 4040 |
+
_activation_23bf3fb::silu_and_mul 1.06% 19.772us 93.81% 1.751ms 583.690us 4.896us 100.00% 6.528us 2.176us 3
|
| 4041 |
+
void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.896us 100.00% 4.896us 1.632us 3
|
| 4042 |
+
Activity Buffer Request 91.42% 1.706ms 91.42% 1.706ms 1.706ms 1.632us 33.33% 1.632us 1.632us 1
|
| 4043 |
+
aten::empty 1.00% 18.580us 1.00% 18.580us 6.193us 0.000us 0.00% 0.000us 0.000us 3
|
| 4044 |
+
cudaLaunchKernel 1.33% 24.870us 1.33% 24.870us 8.290us 0.000us 0.00% 0.000us 0.000us 3
|
| 4045 |
+
cudaDeviceSynchronize 0.30% 5.640us 0.30% 5.640us 5.640us 0.000us 0.00% 0.000us 0.000us 1
|
| 4046 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4047 |
+
Self CPU time total: 1.867ms
|
| 4048 |
+
Self CUDA time total: 4.896us
|
| 4049 |
|
| 4050 |
|
| 4051 |
|
|
|
|
| 4055 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4056 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4057 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4058 |
+
hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 66.431us 1560.88% 66.431us 66.431us 1
|
| 4059 |
+
hf_kernels_swiglu 4.62% 96.552us 99.72% 2.084ms 2.084ms 0.000us 0.00% 5.696us 5.696us 1
|
| 4060 |
+
_activation_23bf3fb::silu_and_mul 0.92% 19.230us 94.20% 1.969ms 656.267us 4.256us 100.00% 5.696us 1.899us 3
|
| 4061 |
+
void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.256us 100.00% 4.256us 1.419us 3
|
| 4062 |
+
Activity Buffer Request 82.63% 1.727ms 82.63% 1.727ms 1.727ms 1.440us 33.83% 1.440us 1.440us 1
|
| 4063 |
+
aten::empty 0.91% 18.961us 0.91% 18.961us 6.320us 0.000us 0.00% 0.000us 0.000us 3
|
| 4064 |
+
cudaLaunchKernel 10.64% 222.454us 10.64% 222.454us 74.151us 0.000us 0.00% 0.000us 0.000us 3
|
| 4065 |
+
cudaDeviceSynchronize 0.28% 5.800us 0.28% 5.800us 5.800us 0.000us 0.00% 0.000us 0.000us 1
|
| 4066 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4067 |
+
Self CPU time total: 2.090ms
|
| 4068 |
+
Self CUDA time total: 4.256us
|
| 4069 |
|
| 4070 |
|
| 4071 |
|
|
|
|
| 4075 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4076 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4077 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4078 |
+
hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 62.753us 1065.60% 62.753us 62.753us 1
|
| 4079 |
+
hf_kernels_swiglu 4.32% 90.233us 99.73% 2.084ms 2.084ms 0.000us 0.00% 7.842us 7.842us 1
|
| 4080 |
+
_activation_23bf3fb::silu_and_mul 0.98% 20.530us 94.51% 1.975ms 658.421us 5.889us 100.00% 7.842us 2.614us 3
|
| 4081 |
+
void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 5.889us 100.00% 5.889us 1.963us 3
|
| 4082 |
+
Activity Buffer Request 83.43% 1.744ms 83.43% 1.744ms 1.744ms 1.953us 33.16% 1.953us 1.953us 1
|
| 4083 |
+
aten::empty 0.90% 18.820us 0.90% 18.820us 6.273us 0.000us 0.00% 0.000us 0.000us 3
|
| 4084 |
+
cudaLaunchKernel 10.09% 210.974us 10.09% 210.974us 70.325us 0.000us 0.00% 0.000us 0.000us 3
|
| 4085 |
+
cudaDeviceSynchronize 0.27% 5.680us 0.27% 5.680us 5.680us 0.000us 0.00% 0.000us 0.000us 1
|
| 4086 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4087 |
+
Self CPU time total: 2.090ms
|
| 4088 |
+
Self CUDA time total: 5.889us
|
| 4089 |
|
| 4090 |
|
| 4091 |
|
|
|
|
| 4095 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4096 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4097 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4098 |
+
hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 58.974us 761.74% 58.974us 58.974us 1
|
| 4099 |
+
hf_kernels_swiglu 14.39% 83.563us 99.11% 575.543us 575.543us 0.000us 0.00% 10.333us 10.333us 1
|
| 4100 |
+
_activation_23bf3fb::silu_and_mul 3.37% 19.590us 81.67% 474.270us 158.090us 7.742us 100.00% 10.333us 3.444us 3
|
| 4101 |
+
void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 7.742us 100.00% 7.742us 2.581us 3
|
| 4102 |
+
Activity Buffer Request 43.30% 251.476us 43.30% 251.476us 251.476us 2.591us 33.47% 2.591us 2.591us 1
|
| 4103 |
+
aten::empty 3.05% 17.710us 3.05% 17.710us 5.903us 0.000us 0.00% 0.000us 0.000us 3
|
| 4104 |
+
cudaLaunchKernel 34.99% 203.204us 34.99% 203.204us 67.735us 0.000us 0.00% 0.000us 0.000us 3
|
| 4105 |
+
cudaDeviceSynchronize 0.89% 5.190us 0.89% 5.190us 5.190us 0.000us 0.00% 0.000us 0.000us 1
|
| 4106 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4107 |
+
Self CPU time total: 580.733us
|
| 4108 |
+
Self CUDA time total: 7.742us
|
| 4109 |
|
| 4110 |
|
| 4111 |
|
|
|
|
| 4115 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4116 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4117 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4118 |
+
hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 60.191us 908.68% 60.191us 60.191us 1
|
| 4119 |
+
hf_kernels_swiglu 14.49% 83.902us 99.19% 574.293us 574.293us 0.000us 0.00% 8.832us 8.832us 1
|
| 4120 |
+
_activation_23bf3fb::silu_and_mul 3.38% 19.561us 81.54% 472.101us 157.367us 6.624us 100.00% 8.832us 2.944us 3
|
| 4121 |
void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 6.624us 100.00% 6.624us 2.208us 3
|
| 4122 |
+
Activity Buffer Request 43.39% 251.205us 43.39% 251.205us 251.205us 2.208us 33.33% 2.208us 2.208us 1
|
| 4123 |
+
aten::empty 3.16% 18.290us 3.16% 18.290us 6.097us 0.000us 0.00% 0.000us 0.000us 3
|
| 4124 |
+
cudaLaunchKernel 34.77% 201.335us 34.77% 201.335us 67.112us 0.000us 0.00% 0.000us 0.000us 3
|
| 4125 |
+
cudaDeviceSynchronize 0.81% 4.680us 0.81% 4.680us 4.680us 0.000us 0.00% 0.000us 0.000us 1
|
| 4126 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4127 |
+
Self CPU time total: 578.973us
|
| 4128 |
Self CUDA time total: 6.624us
|
| 4129 |
|
| 4130 |
|
|
|
|
| 4135 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4136 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4137 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4138 |
+
hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 64.480us 685.45% 64.480us 64.480us 1
|
| 4139 |
+
hf_kernels_swiglu 4.47% 90.662us 99.76% 2.023ms 2.023ms 0.000us 0.00% 12.543us 12.543us 1
|
| 4140 |
+
_activation_23bf3fb::silu_and_mul 0.98% 19.960us 94.38% 1.913ms 637.817us 9.407us 100.00% 12.543us 4.181us 3
|
| 4141 |
+
void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 9.407us 100.00% 9.407us 3.136us 3
|
| 4142 |
+
Activity Buffer Request 83.63% 1.695ms 83.63% 1.695ms 1.695ms 3.136us 33.34% 3.136us 3.136us 1
|
| 4143 |
+
aten::empty 0.91% 18.421us 0.91% 18.421us 6.140us 0.000us 0.00% 0.000us 0.000us 3
|
| 4144 |
+
cudaLaunchKernel 9.77% 198.004us 9.77% 198.004us 66.001us 0.000us 0.00% 0.000us 0.000us 3
|
| 4145 |
+
cudaDeviceSynchronize 0.24% 4.950us 0.24% 4.950us 4.950us 0.000us 0.00% 0.000us 0.000us 1
|
| 4146 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4147 |
+
Self CPU time total: 2.027ms
|
| 4148 |
+
Self CUDA time total: 9.407us
|
| 4149 |
|
| 4150 |
|
| 4151 |
|
|
|
|
| 4155 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4156 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4157 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4158 |
+
hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 60.576us 465.11% 60.576us 60.576us 1
|
| 4159 |
+
hf_kernels_swiglu 15.18% 83.082us 99.12% 542.352us 542.352us 0.000us 0.00% 17.408us 17.408us 1
|
| 4160 |
+
_activation_23bf3fb::silu_and_mul 3.66% 20.041us 80.66% 441.340us 147.113us 13.024us 100.00% 17.408us 5.803us 3
|
| 4161 |
+
void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 13.024us 100.00% 13.024us 4.341us 3
|
| 4162 |
+
Activity Buffer Request 41.24% 225.625us 41.24% 225.625us 225.625us 4.384us 33.66% 4.384us 4.384us 1
|
| 4163 |
+
aten::empty 3.28% 17.930us 3.28% 17.930us 5.977us 0.000us 0.00% 0.000us 0.000us 3
|
| 4164 |
+
cudaLaunchKernel 35.76% 195.674us 35.76% 195.674us 65.225us 0.000us 0.00% 0.000us 0.000us 3
|
| 4165 |
+
cudaDeviceSynchronize 0.88% 4.811us 0.88% 4.811us 4.811us 0.000us 0.00% 0.000us 0.000us 1
|
| 4166 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4167 |
+
Self CPU time total: 547.163us
|
| 4168 |
+
Self CUDA time total: 13.024us
|
| 4169 |
|
| 4170 |
|
| 4171 |
impl wl p50(ms) ok
|
|
|
|
| 4182 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 4183 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4184 |
<div class="uv-logs-content" style="display: none;">
|
| 4185 |
+
Installed 51 packages in 306ms
|
| 4186 |
</div>
|
| 4187 |
</div>
|
| 4188 |
+
<div class="cell-stderr">Fetching 7 files: 0%| | 0/7 [00:00<?, ?it/s]Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.
|
| 4189 |
+
|
| 4190 |
+
Fetching 7 files: 29%|██▊ | 2/7 [00:00<00:00, 17.51it/s]
|
| 4191 |
+
Fetching 7 files: 71%|███████▏ | 5/7 [00:00<00:00, 14.39it/s]
|
| 4192 |
+
Fetching 7 files: 100%|██████████| 7/7 [00:00<00:00, 20.57it/s]</div>
|
| 4193 |
<div class="cell-artifacts">
|
| 4194 |
<h4>Artifacts:</h4>
|
| 4195 |
<a href="artifacts/benchmark/activation.jsonl" class="artifact" target="_blank">activation.jsonl</a>
|
activation/impls/index.html
CHANGED
|
@@ -82,8 +82,7 @@
|
|
| 82 |
</div>
|
| 83 |
<h1>Index of /activation/impls</h1>
|
| 84 |
<ul>
|
| 85 |
-
<li><a href='
|
| 86 |
-
<li><a href='torch_swiglu.html' class='file'>torch_swiglu.html</a></li>
|
| 87 |
</ul>
|
| 88 |
</body>
|
| 89 |
</html>
|
|
|
|
| 82 |
</div>
|
| 83 |
<h1>Index of /activation/impls</h1>
|
| 84 |
<ul>
|
| 85 |
+
<li><a href='torch_swiglu_darwin.html' class='file'>torch_swiglu_darwin.html</a></li>
|
|
|
|
| 86 |
</ul>
|
| 87 |
</body>
|
| 88 |
</html>
|
activation/impls/torch_swiglu.html
CHANGED
|
@@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 3888 |
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3889 |
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3890 |
</span> |
|
| 3891 |
-
Cell: nv | 0.
|
| 3892 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3893 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3894 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3904,7 +3904,7 @@ Cell: nv | 0.29s
|
|
| 3904 |
</div>
|
| 3905 |
</div>
|
| 3906 |
<div id="output-nv" class="cell-output">
|
| 3907 |
-
<div class="cell-stdout"><pre class="stdout-text">Fri Dec 19
|
| 3908 |
+-----------------------------------------------------------------------------------------+
|
| 3909 |
| NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 |
|
| 3910 |
+-----------------------------------------+------------------------+----------------------+
|
|
@@ -3913,7 +3913,7 @@ Cell: nv | 0.29s
|
|
| 3913 |
| | | MIG M. |
|
| 3914 |
|=========================================+========================+======================|
|
| 3915 |
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
|
| 3916 |
-
| N/A
|
| 3917 |
| | | N/A |
|
| 3918 |
+-----------------------------------------+------------------------+----------------------+
|
| 3919 |
|
|
@@ -3937,7 +3937,7 @@ Cell: nv | 0.29s
|
|
| 3937 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3938 |
<span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3939 |
</span> |
|
| 3940 |
-
Cell: benchmark | 3.
|
| 3941 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3942 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3943 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3987,20 +3987,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D768
|
|
| 3987 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3988 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3989 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3990 |
-
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 3991 |
-
torch_eager
|
| 3992 |
-
aten::silu 2.
|
| 3993 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 3994 |
-
aten::mul 1.
|
| 3995 |
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.176us 48.49% 6.176us 2.059us 3
|
| 3996 |
-
Activity Buffer Request
|
| 3997 |
-
aten::slice
|
| 3998 |
-
aten::as_strided 0.
|
| 3999 |
-
cudaLaunchKernel 3.
|
| 4000 |
-
cudaDeviceSynchronize 0.
|
| 4001 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4002 |
-
Self CPU time total: 2.
|
| 4003 |
-
Self CUDA time total: 12.
|
| 4004 |
|
| 4005 |
|
| 4006 |
|
|
@@ -4010,20 +4010,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D1024
|
|
| 4010 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4011 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4012 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4013 |
-
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4014 |
-
torch_eager 6.
|
| 4015 |
-
aten::silu
|
| 4016 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 4017 |
-
aten::mul 1.
|
| 4018 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.
|
| 4019 |
-
Activity Buffer Request
|
| 4020 |
-
aten::slice 1.
|
| 4021 |
-
aten::as_strided 0.
|
| 4022 |
-
cudaLaunchKernel 2.
|
| 4023 |
-
cudaDeviceSynchronize 0.
|
| 4024 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4025 |
-
Self CPU time total: 2.
|
| 4026 |
-
Self CUDA time total: 12.
|
| 4027 |
|
| 4028 |
|
| 4029 |
|
|
@@ -4033,20 +4033,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D2048
|
|
| 4033 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4034 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4035 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4036 |
-
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4037 |
-
torch_eager 6.
|
| 4038 |
-
aten::silu 2.
|
| 4039 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 4040 |
-
aten::mul 1.
|
| 4041 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 4042 |
-
Activity Buffer Request
|
| 4043 |
-
aten::slice 1.
|
| 4044 |
-
aten::as_strided 0.
|
| 4045 |
-
cudaLaunchKernel 2.
|
| 4046 |
-
cudaDeviceSynchronize 0.
|
| 4047 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4048 |
-
Self CPU time total: 1.
|
| 4049 |
-
Self CUDA time total: 13.
|
| 4050 |
|
| 4051 |
|
| 4052 |
|
|
@@ -4056,20 +4056,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D768
|
|
| 4056 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4057 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4058 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4059 |
-
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4060 |
-
torch_eager
|
| 4061 |
-
aten::silu 1.
|
| 4062 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 4063 |
-
aten::mul 1.
|
| 4064 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.112us 48.
|
| 4065 |
-
Activity Buffer Request
|
| 4066 |
-
aten::slice 1.
|
| 4067 |
-
aten::as_strided 0.
|
| 4068 |
-
cudaLaunchKernel 9.
|
| 4069 |
-
cudaDeviceSynchronize 0.
|
| 4070 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4071 |
Self CPU time total: 2.257ms
|
| 4072 |
-
Self CUDA time total: 12.
|
| 4073 |
|
| 4074 |
|
| 4075 |
|
|
@@ -4079,20 +4079,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D1024
|
|
| 4079 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4080 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4081 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4082 |
-
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4083 |
-
torch_eager 6.
|
| 4084 |
-
aten::silu
|
| 4085 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 4086 |
-
aten::mul 1.
|
| 4087 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 4088 |
-
Activity Buffer Request 79.
|
| 4089 |
-
aten::slice 1.
|
| 4090 |
-
aten::as_strided 0.
|
| 4091 |
-
cudaLaunchKernel 8.
|
| 4092 |
-
cudaDeviceSynchronize 0.
|
| 4093 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4094 |
-
Self CPU time total: 2.
|
| 4095 |
-
Self CUDA time total: 13.
|
| 4096 |
|
| 4097 |
|
| 4098 |
|
|
@@ -4102,20 +4102,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D2048
|
|
| 4102 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4103 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4104 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4105 |
-
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4106 |
-
torch_eager
|
| 4107 |
-
aten::silu 1.
|
| 4108 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4109 |
-
aten::mul 1.
|
| 4110 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.
|
| 4111 |
-
Activity Buffer Request
|
| 4112 |
-
aten::slice 1.
|
| 4113 |
-
aten::as_strided 0.
|
| 4114 |
-
cudaLaunchKernel
|
| 4115 |
-
cudaDeviceSynchronize 0.
|
| 4116 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4117 |
-
Self CPU time total: 2.
|
| 4118 |
-
Self CUDA time total: 15.
|
| 4119 |
|
| 4120 |
|
| 4121 |
|
|
@@ -4125,20 +4125,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D768
|
|
| 4125 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4126 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4127 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4128 |
-
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4129 |
-
torch_eager 6.
|
| 4130 |
-
aten::silu 2.
|
| 4131 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.
|
| 4132 |
-
aten::mul 1.
|
| 4133 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4134 |
-
Activity Buffer Request
|
| 4135 |
-
aten::slice 1.
|
| 4136 |
-
aten::as_strided 0.
|
| 4137 |
-
cudaLaunchKernel
|
| 4138 |
-
cudaDeviceSynchronize 0.
|
| 4139 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4140 |
-
Self CPU time total: 2.
|
| 4141 |
-
Self CUDA time total: 14.
|
| 4142 |
|
| 4143 |
|
| 4144 |
|
|
@@ -4148,20 +4148,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D1024
|
|
| 4148 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4149 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4150 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4151 |
-
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4152 |
-
torch_eager 5.
|
| 4153 |
-
aten::silu
|
| 4154 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.
|
| 4155 |
-
aten::mul 1.
|
| 4156 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.616us 48.
|
| 4157 |
-
Activity Buffer Request 80.
|
| 4158 |
-
aten::slice 1.
|
| 4159 |
-
aten::as_strided 0.
|
| 4160 |
-
cudaLaunchKernel
|
| 4161 |
-
cudaDeviceSynchronize 0.27% 5.
|
| 4162 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4163 |
-
Self CPU time total: 2.
|
| 4164 |
-
Self CUDA time total: 15.
|
| 4165 |
|
| 4166 |
|
| 4167 |
|
|
@@ -4171,24 +4171,24 @@ PROFILE TRACE: torch_eager | cuda_T512_D2048
|
|
| 4171 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4172 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4173 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4174 |
-
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4175 |
-
torch_eager
|
| 4176 |
-
aten::silu
|
| 4177 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 11.
|
| 4178 |
-
aten::mul 1.
|
| 4179 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.
|
| 4180 |
-
Activity Buffer Request
|
| 4181 |
-
aten::slice 1.
|
| 4182 |
-
aten::as_strided 0.
|
| 4183 |
-
cudaLaunchKernel 8.
|
| 4184 |
-
cudaDeviceSynchronize 0.
|
| 4185 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4186 |
-
Self CPU time total: 2.
|
| 4187 |
-
Self CUDA time total: 22.
|
| 4188 |
|
| 4189 |
|
| 4190 |
impl wl p50(ms) ok
|
| 4191 |
-
torch_eager cuda_T128_D1024 0.
|
| 4192 |
torch_eager cuda_T128_D2048 0.05 True
|
| 4193 |
torch_eager cuda_T128_D768 0.04 True
|
| 4194 |
torch_eager cuda_T256_D1024 0.05 True
|
|
|
|
| 3888 |
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3889 |
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3890 |
</span> |
|
| 3891 |
+
Cell: nv | 0.25s
|
| 3892 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3893 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3894 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3904 |
</div>
|
| 3905 |
</div>
|
| 3906 |
<div id="output-nv" class="cell-output">
|
| 3907 |
+
<div class="cell-stdout"><pre class="stdout-text">Fri Dec 19 23:01:11 2025
|
| 3908 |
+-----------------------------------------------------------------------------------------+
|
| 3909 |
| NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 |
|
| 3910 |
+-----------------------------------------+------------------------+----------------------+
|
|
|
|
| 3913 |
| | | MIG M. |
|
| 3914 |
|=========================================+========================+======================|
|
| 3915 |
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
|
| 3916 |
+
| N/A 39C P0 82W / 350W | 0MiB / 46068MiB | 10% Default |
|
| 3917 |
| | | N/A |
|
| 3918 |
+-----------------------------------------+------------------------+----------------------+
|
| 3919 |
|
|
|
|
| 3937 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3938 |
<span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3939 |
</span> |
|
| 3940 |
+
Cell: benchmark | 3.88s
|
| 3941 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3942 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3943 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3987 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3988 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3989 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3990 |
+
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 176.512us 1385.93% 176.512us 176.512us 1
|
| 3991 |
+
torch_eager 8.54% 185.335us 99.30% 2.155ms 2.155ms 0.000us 0.00% 15.072us 15.072us 1
|
| 3992 |
+
aten::silu 2.61% 56.610us 85.90% 1.864ms 621.400us 6.560us 51.51% 8.896us 2.965us 3
|
| 3993 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.560us 51.51% 6.560us 2.187us 3
|
| 3994 |
+
aten::mul 1.46% 31.580us 2.49% 54.091us 18.030us 6.176us 48.49% 6.176us 2.059us 3
|
| 3995 |
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.176us 48.49% 6.176us 2.059us 3
|
| 3996 |
+
Activity Buffer Request 81.25% 1.763ms 81.25% 1.763ms 1.763ms 2.336us 18.34% 2.336us 2.336us 1
|
| 3997 |
+
aten::slice 1.93% 41.799us 2.37% 51.470us 8.578us 0.000us 0.00% 0.000us 0.000us 6
|
| 3998 |
+
aten::as_strided 0.45% 9.671us 0.45% 9.671us 1.612us 0.000us 0.00% 0.000us 0.000us 6
|
| 3999 |
+
cudaLaunchKernel 3.08% 66.742us 3.08% 66.742us 11.124us 0.000us 0.00% 0.000us 0.000us 6
|
| 4000 |
+
cudaDeviceSynchronize 0.70% 15.141us 0.70% 15.141us 15.141us 0.000us 0.00% 0.000us 0.000us 1
|
| 4001 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4002 |
+
Self CPU time total: 2.170ms
|
| 4003 |
+
Self CUDA time total: 12.736us
|
| 4004 |
|
| 4005 |
|
| 4006 |
|
|
|
|
| 4010 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4011 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4012 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4013 |
+
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 147.103us 1187.66% 147.103us 147.103us 1
|
| 4014 |
+
torch_eager 6.42% 134.022us 99.73% 2.081ms 2.081ms 0.000us 0.00% 14.563us 14.563us 1
|
| 4015 |
+
aten::silu 1.84% 38.392us 89.66% 1.871ms 623.681us 6.401us 51.68% 8.578us 2.859us 3
|
| 4016 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.401us 51.68% 6.401us 2.134us 3
|
| 4017 |
+
aten::mul 1.30% 27.120us 2.25% 46.940us 15.647us 5.985us 48.32% 5.985us 1.995us 3
|
| 4018 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.985us 48.32% 5.985us 1.995us 3
|
| 4019 |
+
Activity Buffer Request 86.49% 1.805ms 86.49% 1.805ms 1.805ms 2.177us 17.58% 2.177us 2.177us 1
|
| 4020 |
+
aten::slice 1.12% 23.282us 1.39% 29.102us 4.850us 0.000us 0.00% 0.000us 0.000us 6
|
| 4021 |
+
aten::as_strided 0.28% 5.820us 0.28% 5.820us 0.970us 0.000us 0.00% 0.000us 0.000us 6
|
| 4022 |
+
cudaLaunchKernel 2.28% 47.661us 2.28% 47.661us 7.944us 0.000us 0.00% 0.000us 0.000us 6
|
| 4023 |
+
cudaDeviceSynchronize 0.27% 5.671us 0.27% 5.671us 5.671us 0.000us 0.00% 0.000us 0.000us 1
|
| 4024 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4025 |
+
Self CPU time total: 2.087ms
|
| 4026 |
+
Self CUDA time total: 12.386us
|
| 4027 |
|
| 4028 |
|
| 4029 |
|
|
|
|
| 4033 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4034 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4035 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4036 |
+
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 145.888us 1106.55% 145.888us 145.888us 1
|
| 4037 |
+
torch_eager 6.31% 124.322us 99.70% 1.963ms 1.963ms 0.000us 0.00% 15.456us 15.456us 1
|
| 4038 |
+
aten::silu 2.05% 40.451us 89.58% 1.764ms 587.980us 6.784us 51.46% 9.056us 3.019us 3
|
| 4039 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.784us 51.46% 6.784us 2.261us 3
|
| 4040 |
+
aten::mul 1.27% 25.091us 2.33% 45.941us 15.314us 6.400us 48.54% 6.400us 2.133us 3
|
| 4041 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.400us 48.54% 6.400us 2.133us 3
|
| 4042 |
+
Activity Buffer Request 86.22% 1.698ms 86.22% 1.698ms 1.698ms 2.272us 17.23% 2.272us 2.272us 1
|
| 4043 |
+
aten::slice 1.19% 23.361us 1.47% 29.031us 4.839us 0.000us 0.00% 0.000us 0.000us 6
|
| 4044 |
+
aten::as_strided 0.29% 5.670us 0.29% 5.670us 0.945us 0.000us 0.00% 0.000us 0.000us 6
|
| 4045 |
+
cudaLaunchKernel 2.36% 46.481us 2.36% 46.481us 7.747us 0.000us 0.00% 0.000us 0.000us 6
|
| 4046 |
+
cudaDeviceSynchronize 0.30% 5.880us 0.30% 5.880us 5.880us 0.000us 0.00% 0.000us 0.000us 1
|
| 4047 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4048 |
+
Self CPU time total: 1.969ms
|
| 4049 |
+
Self CUDA time total: 13.184us
|
| 4050 |
|
| 4051 |
|
| 4052 |
|
|
|
|
| 4056 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4057 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4058 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4059 |
+
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 156.959us 1244.82% 156.959us 156.959us 1
|
| 4060 |
+
torch_eager 5.42% 122.252us 99.77% 2.252ms 2.252ms 0.000us 0.00% 14.785us 14.785us 1
|
| 4061 |
+
aten::silu 1.78% 40.211us 90.81% 2.050ms 683.202us 6.497us 51.53% 8.673us 2.891us 3
|
| 4062 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.497us 51.53% 6.497us 2.166us 3
|
| 4063 |
+
aten::mul 1.27% 28.640us 2.19% 49.471us 16.490us 6.112us 48.47% 6.112us 2.037us 3
|
| 4064 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.112us 48.47% 6.112us 2.037us 3
|
| 4065 |
+
Activity Buffer Request 80.04% 1.807ms 80.04% 1.807ms 1.807ms 2.176us 17.26% 2.176us 2.176us 1
|
| 4066 |
+
aten::slice 1.10% 24.730us 1.36% 30.660us 5.110us 0.000us 0.00% 0.000us 0.000us 6
|
| 4067 |
+
aten::as_strided 0.26% 5.930us 0.26% 5.930us 0.988us 0.000us 0.00% 0.000us 0.000us 6
|
| 4068 |
+
cudaLaunchKernel 9.91% 223.637us 9.91% 223.637us 37.273us 0.000us 0.00% 0.000us 0.000us 6
|
| 4069 |
+
cudaDeviceSynchronize 0.23% 5.130us 0.23% 5.130us 5.130us 0.000us 0.00% 0.000us 0.000us 1
|
| 4070 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4071 |
Self CPU time total: 2.257ms
|
| 4072 |
+
Self CUDA time total: 12.609us
|
| 4073 |
|
| 4074 |
|
| 4075 |
|
|
|
|
| 4079 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4080 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4081 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4082 |
+
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 160.703us 1215.97% 160.703us 160.703us 1
|
| 4083 |
+
torch_eager 6.46% 135.762us 99.74% 2.098ms 2.098ms 0.000us 0.00% 15.488us 15.488us 1
|
| 4084 |
+
aten::silu 1.92% 40.421us 89.37% 1.880ms 626.541us 6.816us 51.57% 9.088us 3.029us 3
|
| 4085 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.816us 51.57% 6.816us 2.272us 3
|
| 4086 |
+
aten::mul 1.37% 28.851us 2.33% 49.101us 16.367us 6.400us 48.43% 6.400us 2.133us 3
|
| 4087 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.400us 48.43% 6.400us 2.133us 3
|
| 4088 |
+
Activity Buffer Request 79.67% 1.676ms 79.67% 1.676ms 1.676ms 2.272us 17.19% 2.272us 2.272us 1
|
| 4089 |
+
aten::slice 1.24% 26.071us 1.57% 33.081us 5.513us 0.000us 0.00% 0.000us 0.000us 6
|
| 4090 |
+
aten::as_strided 0.33% 7.010us 0.33% 7.010us 1.168us 0.000us 0.00% 0.000us 0.000us 6
|
| 4091 |
+
cudaLaunchKernel 8.75% 183.945us 8.75% 183.945us 30.657us 0.000us 0.00% 0.000us 0.000us 6
|
| 4092 |
+
cudaDeviceSynchronize 0.26% 5.530us 0.26% 5.530us 5.530us 0.000us 0.00% 0.000us 0.000us 1
|
| 4093 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4094 |
+
Self CPU time total: 2.103ms
|
| 4095 |
+
Self CUDA time total: 13.216us
|
| 4096 |
|
| 4097 |
|
| 4098 |
|
|
|
|
| 4102 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4103 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4104 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4105 |
+
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 160.606us 1034.83% 160.606us 160.606us 1
|
| 4106 |
+
torch_eager 5.99% 133.963us 99.76% 2.233ms 2.233ms 0.000us 0.00% 18.208us 18.208us 1
|
| 4107 |
+
aten::silu 1.79% 40.170us 90.10% 2.017ms 672.181us 7.936us 51.13% 10.624us 3.541us 3
|
| 4108 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.936us 51.13% 7.936us 2.645us 3
|
| 4109 |
+
aten::mul 1.29% 28.971us 2.18% 48.701us 16.234us 7.584us 48.87% 7.584us 2.528us 3
|
| 4110 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.584us 48.87% 7.584us 2.528us 3
|
| 4111 |
+
Activity Buffer Request 81.23% 1.818ms 81.23% 1.818ms 1.818ms 2.688us 17.32% 2.688us 2.688us 1
|
| 4112 |
+
aten::slice 1.18% 26.440us 1.50% 33.480us 5.580us 0.000us 0.00% 0.000us 0.000us 6
|
| 4113 |
+
aten::as_strided 0.31% 7.040us 0.31% 7.040us 1.173us 0.000us 0.00% 0.000us 0.000us 6
|
| 4114 |
+
cudaLaunchKernel 7.96% 178.055us 7.96% 178.055us 29.676us 0.000us 0.00% 0.000us 0.000us 6
|
| 4115 |
+
cudaDeviceSynchronize 0.24% 5.430us 0.24% 5.430us 5.430us 0.000us 0.00% 0.000us 0.000us 1
|
| 4116 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4117 |
+
Self CPU time total: 2.238ms
|
| 4118 |
+
Self CUDA time total: 15.520us
|
| 4119 |
|
| 4120 |
|
| 4121 |
|
|
|
|
| 4125 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4126 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4127 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4128 |
+
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 161.343us 1127.96% 161.343us 161.343us 1
|
| 4129 |
+
torch_eager 6.15% 126.753us 99.76% 2.055ms 2.055ms 0.000us 0.00% 16.768us 16.768us 1
|
| 4130 |
+
aten::silu 2.04% 42.050us 89.57% 1.845ms 614.923us 7.328us 51.23% 9.792us 3.264us 3
|
| 4131 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.328us 51.23% 7.328us 2.443us 3
|
| 4132 |
+
aten::mul 1.44% 29.680us 2.44% 50.310us 16.770us 6.976us 48.77% 6.976us 2.325us 3
|
| 4133 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.976us 48.77% 6.976us 2.325us 3
|
| 4134 |
+
Activity Buffer Request 78.32% 1.613ms 78.32% 1.613ms 1.613ms 2.464us 17.23% 2.464us 2.464us 1
|
| 4135 |
+
aten::slice 1.25% 25.802us 1.59% 32.722us 5.454us 0.000us 0.00% 0.000us 0.000us 6
|
| 4136 |
+
aten::as_strided 0.34% 6.920us 0.34% 6.920us 1.153us 0.000us 0.00% 0.000us 0.000us 6
|
| 4137 |
+
cudaLaunchKernel 10.21% 210.375us 10.21% 210.375us 35.062us 0.000us 0.00% 0.000us 0.000us 6
|
| 4138 |
+
cudaDeviceSynchronize 0.24% 4.981us 0.24% 4.981us 4.981us 0.000us 0.00% 0.000us 0.000us 1
|
| 4139 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4140 |
+
Self CPU time total: 2.060ms
|
| 4141 |
+
Self CUDA time total: 14.304us
|
| 4142 |
|
| 4143 |
|
| 4144 |
|
|
|
|
| 4148 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4149 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4150 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4151 |
+
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 155.936us 1000.62% 155.936us 155.936us 1
|
| 4152 |
+
torch_eager 5.31% 107.073us 99.73% 2.011ms 2.011ms 0.000us 0.00% 18.272us 18.272us 1
|
| 4153 |
+
aten::silu 1.95% 39.312us 90.55% 1.825ms 608.464us 7.968us 51.13% 10.656us 3.552us 3
|
| 4154 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.968us 51.13% 7.968us 2.656us 3
|
| 4155 |
+
aten::mul 1.40% 28.240us 2.34% 47.090us 15.697us 7.616us 48.87% 7.616us 2.539us 3
|
| 4156 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.616us 48.87% 7.616us 2.539us 3
|
| 4157 |
+
Activity Buffer Request 80.78% 1.628ms 80.78% 1.628ms 1.628ms 2.688us 17.25% 2.688us 2.688us 1
|
| 4158 |
+
aten::slice 1.22% 24.550us 1.54% 30.960us 5.160us 0.000us 0.00% 0.000us 0.000us 6
|
| 4159 |
+
aten::as_strided 0.32% 6.410us 0.32% 6.410us 1.068us 0.000us 0.00% 0.000us 0.000us 6
|
| 4160 |
+
cudaLaunchKernel 8.75% 176.473us 8.75% 176.473us 29.412us 0.000us 0.00% 0.000us 0.000us 6
|
| 4161 |
+
cudaDeviceSynchronize 0.27% 5.381us 0.27% 5.381us 5.381us 0.000us 0.00% 0.000us 0.000us 1
|
| 4162 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4163 |
+
Self CPU time total: 2.016ms
|
| 4164 |
+
Self CUDA time total: 15.584us
|
| 4165 |
|
| 4166 |
|
| 4167 |
|
|
|
|
| 4171 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4172 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4173 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4174 |
+
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 156.608us 695.20% 156.608us 156.608us 1
|
| 4175 |
+
torch_eager 4.97% 102.273us 99.73% 2.054ms 2.054ms 0.000us 0.00% 26.431us 26.431us 1
|
| 4176 |
+
aten::silu 1.93% 39.830us 90.91% 1.872ms 624.047us 11.552us 51.28% 15.456us 5.152us 3
|
| 4177 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 11.552us 51.28% 11.552us 3.851us 3
|
| 4178 |
+
aten::mul 1.40% 28.900us 2.35% 48.460us 16.153us 10.975us 48.72% 10.975us 3.658us 3
|
| 4179 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.975us 48.72% 10.975us 3.658us 3
|
| 4180 |
+
Activity Buffer Request 81.21% 1.672ms 81.21% 1.672ms 1.672ms 3.904us 17.33% 3.904us 3.904us 1
|
| 4181 |
+
aten::slice 1.20% 24.753us 1.50% 30.941us 5.157us 0.000us 0.00% 0.000us 0.000us 6
|
| 4182 |
+
aten::as_strided 0.30% 6.188us 0.30% 6.188us 1.031us 0.000us 0.00% 0.000us 0.000us 6
|
| 4183 |
+
cudaLaunchKernel 8.72% 179.534us 8.72% 179.534us 29.922us 0.000us 0.00% 0.000us 0.000us 6
|
| 4184 |
+
cudaDeviceSynchronize 0.27% 5.530us 0.27% 5.530us 5.530us 0.000us 0.00% 0.000us 0.000us 1
|
| 4185 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4186 |
+
Self CPU time total: 2.059ms
|
| 4187 |
+
Self CUDA time total: 22.527us
|
| 4188 |
|
| 4189 |
|
| 4190 |
impl wl p50(ms) ok
|
| 4191 |
+
torch_eager cuda_T128_D1024 0.05 True
|
| 4192 |
torch_eager cuda_T128_D2048 0.05 True
|
| 4193 |
torch_eager cuda_T128_D768 0.04 True
|
| 4194 |
torch_eager cuda_T256_D1024 0.05 True
|
activation/impls/torch_swiglu_darwin.html
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
activation/index.html
CHANGED
|
@@ -83,7 +83,7 @@
|
|
| 83 |
<h1>Index of /activation</h1>
|
| 84 |
<ul>
|
| 85 |
<li><a href='impls/index.html' class='dir'>impls/</a></li>
|
| 86 |
-
<li><a href='
|
| 87 |
</ul>
|
| 88 |
</body>
|
| 89 |
</html>
|
|
|
|
| 83 |
<h1>Index of /activation</h1>
|
| 84 |
<ul>
|
| 85 |
<li><a href='impls/index.html' class='dir'>impls/</a></li>
|
| 86 |
+
<li><a href='results_darwin/index.html' class='dir'>results_darwin/</a></li>
|
| 87 |
</ul>
|
| 88 |
</body>
|
| 89 |
</html>
|
activation/results_darwin/artifacts/combine/latency.svg
ADDED
|
|
Git LFS Details
|
activation/results_darwin/cells/combine.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# /// script
|
| 2 |
+
# requires-python = ">=3.10"
|
| 3 |
+
# dependencies = [
|
| 4 |
+
# "numpy",
|
| 5 |
+
# "torch==2.8.0",
|
| 6 |
+
# "kernels-benchmark-tools",
|
| 7 |
+
# "matplotlib"
|
| 8 |
+
# ]
|
| 9 |
+
#
|
| 10 |
+
# [tool.uv.sources]
|
| 11 |
+
# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
|
| 12 |
+
# ///
|
| 13 |
+
from kernels_benchmark_tools.core.visuals import generate_combined_results
|
| 14 |
+
|
| 15 |
+
# Map display names to uvnote environment variables
|
| 16 |
+
cache_env_map = {
|
| 17 |
+
"PyTorch SwiGLU (macOS)": "UVNOTE_FILE_TORCH_SWIGLU_DARWIN_BENCHMARK",
|
| 18 |
+
}
|
| 19 |
+
|
| 20 |
+
# Generate combined results with visualization
|
| 21 |
+
generate_combined_results(
|
| 22 |
+
cache_env_map=cache_env_map,
|
| 23 |
+
output_filename="activation.jsonl",
|
| 24 |
+
svg_filename="latency.svg"
|
| 25 |
+
)
|
activation/results_darwin/combined_results.html
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
activation/results_darwin/index.html
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html>
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset='UTF-8'>
|
| 5 |
+
<meta name='viewport' content='width=device-width, initial-scale=1.0'>
|
| 6 |
+
<title>Index of /activation/results_darwin</title>
|
| 7 |
+
<style>
|
| 8 |
+
:root {
|
| 9 |
+
--bg-primary: #0a0a0a;
|
| 10 |
+
--bg-secondary: #121212;
|
| 11 |
+
--bg-tertiary: #181818;
|
| 12 |
+
--text-primary: #e0e0e0;
|
| 13 |
+
--text-secondary: #888888;
|
| 14 |
+
--text-link: #64b5f6;
|
| 15 |
+
--border-primary: #2a2a2a;
|
| 16 |
+
}
|
| 17 |
+
body {
|
| 18 |
+
font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif;
|
| 19 |
+
background: var(--bg-primary);
|
| 20 |
+
color: var(--text-primary);
|
| 21 |
+
margin: 0;
|
| 22 |
+
padding: 16px;
|
| 23 |
+
max-width: 900px;
|
| 24 |
+
margin: 0 auto;
|
| 25 |
+
}
|
| 26 |
+
.controls {
|
| 27 |
+
display: flex;
|
| 28 |
+
justify-content: flex-end;
|
| 29 |
+
margin-bottom: 1rem;
|
| 30 |
+
}
|
| 31 |
+
.back-button {
|
| 32 |
+
background: var(--bg-secondary);
|
| 33 |
+
border: 1px solid var(--border-primary);
|
| 34 |
+
padding: 8px 12px;
|
| 35 |
+
border-radius: 4px;
|
| 36 |
+
color: var(--text-secondary);
|
| 37 |
+
cursor: pointer;
|
| 38 |
+
font-size: 0.9rem;
|
| 39 |
+
text-decoration: none;
|
| 40 |
+
display: inline-block;
|
| 41 |
+
}
|
| 42 |
+
.back-button:hover {
|
| 43 |
+
color: var(--text-primary);
|
| 44 |
+
background: var(--bg-tertiary);
|
| 45 |
+
}
|
| 46 |
+
h1 {
|
| 47 |
+
font-size: 1.5em;
|
| 48 |
+
margin: 1rem 0;
|
| 49 |
+
color: var(--text-primary);
|
| 50 |
+
border-bottom: 1px solid var(--border-primary);
|
| 51 |
+
padding-bottom: 0.5rem;
|
| 52 |
+
}
|
| 53 |
+
ul {
|
| 54 |
+
list-style-type: none;
|
| 55 |
+
padding: 0;
|
| 56 |
+
}
|
| 57 |
+
li {
|
| 58 |
+
margin: 0;
|
| 59 |
+
border-bottom: 1px solid var(--border-primary);
|
| 60 |
+
}
|
| 61 |
+
li:last-child {
|
| 62 |
+
border-bottom: none;
|
| 63 |
+
}
|
| 64 |
+
a {
|
| 65 |
+
display: block;
|
| 66 |
+
padding: 0.75rem 0.5rem;
|
| 67 |
+
text-decoration: none;
|
| 68 |
+
color: var(--text-link);
|
| 69 |
+
transition: background 0.2s ease;
|
| 70 |
+
}
|
| 71 |
+
a:hover {
|
| 72 |
+
background: var(--bg-secondary);
|
| 73 |
+
}
|
| 74 |
+
.dir {
|
| 75 |
+
font-weight: 500;
|
| 76 |
+
}
|
| 77 |
+
</style>
|
| 78 |
+
</head>
|
| 79 |
+
<body>
|
| 80 |
+
<div class='controls'>
|
| 81 |
+
<a href='../index.html' class='back-button'>← back</a>
|
| 82 |
+
</div>
|
| 83 |
+
<h1>Index of /activation/results_darwin</h1>
|
| 84 |
+
<ul>
|
| 85 |
+
<li><a href='combined_results.html' class='file'>combined_results.html</a></li>
|
| 86 |
+
</ul>
|
| 87 |
+
</body>
|
| 88 |
+
</html>
|
activation/results_linux/artifacts/combine/latency.svg
CHANGED
|
|
Git LFS Details
|
|
|
Git LFS Details
|
activation/results_linux/combined_results.html
CHANGED
|
@@ -3889,7 +3889,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 3889 |
<rdf:RDF>
|
| 3890 |
<ns2:Work>
|
| 3891 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 3892 |
-
<dc:date>2025-12-
|
| 3893 |
<dc:format>image/svg+xml</dc:format>
|
| 3894 |
<dc:creator>
|
| 3895 |
<ns2:Agent>
|
|
@@ -4038,96 +4038,83 @@ body[data-tool="eraser"] .main-content {
|
|
| 4038 |
<g id="matplotlib.axis_2">
|
| 4039 |
<g id="ytick_1">
|
| 4040 |
<g id="grid-y--2" class="grid grid-y">
|
| 4041 |
-
<path d="M 60.23
|
| 4042 |
</g>
|
| 4043 |
<g id="line2d_10">
|
| 4044 |
<defs>
|
| 4045 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4046 |
</defs>
|
| 4047 |
<g>
|
| 4048 |
-
<use ns4:href="#m0fca2865ba" x="60.23" y="
|
| 4049 |
</g>
|
| 4050 |
</g>
|
| 4051 |
<g id="text_10">
|
| 4052 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="
|
| 4053 |
</g>
|
| 4054 |
</g>
|
| 4055 |
<g id="ytick_2">
|
| 4056 |
<g id="grid-y--3" class="grid grid-y">
|
| 4057 |
-
<path d="M 60.23
|
| 4058 |
</g>
|
| 4059 |
<g id="line2d_11">
|
| 4060 |
<g>
|
| 4061 |
-
<use ns4:href="#m0fca2865ba" x="60.23" y="
|
| 4062 |
</g>
|
| 4063 |
</g>
|
| 4064 |
<g id="text_11">
|
| 4065 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="
|
| 4066 |
</g>
|
| 4067 |
</g>
|
| 4068 |
<g id="ytick_3">
|
| 4069 |
<g id="grid-y--4" class="grid grid-y">
|
| 4070 |
-
<path d="M 60.23
|
| 4071 |
</g>
|
| 4072 |
<g id="line2d_12">
|
| 4073 |
<g>
|
| 4074 |
-
<use ns4:href="#m0fca2865ba" x="60.23" y="
|
| 4075 |
</g>
|
| 4076 |
</g>
|
| 4077 |
<g id="text_12">
|
| 4078 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="
|
| 4079 |
</g>
|
| 4080 |
</g>
|
| 4081 |
<g id="ytick_4">
|
| 4082 |
<g id="grid-y--5" class="grid grid-y">
|
| 4083 |
-
<path d="M 60.23
|
| 4084 |
</g>
|
| 4085 |
<g id="line2d_13">
|
| 4086 |
<g>
|
| 4087 |
-
<use ns4:href="#m0fca2865ba" x="60.23" y="
|
| 4088 |
</g>
|
| 4089 |
</g>
|
| 4090 |
<g id="text_13">
|
| 4091 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="
|
| 4092 |
</g>
|
| 4093 |
</g>
|
| 4094 |
<g id="ytick_5">
|
| 4095 |
<g id="grid-y--6" class="grid grid-y">
|
| 4096 |
-
<path d="M 60.23
|
| 4097 |
</g>
|
| 4098 |
<g id="line2d_14">
|
| 4099 |
<g>
|
| 4100 |
-
<use ns4:href="#m0fca2865ba" x="60.23" y="
|
| 4101 |
</g>
|
| 4102 |
</g>
|
| 4103 |
<g id="text_14">
|
| 4104 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="
|
| 4105 |
</g>
|
| 4106 |
</g>
|
| 4107 |
<g id="ytick_6">
|
| 4108 |
<g id="grid-y--7" class="grid grid-y">
|
| 4109 |
-
<path d="M 60.23
|
| 4110 |
</g>
|
| 4111 |
<g id="line2d_15">
|
| 4112 |
<g>
|
| 4113 |
-
<use ns4:href="#m0fca2865ba" x="60.23" y="
|
| 4114 |
</g>
|
| 4115 |
</g>
|
| 4116 |
<g id="text_15">
|
| 4117 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="
|
| 4118 |
-
</g>
|
| 4119 |
-
</g>
|
| 4120 |
-
<g id="ytick_7">
|
| 4121 |
-
<g id="grid-y--8" class="grid grid-y">
|
| 4122 |
-
<path d="M 60.23 52.395855 L 847.294169 52.395855 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4123 |
-
</g>
|
| 4124 |
-
<g id="line2d_16">
|
| 4125 |
-
<g>
|
| 4126 |
-
<use ns4:href="#m0fca2865ba" x="60.23" y="52.395855" style="stroke: #000000; stroke-width: 0.8" />
|
| 4127 |
-
</g>
|
| 4128 |
-
</g>
|
| 4129 |
-
<g id="text_16">
|
| 4130 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="56.195074" transform="rotate(-0 53.23 56.195074)">0.055</text>
|
| 4131 |
</g>
|
| 4132 |
</g>
|
| 4133 |
<g id="label--y" class="ylabel">
|
|
@@ -4135,37 +4122,37 @@ body[data-tool="eraser"] .main-content {
|
|
| 4135 |
</g>
|
| 4136 |
</g>
|
| 4137 |
<g id="series--hf-kernels-swiglu" class="series">
|
| 4138 |
-
<path d="M 96.005644 451.16779 L 185.444754
|
| 4139 |
<defs>
|
| 4140 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4141 |
</defs>
|
| 4142 |
<g clip-path="url(#p620c7d392f)">
|
| 4143 |
<use ns4:href="#md7efaf3aec" x="96.005644" y="451.16779" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4144 |
-
<use ns4:href="#md7efaf3aec" x="185.444754" y="
|
| 4145 |
-
<use ns4:href="#md7efaf3aec" x="274.883864" y="
|
| 4146 |
-
<use ns4:href="#md7efaf3aec" x="364.322974" y="
|
| 4147 |
-
<use ns4:href="#md7efaf3aec" x="453.762084" y="
|
| 4148 |
-
<use ns4:href="#md7efaf3aec" x="543.201194" y="
|
| 4149 |
-
<use ns4:href="#md7efaf3aec" x="632.640304" y="397.
|
| 4150 |
-
<use ns4:href="#md7efaf3aec" x="722.079415" y="
|
| 4151 |
-
<use ns4:href="#md7efaf3aec" x="811.518525" y="
|
| 4152 |
</g>
|
| 4153 |
</g>
|
| 4154 |
<g id="series--torch-eager" class="series">
|
| 4155 |
-
<path d="M 96.005644
|
| 4156 |
<defs>
|
| 4157 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4158 |
</defs>
|
| 4159 |
<g clip-path="url(#p620c7d392f)">
|
| 4160 |
-
<use ns4:href="#m9b8c54d372" x="96.005644" y="
|
| 4161 |
-
<use ns4:href="#m9b8c54d372" x="185.444754" y="
|
| 4162 |
-
<use ns4:href="#m9b8c54d372" x="274.883864" y="61.
|
| 4163 |
-
<use ns4:href="#m9b8c54d372" x="364.322974" y="
|
| 4164 |
-
<use ns4:href="#m9b8c54d372" x="453.762084" y="
|
| 4165 |
-
<use ns4:href="#m9b8c54d372" x="543.201194" y="
|
| 4166 |
-
<use ns4:href="#m9b8c54d372" x="632.640304" y="
|
| 4167 |
-
<use ns4:href="#m9b8c54d372" x="722.079415" y="
|
| 4168 |
-
<use ns4:href="#m9b8c54d372" x="811.518525" y="
|
| 4169 |
</g>
|
| 4170 |
</g>
|
| 4171 |
<g id="patch_3">
|
|
@@ -4180,14 +4167,14 @@ body[data-tool="eraser"] .main-content {
|
|
| 4180 |
<g id="patch_6">
|
| 4181 |
<path d="M 60.23 26.88 L 847.294169 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
|
| 4182 |
</g>
|
| 4183 |
-
<g id="
|
| 4184 |
<text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="453.762084" y="20.88" transform="rotate(-0 453.762084 20.88)">Attention Implementation Latency</text>
|
| 4185 |
</g>
|
| 4186 |
<g id="legend" class="legend">
|
| 4187 |
<g id="patch_7">
|
| 4188 |
<path d="M 720.811356 466.37197 L 840.294169 466.37197 Q 842.294169 466.37197 842.294169 464.37197 L 842.294169 435.45947 Q 842.294169 433.45947 840.294169 433.45947 L 720.811356 433.45947 Q 718.811356 433.45947 718.811356 435.45947 L 718.811356 464.37197 Q 718.811356 466.37197 720.811356 466.37197 L 720.811356 466.37197 z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
|
| 4189 |
</g>
|
| 4190 |
-
<g id="
|
| 4191 |
<path d="M 722.811356 441.557908 L 732.811356 441.557908 L 742.811356 441.557908 " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 4192 |
<g>
|
| 4193 |
<use ns4:href="#md7efaf3aec" x="732.811356" y="441.557908" style="fill: #1f77b4; stroke: #1f77b4" />
|
|
@@ -4196,7 +4183,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 4196 |
<g id="legend-label--hf-kernels-swiglu" class="legend">
|
| 4197 |
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="750.811356" y="445.057908" transform="rotate(-0 750.811356 445.057908)">hf_kernels_swiglu</text>
|
| 4198 |
</g>
|
| 4199 |
-
<g id="
|
| 4200 |
<path d="M 722.811356 456.514158 L 732.811356 456.514158 L 742.811356 456.514158 " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 4201 |
<g>
|
| 4202 |
<use ns4:href="#m9b8c54d372" x="732.811356" y="456.514158" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
|
@@ -4223,7 +4210,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 4223 |
<span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
|
| 4224 |
<span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4225 |
</span> |
|
| 4226 |
-
Cell: combine | 4.
|
| 4227 |
| <button class="run-btn" onclick="runCell('combine')">▶ run</button>
|
| 4228 |
<button class="copy-btn" onclick="copyCell('combine')">Copy</button>
|
| 4229 |
<a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -4319,7 +4306,7 @@ hf_kernels_swiglu cuda_T256_D768 0.03 True
|
|
| 4319 |
hf_kernels_swiglu cuda_T512_D1024 0.03 True
|
| 4320 |
hf_kernels_swiglu cuda_T512_D2048 0.03 True
|
| 4321 |
hf_kernels_swiglu cuda_T512_D768 0.03 True
|
| 4322 |
-
torch_eager cuda_T128_D1024 0.
|
| 4323 |
torch_eager cuda_T128_D2048 0.05 True
|
| 4324 |
torch_eager cuda_T128_D768 0.04 True
|
| 4325 |
torch_eager cuda_T256_D1024 0.05 True
|
|
@@ -4347,7 +4334,7 @@ Implementations included:
|
|
| 4347 |
<div class="uv-install-logs" id="uv-logs-combine">
|
| 4348 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4349 |
<div class="uv-logs-content" style="display: none;">
|
| 4350 |
-
Installed 37 packages in
|
| 4351 |
</div>
|
| 4352 |
</div>
|
| 4353 |
<div class="cell-artifacts">
|
|
@@ -4360,7 +4347,7 @@ Installed 37 packages in 206ms
|
|
| 4360 |
<rdf:RDF>
|
| 4361 |
<ns2:Work>
|
| 4362 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 4363 |
-
<dc:date>2025-12-
|
| 4364 |
<dc:format>image/svg+xml</dc:format>
|
| 4365 |
<dc:creator>
|
| 4366 |
<ns2:Agent>
|
|
@@ -4509,96 +4496,83 @@ Installed 37 packages in 206ms
|
|
| 4509 |
<g id="matplotlib.axis_2">
|
| 4510 |
<g id="ytick_1">
|
| 4511 |
<g id="grid-y--2" class="grid grid-y">
|
| 4512 |
-
<path d="M 60.23
|
| 4513 |
</g>
|
| 4514 |
<g id="line2d_10">
|
| 4515 |
<defs>
|
| 4516 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4517 |
</defs>
|
| 4518 |
<g>
|
| 4519 |
-
<use ns4:href="#m0fca2865ba" x="60.23" y="
|
| 4520 |
</g>
|
| 4521 |
</g>
|
| 4522 |
<g id="text_10">
|
| 4523 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="
|
| 4524 |
</g>
|
| 4525 |
</g>
|
| 4526 |
<g id="ytick_2">
|
| 4527 |
<g id="grid-y--3" class="grid grid-y">
|
| 4528 |
-
<path d="M 60.23
|
| 4529 |
</g>
|
| 4530 |
<g id="line2d_11">
|
| 4531 |
<g>
|
| 4532 |
-
<use ns4:href="#m0fca2865ba" x="60.23" y="
|
| 4533 |
</g>
|
| 4534 |
</g>
|
| 4535 |
<g id="text_11">
|
| 4536 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="
|
| 4537 |
</g>
|
| 4538 |
</g>
|
| 4539 |
<g id="ytick_3">
|
| 4540 |
<g id="grid-y--4" class="grid grid-y">
|
| 4541 |
-
<path d="M 60.23
|
| 4542 |
</g>
|
| 4543 |
<g id="line2d_12">
|
| 4544 |
<g>
|
| 4545 |
-
<use ns4:href="#m0fca2865ba" x="60.23" y="
|
| 4546 |
</g>
|
| 4547 |
</g>
|
| 4548 |
<g id="text_12">
|
| 4549 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="
|
| 4550 |
</g>
|
| 4551 |
</g>
|
| 4552 |
<g id="ytick_4">
|
| 4553 |
<g id="grid-y--5" class="grid grid-y">
|
| 4554 |
-
<path d="M 60.23
|
| 4555 |
</g>
|
| 4556 |
<g id="line2d_13">
|
| 4557 |
<g>
|
| 4558 |
-
<use ns4:href="#m0fca2865ba" x="60.23" y="
|
| 4559 |
</g>
|
| 4560 |
</g>
|
| 4561 |
<g id="text_13">
|
| 4562 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="
|
| 4563 |
</g>
|
| 4564 |
</g>
|
| 4565 |
<g id="ytick_5">
|
| 4566 |
<g id="grid-y--6" class="grid grid-y">
|
| 4567 |
-
<path d="M 60.23
|
| 4568 |
</g>
|
| 4569 |
<g id="line2d_14">
|
| 4570 |
<g>
|
| 4571 |
-
<use ns4:href="#m0fca2865ba" x="60.23" y="
|
| 4572 |
</g>
|
| 4573 |
</g>
|
| 4574 |
<g id="text_14">
|
| 4575 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="
|
| 4576 |
</g>
|
| 4577 |
</g>
|
| 4578 |
<g id="ytick_6">
|
| 4579 |
<g id="grid-y--7" class="grid grid-y">
|
| 4580 |
-
<path d="M 60.23
|
| 4581 |
</g>
|
| 4582 |
<g id="line2d_15">
|
| 4583 |
<g>
|
| 4584 |
-
<use ns4:href="#m0fca2865ba" x="60.23" y="
|
| 4585 |
</g>
|
| 4586 |
</g>
|
| 4587 |
<g id="text_15">
|
| 4588 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="
|
| 4589 |
-
</g>
|
| 4590 |
-
</g>
|
| 4591 |
-
<g id="ytick_7">
|
| 4592 |
-
<g id="grid-y--8" class="grid grid-y">
|
| 4593 |
-
<path d="M 60.23 52.395855 L 847.294169 52.395855 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4594 |
-
</g>
|
| 4595 |
-
<g id="line2d_16">
|
| 4596 |
-
<g>
|
| 4597 |
-
<use ns4:href="#m0fca2865ba" x="60.23" y="52.395855" style="stroke: #000000; stroke-width: 0.8" />
|
| 4598 |
-
</g>
|
| 4599 |
-
</g>
|
| 4600 |
-
<g id="text_16">
|
| 4601 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="56.195074" transform="rotate(-0 53.23 56.195074)">0.055</text>
|
| 4602 |
</g>
|
| 4603 |
</g>
|
| 4604 |
<g id="label--y" class="ylabel">
|
|
@@ -4606,37 +4580,37 @@ Installed 37 packages in 206ms
|
|
| 4606 |
</g>
|
| 4607 |
</g>
|
| 4608 |
<g id="series--hf-kernels-swiglu" class="series">
|
| 4609 |
-
<path d="M 96.005644 451.16779 L 185.444754
|
| 4610 |
<defs>
|
| 4611 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4612 |
</defs>
|
| 4613 |
<g clip-path="url(#p620c7d392f)">
|
| 4614 |
<use ns4:href="#md7efaf3aec" x="96.005644" y="451.16779" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4615 |
-
<use ns4:href="#md7efaf3aec" x="185.444754" y="
|
| 4616 |
-
<use ns4:href="#md7efaf3aec" x="274.883864" y="
|
| 4617 |
-
<use ns4:href="#md7efaf3aec" x="364.322974" y="
|
| 4618 |
-
<use ns4:href="#md7efaf3aec" x="453.762084" y="
|
| 4619 |
-
<use ns4:href="#md7efaf3aec" x="543.201194" y="
|
| 4620 |
-
<use ns4:href="#md7efaf3aec" x="632.640304" y="397.
|
| 4621 |
-
<use ns4:href="#md7efaf3aec" x="722.079415" y="
|
| 4622 |
-
<use ns4:href="#md7efaf3aec" x="811.518525" y="
|
| 4623 |
</g>
|
| 4624 |
</g>
|
| 4625 |
<g id="series--torch-eager" class="series">
|
| 4626 |
-
<path d="M 96.005644
|
| 4627 |
<defs>
|
| 4628 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4629 |
</defs>
|
| 4630 |
<g clip-path="url(#p620c7d392f)">
|
| 4631 |
-
<use ns4:href="#m9b8c54d372" x="96.005644" y="
|
| 4632 |
-
<use ns4:href="#m9b8c54d372" x="185.444754" y="
|
| 4633 |
-
<use ns4:href="#m9b8c54d372" x="274.883864" y="61.
|
| 4634 |
-
<use ns4:href="#m9b8c54d372" x="364.322974" y="
|
| 4635 |
-
<use ns4:href="#m9b8c54d372" x="453.762084" y="
|
| 4636 |
-
<use ns4:href="#m9b8c54d372" x="543.201194" y="
|
| 4637 |
-
<use ns4:href="#m9b8c54d372" x="632.640304" y="
|
| 4638 |
-
<use ns4:href="#m9b8c54d372" x="722.079415" y="
|
| 4639 |
-
<use ns4:href="#m9b8c54d372" x="811.518525" y="
|
| 4640 |
</g>
|
| 4641 |
</g>
|
| 4642 |
<g id="patch_3">
|
|
@@ -4651,14 +4625,14 @@ Installed 37 packages in 206ms
|
|
| 4651 |
<g id="patch_6">
|
| 4652 |
<path d="M 60.23 26.88 L 847.294169 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
|
| 4653 |
</g>
|
| 4654 |
-
<g id="
|
| 4655 |
<text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="453.762084" y="20.88" transform="rotate(-0 453.762084 20.88)">Attention Implementation Latency</text>
|
| 4656 |
</g>
|
| 4657 |
<g id="legend" class="legend">
|
| 4658 |
<g id="patch_7">
|
| 4659 |
<path d="M 720.811356 466.37197 L 840.294169 466.37197 Q 842.294169 466.37197 842.294169 464.37197 L 842.294169 435.45947 Q 842.294169 433.45947 840.294169 433.45947 L 720.811356 433.45947 Q 718.811356 433.45947 718.811356 435.45947 L 718.811356 464.37197 Q 718.811356 466.37197 720.811356 466.37197 L 720.811356 466.37197 z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
|
| 4660 |
</g>
|
| 4661 |
-
<g id="
|
| 4662 |
<path d="M 722.811356 441.557908 L 732.811356 441.557908 L 742.811356 441.557908 " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 4663 |
<g>
|
| 4664 |
<use ns4:href="#md7efaf3aec" x="732.811356" y="441.557908" style="fill: #1f77b4; stroke: #1f77b4" />
|
|
@@ -4667,7 +4641,7 @@ Installed 37 packages in 206ms
|
|
| 4667 |
<g id="legend-label--hf-kernels-swiglu" class="legend">
|
| 4668 |
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="750.811356" y="445.057908" transform="rotate(-0 750.811356 445.057908)">hf_kernels_swiglu</text>
|
| 4669 |
</g>
|
| 4670 |
-
<g id="
|
| 4671 |
<path d="M 722.811356 456.514158 L 732.811356 456.514158 L 742.811356 456.514158 " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 4672 |
<g>
|
| 4673 |
<use ns4:href="#m9b8c54d372" x="732.811356" y="456.514158" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
|
|
|
| 3889 |
<rdf:RDF>
|
| 3890 |
<ns2:Work>
|
| 3891 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 3892 |
+
<dc:date>2025-12-19T23:02:36.234026</dc:date>
|
| 3893 |
<dc:format>image/svg+xml</dc:format>
|
| 3894 |
<dc:creator>
|
| 3895 |
<ns2:Agent>
|
|
|
|
| 4038 |
<g id="matplotlib.axis_2">
|
| 4039 |
<g id="ytick_1">
|
| 4040 |
<g id="grid-y--2" class="grid grid-y">
|
| 4041 |
+
<path d="M 60.23 439.989819 L 847.294169 439.989819 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4042 |
</g>
|
| 4043 |
<g id="line2d_10">
|
| 4044 |
<defs>
|
| 4045 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4046 |
</defs>
|
| 4047 |
<g>
|
| 4048 |
+
<use ns4:href="#m0fca2865ba" x="60.23" y="439.989819" style="stroke: #000000; stroke-width: 0.8" />
|
| 4049 |
</g>
|
| 4050 |
</g>
|
| 4051 |
<g id="text_10">
|
| 4052 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="443.789038" transform="rotate(-0 53.23 443.789038)">0.025</text>
|
| 4053 |
</g>
|
| 4054 |
</g>
|
| 4055 |
<g id="ytick_2">
|
| 4056 |
<g id="grid-y--3" class="grid grid-y">
|
| 4057 |
+
<path d="M 60.23 364.462996 L 847.294169 364.462996 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4058 |
</g>
|
| 4059 |
<g id="line2d_11">
|
| 4060 |
<g>
|
| 4061 |
+
<use ns4:href="#m0fca2865ba" x="60.23" y="364.462996" style="stroke: #000000; stroke-width: 0.8" />
|
| 4062 |
</g>
|
| 4063 |
</g>
|
| 4064 |
<g id="text_11">
|
| 4065 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="368.262215" transform="rotate(-0 53.23 368.262215)">0.030</text>
|
| 4066 |
</g>
|
| 4067 |
</g>
|
| 4068 |
<g id="ytick_3">
|
| 4069 |
<g id="grid-y--4" class="grid grid-y">
|
| 4070 |
+
<path d="M 60.23 288.936174 L 847.294169 288.936174 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4071 |
</g>
|
| 4072 |
<g id="line2d_12">
|
| 4073 |
<g>
|
| 4074 |
+
<use ns4:href="#m0fca2865ba" x="60.23" y="288.936174" style="stroke: #000000; stroke-width: 0.8" />
|
| 4075 |
</g>
|
| 4076 |
</g>
|
| 4077 |
<g id="text_12">
|
| 4078 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="292.735392" transform="rotate(-0 53.23 292.735392)">0.035</text>
|
| 4079 |
</g>
|
| 4080 |
</g>
|
| 4081 |
<g id="ytick_4">
|
| 4082 |
<g id="grid-y--5" class="grid grid-y">
|
| 4083 |
+
<path d="M 60.23 213.409351 L 847.294169 213.409351 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4084 |
</g>
|
| 4085 |
<g id="line2d_13">
|
| 4086 |
<g>
|
| 4087 |
+
<use ns4:href="#m0fca2865ba" x="60.23" y="213.409351" style="stroke: #000000; stroke-width: 0.8" />
|
| 4088 |
</g>
|
| 4089 |
</g>
|
| 4090 |
<g id="text_13">
|
| 4091 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="217.20857" transform="rotate(-0 53.23 217.20857)">0.040</text>
|
| 4092 |
</g>
|
| 4093 |
</g>
|
| 4094 |
<g id="ytick_5">
|
| 4095 |
<g id="grid-y--6" class="grid grid-y">
|
| 4096 |
+
<path d="M 60.23 137.882528 L 847.294169 137.882528 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4097 |
</g>
|
| 4098 |
<g id="line2d_14">
|
| 4099 |
<g>
|
| 4100 |
+
<use ns4:href="#m0fca2865ba" x="60.23" y="137.882528" style="stroke: #000000; stroke-width: 0.8" />
|
| 4101 |
</g>
|
| 4102 |
</g>
|
| 4103 |
<g id="text_14">
|
| 4104 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="141.681747" transform="rotate(-0 53.23 141.681747)">0.045</text>
|
| 4105 |
</g>
|
| 4106 |
</g>
|
| 4107 |
<g id="ytick_6">
|
| 4108 |
<g id="grid-y--7" class="grid grid-y">
|
| 4109 |
+
<path d="M 60.23 62.355705 L 847.294169 62.355705 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4110 |
</g>
|
| 4111 |
<g id="line2d_15">
|
| 4112 |
<g>
|
| 4113 |
+
<use ns4:href="#m0fca2865ba" x="60.23" y="62.355705" style="stroke: #000000; stroke-width: 0.8" />
|
| 4114 |
</g>
|
| 4115 |
</g>
|
| 4116 |
<g id="text_15">
|
| 4117 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="66.154924" transform="rotate(-0 53.23 66.154924)">0.050</text>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4118 |
</g>
|
| 4119 |
</g>
|
| 4120 |
<g id="label--y" class="ylabel">
|
|
|
|
| 4122 |
</g>
|
| 4123 |
</g>
|
| 4124 |
<g id="series--hf-kernels-swiglu" class="series">
|
| 4125 |
+
<path d="M 96.005644 451.16779 L 185.444754 381.81906 L 274.883864 399.492335 L 364.322974 403.132727 L 453.762084 404.326052 L 543.201194 418.540201 L 632.640304 397.090583 L 722.079415 413.087163 L 811.518525 398.299013 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 4126 |
<defs>
|
| 4127 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4128 |
</defs>
|
| 4129 |
<g clip-path="url(#p620c7d392f)">
|
| 4130 |
<use ns4:href="#md7efaf3aec" x="96.005644" y="451.16779" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4131 |
+
<use ns4:href="#md7efaf3aec" x="185.444754" y="381.81906" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4132 |
+
<use ns4:href="#md7efaf3aec" x="274.883864" y="399.492335" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4133 |
+
<use ns4:href="#md7efaf3aec" x="364.322974" y="403.132727" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4134 |
+
<use ns4:href="#md7efaf3aec" x="453.762084" y="404.326052" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4135 |
+
<use ns4:href="#md7efaf3aec" x="543.201194" y="418.540201" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4136 |
+
<use ns4:href="#md7efaf3aec" x="632.640304" y="397.090583" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4137 |
+
<use ns4:href="#md7efaf3aec" x="722.079415" y="413.087163" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4138 |
+
<use ns4:href="#md7efaf3aec" x="811.518525" y="398.299013" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4139 |
</g>
|
| 4140 |
</g>
|
| 4141 |
<g id="series--torch-eager" class="series">
|
| 4142 |
+
<path d="M 96.005644 187.579177 L 185.444754 58.095992 L 274.883864 61.268119 L 364.322974 62.778657 L 453.762084 62.340599 L 543.201194 67.476423 L 632.640304 47.08418 L 722.079415 54.03265 L 811.518525 57.204778 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 4143 |
<defs>
|
| 4144 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4145 |
</defs>
|
| 4146 |
<g clip-path="url(#p620c7d392f)">
|
| 4147 |
+
<use ns4:href="#m9b8c54d372" x="96.005644" y="187.579177" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4148 |
+
<use ns4:href="#m9b8c54d372" x="185.444754" y="58.095992" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4149 |
+
<use ns4:href="#m9b8c54d372" x="274.883864" y="61.268119" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4150 |
+
<use ns4:href="#m9b8c54d372" x="364.322974" y="62.778657" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4151 |
+
<use ns4:href="#m9b8c54d372" x="453.762084" y="62.340599" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4152 |
+
<use ns4:href="#m9b8c54d372" x="543.201194" y="67.476423" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4153 |
+
<use ns4:href="#m9b8c54d372" x="632.640304" y="47.08418" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4154 |
+
<use ns4:href="#m9b8c54d372" x="722.079415" y="54.03265" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4155 |
+
<use ns4:href="#m9b8c54d372" x="811.518525" y="57.204778" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4156 |
</g>
|
| 4157 |
</g>
|
| 4158 |
<g id="patch_3">
|
|
|
|
| 4167 |
<g id="patch_6">
|
| 4168 |
<path d="M 60.23 26.88 L 847.294169 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
|
| 4169 |
</g>
|
| 4170 |
+
<g id="text_16">
|
| 4171 |
<text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="453.762084" y="20.88" transform="rotate(-0 453.762084 20.88)">Attention Implementation Latency</text>
|
| 4172 |
</g>
|
| 4173 |
<g id="legend" class="legend">
|
| 4174 |
<g id="patch_7">
|
| 4175 |
<path d="M 720.811356 466.37197 L 840.294169 466.37197 Q 842.294169 466.37197 842.294169 464.37197 L 842.294169 435.45947 Q 842.294169 433.45947 840.294169 433.45947 L 720.811356 433.45947 Q 718.811356 433.45947 718.811356 435.45947 L 718.811356 464.37197 Q 718.811356 466.37197 720.811356 466.37197 L 720.811356 466.37197 z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
|
| 4176 |
</g>
|
| 4177 |
+
<g id="line2d_16">
|
| 4178 |
<path d="M 722.811356 441.557908 L 732.811356 441.557908 L 742.811356 441.557908 " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 4179 |
<g>
|
| 4180 |
<use ns4:href="#md7efaf3aec" x="732.811356" y="441.557908" style="fill: #1f77b4; stroke: #1f77b4" />
|
|
|
|
| 4183 |
<g id="legend-label--hf-kernels-swiglu" class="legend">
|
| 4184 |
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="750.811356" y="445.057908" transform="rotate(-0 750.811356 445.057908)">hf_kernels_swiglu</text>
|
| 4185 |
</g>
|
| 4186 |
+
<g id="line2d_17">
|
| 4187 |
<path d="M 722.811356 456.514158 L 732.811356 456.514158 L 742.811356 456.514158 " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 4188 |
<g>
|
| 4189 |
<use ns4:href="#m9b8c54d372" x="732.811356" y="456.514158" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
|
|
|
| 4210 |
<span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
|
| 4211 |
<span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4212 |
</span> |
|
| 4213 |
+
Cell: combine | 4.66s
|
| 4214 |
| <button class="run-btn" onclick="runCell('combine')">▶ run</button>
|
| 4215 |
<button class="copy-btn" onclick="copyCell('combine')">Copy</button>
|
| 4216 |
<a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 4306 |
hf_kernels_swiglu cuda_T512_D1024 0.03 True
|
| 4307 |
hf_kernels_swiglu cuda_T512_D2048 0.03 True
|
| 4308 |
hf_kernels_swiglu cuda_T512_D768 0.03 True
|
| 4309 |
+
torch_eager cuda_T128_D1024 0.05 True
|
| 4310 |
torch_eager cuda_T128_D2048 0.05 True
|
| 4311 |
torch_eager cuda_T128_D768 0.04 True
|
| 4312 |
torch_eager cuda_T256_D1024 0.05 True
|
|
|
|
| 4334 |
<div class="uv-install-logs" id="uv-logs-combine">
|
| 4335 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4336 |
<div class="uv-logs-content" style="display: none;">
|
| 4337 |
+
Installed 37 packages in 339ms
|
| 4338 |
</div>
|
| 4339 |
</div>
|
| 4340 |
<div class="cell-artifacts">
|
|
|
|
| 4347 |
<rdf:RDF>
|
| 4348 |
<ns2:Work>
|
| 4349 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 4350 |
+
<dc:date>2025-12-19T23:02:36.234026</dc:date>
|
| 4351 |
<dc:format>image/svg+xml</dc:format>
|
| 4352 |
<dc:creator>
|
| 4353 |
<ns2:Agent>
|
|
|
|
| 4496 |
<g id="matplotlib.axis_2">
|
| 4497 |
<g id="ytick_1">
|
| 4498 |
<g id="grid-y--2" class="grid grid-y">
|
| 4499 |
+
<path d="M 60.23 439.989819 L 847.294169 439.989819 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4500 |
</g>
|
| 4501 |
<g id="line2d_10">
|
| 4502 |
<defs>
|
| 4503 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4504 |
</defs>
|
| 4505 |
<g>
|
| 4506 |
+
<use ns4:href="#m0fca2865ba" x="60.23" y="439.989819" style="stroke: #000000; stroke-width: 0.8" />
|
| 4507 |
</g>
|
| 4508 |
</g>
|
| 4509 |
<g id="text_10">
|
| 4510 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="443.789038" transform="rotate(-0 53.23 443.789038)">0.025</text>
|
| 4511 |
</g>
|
| 4512 |
</g>
|
| 4513 |
<g id="ytick_2">
|
| 4514 |
<g id="grid-y--3" class="grid grid-y">
|
| 4515 |
+
<path d="M 60.23 364.462996 L 847.294169 364.462996 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4516 |
</g>
|
| 4517 |
<g id="line2d_11">
|
| 4518 |
<g>
|
| 4519 |
+
<use ns4:href="#m0fca2865ba" x="60.23" y="364.462996" style="stroke: #000000; stroke-width: 0.8" />
|
| 4520 |
</g>
|
| 4521 |
</g>
|
| 4522 |
<g id="text_11">
|
| 4523 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="368.262215" transform="rotate(-0 53.23 368.262215)">0.030</text>
|
| 4524 |
</g>
|
| 4525 |
</g>
|
| 4526 |
<g id="ytick_3">
|
| 4527 |
<g id="grid-y--4" class="grid grid-y">
|
| 4528 |
+
<path d="M 60.23 288.936174 L 847.294169 288.936174 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4529 |
</g>
|
| 4530 |
<g id="line2d_12">
|
| 4531 |
<g>
|
| 4532 |
+
<use ns4:href="#m0fca2865ba" x="60.23" y="288.936174" style="stroke: #000000; stroke-width: 0.8" />
|
| 4533 |
</g>
|
| 4534 |
</g>
|
| 4535 |
<g id="text_12">
|
| 4536 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="292.735392" transform="rotate(-0 53.23 292.735392)">0.035</text>
|
| 4537 |
</g>
|
| 4538 |
</g>
|
| 4539 |
<g id="ytick_4">
|
| 4540 |
<g id="grid-y--5" class="grid grid-y">
|
| 4541 |
+
<path d="M 60.23 213.409351 L 847.294169 213.409351 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4542 |
</g>
|
| 4543 |
<g id="line2d_13">
|
| 4544 |
<g>
|
| 4545 |
+
<use ns4:href="#m0fca2865ba" x="60.23" y="213.409351" style="stroke: #000000; stroke-width: 0.8" />
|
| 4546 |
</g>
|
| 4547 |
</g>
|
| 4548 |
<g id="text_13">
|
| 4549 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="217.20857" transform="rotate(-0 53.23 217.20857)">0.040</text>
|
| 4550 |
</g>
|
| 4551 |
</g>
|
| 4552 |
<g id="ytick_5">
|
| 4553 |
<g id="grid-y--6" class="grid grid-y">
|
| 4554 |
+
<path d="M 60.23 137.882528 L 847.294169 137.882528 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4555 |
</g>
|
| 4556 |
<g id="line2d_14">
|
| 4557 |
<g>
|
| 4558 |
+
<use ns4:href="#m0fca2865ba" x="60.23" y="137.882528" style="stroke: #000000; stroke-width: 0.8" />
|
| 4559 |
</g>
|
| 4560 |
</g>
|
| 4561 |
<g id="text_14">
|
| 4562 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="141.681747" transform="rotate(-0 53.23 141.681747)">0.045</text>
|
| 4563 |
</g>
|
| 4564 |
</g>
|
| 4565 |
<g id="ytick_6">
|
| 4566 |
<g id="grid-y--7" class="grid grid-y">
|
| 4567 |
+
<path d="M 60.23 62.355705 L 847.294169 62.355705 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4568 |
</g>
|
| 4569 |
<g id="line2d_15">
|
| 4570 |
<g>
|
| 4571 |
+
<use ns4:href="#m0fca2865ba" x="60.23" y="62.355705" style="stroke: #000000; stroke-width: 0.8" />
|
| 4572 |
</g>
|
| 4573 |
</g>
|
| 4574 |
<g id="text_15">
|
| 4575 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="66.154924" transform="rotate(-0 53.23 66.154924)">0.050</text>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4576 |
</g>
|
| 4577 |
</g>
|
| 4578 |
<g id="label--y" class="ylabel">
|
|
|
|
| 4580 |
</g>
|
| 4581 |
</g>
|
| 4582 |
<g id="series--hf-kernels-swiglu" class="series">
|
| 4583 |
+
<path d="M 96.005644 451.16779 L 185.444754 381.81906 L 274.883864 399.492335 L 364.322974 403.132727 L 453.762084 404.326052 L 543.201194 418.540201 L 632.640304 397.090583 L 722.079415 413.087163 L 811.518525 398.299013 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 4584 |
<defs>
|
| 4585 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4586 |
</defs>
|
| 4587 |
<g clip-path="url(#p620c7d392f)">
|
| 4588 |
<use ns4:href="#md7efaf3aec" x="96.005644" y="451.16779" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4589 |
+
<use ns4:href="#md7efaf3aec" x="185.444754" y="381.81906" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4590 |
+
<use ns4:href="#md7efaf3aec" x="274.883864" y="399.492335" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4591 |
+
<use ns4:href="#md7efaf3aec" x="364.322974" y="403.132727" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4592 |
+
<use ns4:href="#md7efaf3aec" x="453.762084" y="404.326052" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4593 |
+
<use ns4:href="#md7efaf3aec" x="543.201194" y="418.540201" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4594 |
+
<use ns4:href="#md7efaf3aec" x="632.640304" y="397.090583" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4595 |
+
<use ns4:href="#md7efaf3aec" x="722.079415" y="413.087163" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4596 |
+
<use ns4:href="#md7efaf3aec" x="811.518525" y="398.299013" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4597 |
</g>
|
| 4598 |
</g>
|
| 4599 |
<g id="series--torch-eager" class="series">
|
| 4600 |
+
<path d="M 96.005644 187.579177 L 185.444754 58.095992 L 274.883864 61.268119 L 364.322974 62.778657 L 453.762084 62.340599 L 543.201194 67.476423 L 632.640304 47.08418 L 722.079415 54.03265 L 811.518525 57.204778 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 4601 |
<defs>
|
| 4602 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4603 |
</defs>
|
| 4604 |
<g clip-path="url(#p620c7d392f)">
|
| 4605 |
+
<use ns4:href="#m9b8c54d372" x="96.005644" y="187.579177" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4606 |
+
<use ns4:href="#m9b8c54d372" x="185.444754" y="58.095992" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4607 |
+
<use ns4:href="#m9b8c54d372" x="274.883864" y="61.268119" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4608 |
+
<use ns4:href="#m9b8c54d372" x="364.322974" y="62.778657" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4609 |
+
<use ns4:href="#m9b8c54d372" x="453.762084" y="62.340599" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4610 |
+
<use ns4:href="#m9b8c54d372" x="543.201194" y="67.476423" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4611 |
+
<use ns4:href="#m9b8c54d372" x="632.640304" y="47.08418" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4612 |
+
<use ns4:href="#m9b8c54d372" x="722.079415" y="54.03265" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4613 |
+
<use ns4:href="#m9b8c54d372" x="811.518525" y="57.204778" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4614 |
</g>
|
| 4615 |
</g>
|
| 4616 |
<g id="patch_3">
|
|
|
|
| 4625 |
<g id="patch_6">
|
| 4626 |
<path d="M 60.23 26.88 L 847.294169 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
|
| 4627 |
</g>
|
| 4628 |
+
<g id="text_16">
|
| 4629 |
<text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="453.762084" y="20.88" transform="rotate(-0 453.762084 20.88)">Attention Implementation Latency</text>
|
| 4630 |
</g>
|
| 4631 |
<g id="legend" class="legend">
|
| 4632 |
<g id="patch_7">
|
| 4633 |
<path d="M 720.811356 466.37197 L 840.294169 466.37197 Q 842.294169 466.37197 842.294169 464.37197 L 842.294169 435.45947 Q 842.294169 433.45947 840.294169 433.45947 L 720.811356 433.45947 Q 718.811356 433.45947 718.811356 435.45947 L 718.811356 464.37197 Q 718.811356 466.37197 720.811356 466.37197 L 720.811356 466.37197 z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
|
| 4634 |
</g>
|
| 4635 |
+
<g id="line2d_16">
|
| 4636 |
<path d="M 722.811356 441.557908 L 732.811356 441.557908 L 742.811356 441.557908 " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 4637 |
<g>
|
| 4638 |
<use ns4:href="#md7efaf3aec" x="732.811356" y="441.557908" style="fill: #1f77b4; stroke: #1f77b4" />
|
|
|
|
| 4641 |
<g id="legend-label--hf-kernels-swiglu" class="legend">
|
| 4642 |
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="750.811356" y="445.057908" transform="rotate(-0 750.811356 445.057908)">hf_kernels_swiglu</text>
|
| 4643 |
</g>
|
| 4644 |
+
<g id="line2d_17">
|
| 4645 |
<path d="M 722.811356 456.514158 L 732.811356 456.514158 L 742.811356 456.514158 " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 4646 |
<g>
|
| 4647 |
<use ns4:href="#m9b8c54d372" x="732.811356" y="456.514158" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl
CHANGED
|
@@ -1,24 +1,24 @@
|
|
| 1 |
-
{"ts": "2025-12-
|
| 2 |
-
{"ts": "2025-12-
|
| 3 |
-
{"ts": "2025-12-
|
| 4 |
-
{"ts": "2025-12-
|
| 5 |
-
{"ts": "2025-12-
|
| 6 |
-
{"ts": "2025-12-
|
| 7 |
-
{"ts": "2025-12-
|
| 8 |
-
{"ts": "2025-12-
|
| 9 |
-
{"ts": "2025-12-
|
| 10 |
-
{"ts": "2025-12-
|
| 11 |
-
{"ts": "2025-12-
|
| 12 |
-
{"ts": "2025-12-
|
| 13 |
-
{"ts": "2025-12-
|
| 14 |
-
{"ts": "2025-12-
|
| 15 |
-
{"ts": "2025-12-
|
| 16 |
-
{"ts": "2025-12-
|
| 17 |
-
{"ts": "2025-12-
|
| 18 |
-
{"ts": "2025-12-
|
| 19 |
-
{"ts": "2025-12-
|
| 20 |
-
{"ts": "2025-12-
|
| 21 |
-
{"ts": "2025-12-
|
| 22 |
-
{"ts": "2025-12-
|
| 23 |
-
{"ts": "2025-12-
|
| 24 |
-
{"ts": "2025-12-
|
|
|
|
| 1 |
+
{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S128_W2", "batch": 2, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04374100012682902, "p50": 0.046281000095405034, "p90": 0.04699100009020185, "mean": 0.04588520009747299, "iqr": 0.0018789999103319133, "raw_times": [0.0473009999950591, 0.04699100009020185, 0.04511200017986994, 0.046281000095405034, 0.04374100012682902], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.054351000017049955, "peak_bytes": 295936, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 2 |
+
{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S128_W4", "batch": 2, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05076099978396087, "p50": 0.05160099999557133, "p90": 0.0517210000907653, "mean": 0.05158899998605193, "iqr": 0.00034000004234258085, "raw_times": [0.05076099978396087, 0.052481000011539436, 0.05138100004842272, 0.0517210000907653, 0.05160099999557133], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.06265199999688775, "peak_bytes": 296448, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 3 |
+
{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S512_W2", "batch": 2, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04984099996363511, "p50": 0.05232199987403874, "p90": 0.05389100010688708, "mean": 0.05356520000532328, "iqr": 0.0016999999843392288, "raw_times": [0.04984099996363511, 0.05389100010688708, 0.05232199987403874, 0.05958099995950761, 0.05219100012254785], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05472199995892879, "peak_bytes": 1180672, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 4 |
+
{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S512_W4", "batch": 2, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.050031999990096665, "p50": 0.0509510000483715, "p90": 0.0512909998633404, "mean": 0.05097519997434574, "iqr": 0.0009099996987060877, "raw_times": [0.050381000164634315, 0.0512909998633404, 0.05222099980528583, 0.0509510000483715, 0.050031999990096665], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05585200005953084, "peak_bytes": 1181184, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 5 |
+
{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S2048_W2", "batch": 2, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.048549999974056846, "p50": 0.04953099983140419, "p90": 0.04999000020688982, "mean": 0.04990460001863539, "iqr": 0.0006790000952605624, "raw_times": [0.04953099983140419, 0.052140999969196855, 0.04999000020688982, 0.049311000111629255, 0.048549999974056846], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.054900999884921475, "peak_bytes": 4719616, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 6 |
+
{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S2048_W4", "batch": 2, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04891100002168969, "p50": 0.04958099998475518, "p90": 0.05115099997965444, "mean": 0.05000500000278407, "iqr": 0.002170000016121776, "raw_times": [0.04891100002168969, 0.05140100006428838, 0.04958099998475518, 0.05115099997965444, 0.04898099996353267], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05376100011744711, "peak_bytes": 4720128, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 7 |
+
{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S128_W2", "batch": 2, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04850100003750413, "p50": 0.04982199993719405, "p90": 0.04984099996363511, "mean": 0.04968119997101894, "iqr": 0.0003499999365885742, "raw_times": [0.04850100003750413, 0.04984099996363511, 0.04949100002704654, 0.050750999889714876, 0.04982199993719405], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.054501000022355583, "peak_bytes": 9461760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 8 |
+
{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S128_W4", "batch": 2, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.049300999990009586, "p50": 0.04983100006938912, "p90": 0.050170999884358025, "mean": 0.05779919997621619, "iqr": 0.0004900000476482091, "raw_times": [0.09001200010061439, 0.04983100006938912, 0.049680999836709816, 0.050170999884358025, 0.049300999990009586], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.052752000101463636, "peak_bytes": 9478144, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 9 |
+
{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S512_W2", "batch": 2, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.047761000132595655, "p50": 0.04934100002174091, "p90": 0.049350999915986904, "mean": 0.0488690000565839, "iqr": 0.0008099998467514524, "raw_times": [0.04854100006923545, 0.04935100014336058, 0.049350999915986904, 0.04934100002174091, 0.047761000132595655], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.052481000011539436, "peak_bytes": 37773312, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 10 |
+
{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S512_W4", "batch": 2, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.048450999884153134, "p50": 0.04965099992659816, "p90": 0.050621000127648585, "mean": 0.05171900002096663, "iqr": 0.0015900000107649248, "raw_times": [0.048450999884153134, 0.04903100011688366, 0.04965099992659816, 0.050621000127648585, 0.06084100004954962], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.053670999932364793, "peak_bytes": 37789696, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 11 |
+
{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S2048_W2", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.047580999989804695, "p50": 0.0487410000005184, "p90": 0.050201000021843356, "mean": 0.04913100001431303, "iqr": 0.002320000021427404, "raw_times": [0.047580999989804695, 0.050201000021843356, 0.051251000058982754, 0.04788100000041595, 0.0487410000005184], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.054761000001235516, "peak_bytes": 151019520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 12 |
+
{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S2048_W4", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.049270999852524255, "p50": 0.049540999953023857, "p90": 0.04967099994246382, "mean": 0.049752999939300935, "iqr": 0.0003000000106112566, "raw_times": [0.049370999931852566, 0.049540999953023857, 0.049270999852524255, 0.05091100001664017, 0.04967099994246382], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.053771000011693104, "peak_bytes": 151035904, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 13 |
+
{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S128_W2", "batch": 4, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04857099997934711, "p50": 0.04948099990542687, "p90": 0.049690999958329485, "mean": 0.04942899995512562, "iqr": 0.0004900000476482091, "raw_times": [0.04857099997934711, 0.04948099990542687, 0.049690999958329485, 0.049200999910681276, 0.050201000021843356], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05436199990072055, "peak_bytes": 33727488, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 14 |
+
{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S128_W4", "batch": 4, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.048160999995161546, "p50": 0.049621000016486505, "p90": 0.05022099981033534, "mean": 0.04960719993505336, "iqr": 0.0007689998255955288, "raw_times": [0.048160999995161546, 0.049621000016486505, 0.05022099981033534, 0.049451999984739814, 0.050580999868543586], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.053051000122650294, "peak_bytes": 591360, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 15 |
+
{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S512_W2", "batch": 4, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04980199992132839, "p50": 0.05055099995843193, "p90": 0.050551000185805606, "mean": 0.05498319997059298, "iqr": 0.00047000025915622246, "raw_times": [0.07393099986074958, 0.05008099992664938, 0.05055099995843193, 0.050551000185805606, 0.04980199992132839], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.053621999995812075, "peak_bytes": 2360320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 16 |
+
{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S512_W4", "batch": 4, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.048960999947667005, "p50": 0.05131100010657974, "p90": 0.052661000154330395, "mean": 0.051906999988204916, "iqr": 0.0029200002700235927, "raw_times": [0.048960999947667005, 0.052661000154330395, 0.056860999848140636, 0.0497409998843068, 0.05131100010657974], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.053932000128043, "peak_bytes": 2360832, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 17 |
+
{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S2048_W2", "batch": 4, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04804100012734125, "p50": 0.04966200003764243, "p90": 0.05004099989491806, "mean": 0.04977120001967705, "iqr": 0.0007399999049084727, "raw_times": [0.04804100012734125, 0.05181100004847394, 0.04966200003764243, 0.05004099989491806, 0.049300999990009586], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05391199988480366, "peak_bytes": 9438208, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 18 |
+
{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S2048_W4", "batch": 4, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.047730999995110324, "p50": 0.04978099991603813, "p90": 0.049860999979500775, "mean": 0.04935500001010951, "iqr": 0.0009599998520570807, "raw_times": [0.047730999995110324, 0.04978099991603813, 0.05050100003245461, 0.048901000127443695, 0.049860999979500775], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0537809999059391, "peak_bytes": 9438720, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 19 |
+
{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S128_W2", "batch": 4, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.049270999852524255, "p50": 0.05024100005357468, "p90": 0.05084099984742352, "mean": 0.050164999993285164, "iqr": 0.0012699997569143306, "raw_times": [0.049270999852524255, 0.05024100005357468, 0.05084099984742352, 0.05090100012239418, 0.04957100009050919], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05270199994811264, "peak_bytes": 18931712, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 20 |
+
{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S128_W4", "batch": 4, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04698099996858218, "p50": 0.049690999958329485, "p90": 0.04984100019100879, "mean": 0.049111000043922104, "iqr": 0.00083000008999079, "raw_times": [0.04698099996858218, 0.04984100019100879, 0.050031000000672066, 0.049011000101018, 0.049690999958329485], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05244099997980811, "peak_bytes": 18948096, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 21 |
+
{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S512_W2", "batch": 4, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04767199993693794, "p50": 0.049621000016486505, "p90": 0.04999099996894074, "mean": 0.049529399984749034, "iqr": 0.0007789999472151976, "raw_times": [0.04767199993693794, 0.049621000016486505, 0.05115099997965444, 0.04999099996894074, 0.049212000021725544], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05385099984778208, "peak_bytes": 75522048, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 22 |
+
{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S512_W4", "batch": 4, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04878100003224972, "p50": 0.049651000153971836, "p90": 0.050100999942515045, "mean": 0.04970100003447442, "iqr": 0.0005699998837371822, "raw_times": [0.04878100003224972, 0.050440999984857626, 0.049651000153971836, 0.04953100005877786, 0.050100999942515045], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.054691000059392536, "peak_bytes": 75538432, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 23 |
+
{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S2048_W2", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04801099998985592, "p50": 0.049041000011129654, "p90": 0.04916099987894995, "mean": 0.04916500001854729, "iqr": 0.00038999974094622303, "raw_times": [0.04801099998985592, 0.050841000074797194, 0.04916099987894995, 0.04877100013800373, 0.049041000011129654], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05514100007530942, "peak_bytes": 302014464, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 24 |
+
{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S2048_W4", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04843099986828747, "p50": 0.04957099986313551, "p90": 0.04984099996363511, "mean": 0.04942899995512562, "iqr": 0.00046999980440887157, "raw_times": [0.04843099986828747, 0.04937100015922624, 0.049930999921343755, 0.04957099986313551, 0.04984099996363511], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05356199994821509, "peak_bytes": 302030848, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
causal_conv1d/impls/hf_kernels_causal_conv1d.html
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
causal_conv1d/impls/torch_causal_conv1d.html
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
causal_conv1d/results/artifacts/combine/latency.svg
CHANGED
|
|
Git LFS Details
|
|
|
Git LFS Details
|
causal_conv1d/results/combined_results.html
CHANGED
|
@@ -3889,7 +3889,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 3889 |
<rdf:RDF>
|
| 3890 |
<ns2:Work>
|
| 3891 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 3892 |
-
<dc:date>2025-12-
|
| 3893 |
<dc:format>image/svg+xml</dc:format>
|
| 3894 |
<dc:creator>
|
| 3895 |
<ns2:Agent>
|
|
@@ -4233,70 +4233,70 @@ body[data-tool="eraser"] .main-content {
|
|
| 4233 |
<g id="matplotlib.axis_2">
|
| 4234 |
<g id="ytick_1">
|
| 4235 |
<g id="grid-y--2" class="grid grid-y">
|
| 4236 |
-
<path d="M 47.72 375.
|
| 4237 |
</g>
|
| 4238 |
<g id="line2d_25">
|
| 4239 |
<defs>
|
| 4240 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4241 |
</defs>
|
| 4242 |
<g>
|
| 4243 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="375.
|
| 4244 |
</g>
|
| 4245 |
</g>
|
| 4246 |
<g id="text_25">
|
| 4247 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="379.
|
| 4248 |
</g>
|
| 4249 |
</g>
|
| 4250 |
<g id="ytick_2">
|
| 4251 |
<g id="grid-y--3" class="grid grid-y">
|
| 4252 |
-
<path d="M 47.72
|
| 4253 |
</g>
|
| 4254 |
<g id="line2d_26">
|
| 4255 |
<g>
|
| 4256 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="
|
| 4257 |
</g>
|
| 4258 |
</g>
|
| 4259 |
<g id="text_26">
|
| 4260 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="296.
|
| 4261 |
</g>
|
| 4262 |
</g>
|
| 4263 |
<g id="ytick_3">
|
| 4264 |
<g id="grid-y--4" class="grid grid-y">
|
| 4265 |
-
<path d="M 47.72 210.
|
| 4266 |
</g>
|
| 4267 |
<g id="line2d_27">
|
| 4268 |
<g>
|
| 4269 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="210.
|
| 4270 |
</g>
|
| 4271 |
</g>
|
| 4272 |
<g id="text_27">
|
| 4273 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="214.
|
| 4274 |
</g>
|
| 4275 |
</g>
|
| 4276 |
<g id="ytick_4">
|
| 4277 |
<g id="grid-y--5" class="grid grid-y">
|
| 4278 |
-
<path d="M 47.72 127.
|
| 4279 |
</g>
|
| 4280 |
<g id="line2d_28">
|
| 4281 |
<g>
|
| 4282 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="127.
|
| 4283 |
</g>
|
| 4284 |
</g>
|
| 4285 |
<g id="text_28">
|
| 4286 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="131.
|
| 4287 |
</g>
|
| 4288 |
</g>
|
| 4289 |
<g id="ytick_5">
|
| 4290 |
<g id="grid-y--6" class="grid grid-y">
|
| 4291 |
-
<path d="M 47.72 45.
|
| 4292 |
</g>
|
| 4293 |
<g id="line2d_29">
|
| 4294 |
<g>
|
| 4295 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="45.
|
| 4296 |
</g>
|
| 4297 |
</g>
|
| 4298 |
<g id="text_29">
|
| 4299 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="
|
| 4300 |
</g>
|
| 4301 |
</g>
|
| 4302 |
<g id="label--y" class="ylabel">
|
|
@@ -4304,66 +4304,66 @@ body[data-tool="eraser"] .main-content {
|
|
| 4304 |
</g>
|
| 4305 |
</g>
|
| 4306 |
<g id="series--hf-kernels-causal-conv1d" class="series">
|
| 4307 |
-
<path d="M 83.325193 420.186871 L 114.286231
|
| 4308 |
<defs>
|
| 4309 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4310 |
</defs>
|
| 4311 |
<g clip-path="url(#pb49fc4c8d2)">
|
| 4312 |
<use ns4:href="#md7efaf3aec" x="83.325193" y="420.186871" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4313 |
-
<use ns4:href="#md7efaf3aec" x="114.286231" y="
|
| 4314 |
-
<use ns4:href="#md7efaf3aec" x="145.247268" y="
|
| 4315 |
-
<use ns4:href="#md7efaf3aec" x="176.208306" y="
|
| 4316 |
-
<use ns4:href="#md7efaf3aec" x="207.169343" y="
|
| 4317 |
-
<use ns4:href="#md7efaf3aec" x="238.130381" y="
|
| 4318 |
-
<use ns4:href="#md7efaf3aec" x="269.091418" y="
|
| 4319 |
-
<use ns4:href="#md7efaf3aec" x="300.052455" y="
|
| 4320 |
-
<use ns4:href="#md7efaf3aec" x="331.013493" y="
|
| 4321 |
-
<use ns4:href="#md7efaf3aec" x="361.97453" y="
|
| 4322 |
-
<use ns4:href="#md7efaf3aec" x="392.935568" y="
|
| 4323 |
-
<use ns4:href="#md7efaf3aec" x="423.896605" y="
|
| 4324 |
-
<use ns4:href="#md7efaf3aec" x="454.857643" y="
|
| 4325 |
-
<use ns4:href="#md7efaf3aec" x="485.81868" y="
|
| 4326 |
-
<use ns4:href="#md7efaf3aec" x="516.779718" y="
|
| 4327 |
-
<use ns4:href="#md7efaf3aec" x="547.740755" y="
|
| 4328 |
-
<use ns4:href="#md7efaf3aec" x="578.701793" y="
|
| 4329 |
-
<use ns4:href="#md7efaf3aec" x="609.66283" y="
|
| 4330 |
-
<use ns4:href="#md7efaf3aec" x="640.623868" y="
|
| 4331 |
-
<use ns4:href="#md7efaf3aec" x="671.584905" y="
|
| 4332 |
-
<use ns4:href="#md7efaf3aec" x="702.545943" y="
|
| 4333 |
-
<use ns4:href="#md7efaf3aec" x="733.50698" y="
|
| 4334 |
-
<use ns4:href="#md7efaf3aec" x="764.468018" y="
|
| 4335 |
-
<use ns4:href="#md7efaf3aec" x="795.429055" y="
|
| 4336 |
</g>
|
| 4337 |
</g>
|
| 4338 |
<g id="series--torch-eager" class="series">
|
| 4339 |
-
<path d="M 83.325193
|
| 4340 |
<defs>
|
| 4341 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4342 |
</defs>
|
| 4343 |
<g clip-path="url(#pb49fc4c8d2)">
|
| 4344 |
-
<use ns4:href="#m9b8c54d372" x="83.325193" y="
|
| 4345 |
-
<use ns4:href="#m9b8c54d372" x="114.286231" y="
|
| 4346 |
-
<use ns4:href="#m9b8c54d372" x="145.247268" y="
|
| 4347 |
-
<use ns4:href="#m9b8c54d372" x="176.208306" y="
|
| 4348 |
-
<use ns4:href="#m9b8c54d372" x="207.169343" y="
|
| 4349 |
-
<use ns4:href="#m9b8c54d372" x="238.130381" y="
|
| 4350 |
-
<use ns4:href="#m9b8c54d372" x="269.091418" y="390.
|
| 4351 |
-
<use ns4:href="#m9b8c54d372" x="300.052455" y="
|
| 4352 |
-
<use ns4:href="#m9b8c54d372" x="331.013493" y="
|
| 4353 |
-
<use ns4:href="#m9b8c54d372" x="361.97453" y="
|
| 4354 |
-
<use ns4:href="#m9b8c54d372" x="392.935568" y="
|
| 4355 |
-
<use ns4:href="#m9b8c54d372" x="423.896605" y="
|
| 4356 |
-
<use ns4:href="#m9b8c54d372" x="454.857643" y="
|
| 4357 |
-
<use ns4:href="#m9b8c54d372" x="485.81868" y="
|
| 4358 |
-
<use ns4:href="#m9b8c54d372" x="516.779718" y="
|
| 4359 |
-
<use ns4:href="#m9b8c54d372" x="547.740755" y="
|
| 4360 |
-
<use ns4:href="#m9b8c54d372" x="578.701793" y="
|
| 4361 |
-
<use ns4:href="#m9b8c54d372" x="609.66283" y="
|
| 4362 |
-
<use ns4:href="#m9b8c54d372" x="640.623868" y="
|
| 4363 |
-
<use ns4:href="#m9b8c54d372" x="671.584905" y="
|
| 4364 |
-
<use ns4:href="#m9b8c54d372" x="702.545943" y="
|
| 4365 |
-
<use ns4:href="#m9b8c54d372" x="733.50698" y="
|
| 4366 |
-
<use ns4:href="#m9b8c54d372" x="764.468018" y="
|
| 4367 |
<use ns4:href="#m9b8c54d372" x="795.429055" y="45.608899" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4368 |
</g>
|
| 4369 |
</g>
|
|
@@ -4422,7 +4422,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 4422 |
<span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
|
| 4423 |
<span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4424 |
</span> |
|
| 4425 |
-
Cell: combine | 4.
|
| 4426 |
| <button class="run-btn" onclick="runCell('combine')">▶ run</button>
|
| 4427 |
<button class="copy-btn" onclick="copyCell('combine')">Copy</button>
|
| 4428 |
<a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -4547,7 +4547,7 @@ torch_eager cuda_B2_D64_S512_W2 0.08 True
|
|
| 4547 |
torch_eager cuda_B2_D64_S512_W4 0.08 True
|
| 4548 |
torch_eager cuda_B4_D2048_S128_W2 0.08 True
|
| 4549 |
torch_eager cuda_B4_D2048_S128_W4 0.08 True
|
| 4550 |
-
torch_eager cuda_B4_D2048_S2048_W2 0.
|
| 4551 |
torch_eager cuda_B4_D2048_S2048_W4 0.50 True
|
| 4552 |
torch_eager cuda_B4_D2048_S512_W2 0.09 True
|
| 4553 |
torch_eager cuda_B4_D2048_S512_W4 0.10 True
|
|
@@ -4576,7 +4576,7 @@ Implementations included:
|
|
| 4576 |
<div class="uv-install-logs" id="uv-logs-combine">
|
| 4577 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4578 |
<div class="uv-logs-content" style="display: none;">
|
| 4579 |
-
Installed 37 packages in
|
| 4580 |
</div>
|
| 4581 |
</div>
|
| 4582 |
<div class="cell-artifacts">
|
|
@@ -4589,7 +4589,7 @@ Installed 37 packages in 204ms
|
|
| 4589 |
<rdf:RDF>
|
| 4590 |
<ns2:Work>
|
| 4591 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 4592 |
-
<dc:date>2025-12-
|
| 4593 |
<dc:format>image/svg+xml</dc:format>
|
| 4594 |
<dc:creator>
|
| 4595 |
<ns2:Agent>
|
|
@@ -4933,70 +4933,70 @@ Installed 37 packages in 204ms
|
|
| 4933 |
<g id="matplotlib.axis_2">
|
| 4934 |
<g id="ytick_1">
|
| 4935 |
<g id="grid-y--2" class="grid grid-y">
|
| 4936 |
-
<path d="M 47.72 375.
|
| 4937 |
</g>
|
| 4938 |
<g id="line2d_25">
|
| 4939 |
<defs>
|
| 4940 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4941 |
</defs>
|
| 4942 |
<g>
|
| 4943 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="375.
|
| 4944 |
</g>
|
| 4945 |
</g>
|
| 4946 |
<g id="text_25">
|
| 4947 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="379.
|
| 4948 |
</g>
|
| 4949 |
</g>
|
| 4950 |
<g id="ytick_2">
|
| 4951 |
<g id="grid-y--3" class="grid grid-y">
|
| 4952 |
-
<path d="M 47.72
|
| 4953 |
</g>
|
| 4954 |
<g id="line2d_26">
|
| 4955 |
<g>
|
| 4956 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="
|
| 4957 |
</g>
|
| 4958 |
</g>
|
| 4959 |
<g id="text_26">
|
| 4960 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="296.
|
| 4961 |
</g>
|
| 4962 |
</g>
|
| 4963 |
<g id="ytick_3">
|
| 4964 |
<g id="grid-y--4" class="grid grid-y">
|
| 4965 |
-
<path d="M 47.72 210.
|
| 4966 |
</g>
|
| 4967 |
<g id="line2d_27">
|
| 4968 |
<g>
|
| 4969 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="210.
|
| 4970 |
</g>
|
| 4971 |
</g>
|
| 4972 |
<g id="text_27">
|
| 4973 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="214.
|
| 4974 |
</g>
|
| 4975 |
</g>
|
| 4976 |
<g id="ytick_4">
|
| 4977 |
<g id="grid-y--5" class="grid grid-y">
|
| 4978 |
-
<path d="M 47.72 127.
|
| 4979 |
</g>
|
| 4980 |
<g id="line2d_28">
|
| 4981 |
<g>
|
| 4982 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="127.
|
| 4983 |
</g>
|
| 4984 |
</g>
|
| 4985 |
<g id="text_28">
|
| 4986 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="131.
|
| 4987 |
</g>
|
| 4988 |
</g>
|
| 4989 |
<g id="ytick_5">
|
| 4990 |
<g id="grid-y--6" class="grid grid-y">
|
| 4991 |
-
<path d="M 47.72 45.
|
| 4992 |
</g>
|
| 4993 |
<g id="line2d_29">
|
| 4994 |
<g>
|
| 4995 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="45.
|
| 4996 |
</g>
|
| 4997 |
</g>
|
| 4998 |
<g id="text_29">
|
| 4999 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="
|
| 5000 |
</g>
|
| 5001 |
</g>
|
| 5002 |
<g id="label--y" class="ylabel">
|
|
@@ -5004,66 +5004,66 @@ Installed 37 packages in 204ms
|
|
| 5004 |
</g>
|
| 5005 |
</g>
|
| 5006 |
<g id="series--hf-kernels-causal-conv1d" class="series">
|
| 5007 |
-
<path d="M 83.325193 420.186871 L 114.286231
|
| 5008 |
<defs>
|
| 5009 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 5010 |
</defs>
|
| 5011 |
<g clip-path="url(#pb49fc4c8d2)">
|
| 5012 |
<use ns4:href="#md7efaf3aec" x="83.325193" y="420.186871" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5013 |
-
<use ns4:href="#md7efaf3aec" x="114.286231" y="
|
| 5014 |
-
<use ns4:href="#md7efaf3aec" x="145.247268" y="
|
| 5015 |
-
<use ns4:href="#md7efaf3aec" x="176.208306" y="
|
| 5016 |
-
<use ns4:href="#md7efaf3aec" x="207.169343" y="
|
| 5017 |
-
<use ns4:href="#md7efaf3aec" x="238.130381" y="
|
| 5018 |
-
<use ns4:href="#md7efaf3aec" x="269.091418" y="
|
| 5019 |
-
<use ns4:href="#md7efaf3aec" x="300.052455" y="
|
| 5020 |
-
<use ns4:href="#md7efaf3aec" x="331.013493" y="
|
| 5021 |
-
<use ns4:href="#md7efaf3aec" x="361.97453" y="
|
| 5022 |
-
<use ns4:href="#md7efaf3aec" x="392.935568" y="
|
| 5023 |
-
<use ns4:href="#md7efaf3aec" x="423.896605" y="
|
| 5024 |
-
<use ns4:href="#md7efaf3aec" x="454.857643" y="
|
| 5025 |
-
<use ns4:href="#md7efaf3aec" x="485.81868" y="
|
| 5026 |
-
<use ns4:href="#md7efaf3aec" x="516.779718" y="
|
| 5027 |
-
<use ns4:href="#md7efaf3aec" x="547.740755" y="
|
| 5028 |
-
<use ns4:href="#md7efaf3aec" x="578.701793" y="
|
| 5029 |
-
<use ns4:href="#md7efaf3aec" x="609.66283" y="
|
| 5030 |
-
<use ns4:href="#md7efaf3aec" x="640.623868" y="
|
| 5031 |
-
<use ns4:href="#md7efaf3aec" x="671.584905" y="
|
| 5032 |
-
<use ns4:href="#md7efaf3aec" x="702.545943" y="
|
| 5033 |
-
<use ns4:href="#md7efaf3aec" x="733.50698" y="
|
| 5034 |
-
<use ns4:href="#md7efaf3aec" x="764.468018" y="
|
| 5035 |
-
<use ns4:href="#md7efaf3aec" x="795.429055" y="
|
| 5036 |
</g>
|
| 5037 |
</g>
|
| 5038 |
<g id="series--torch-eager" class="series">
|
| 5039 |
-
<path d="M 83.325193
|
| 5040 |
<defs>
|
| 5041 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 5042 |
</defs>
|
| 5043 |
<g clip-path="url(#pb49fc4c8d2)">
|
| 5044 |
-
<use ns4:href="#m9b8c54d372" x="83.325193" y="
|
| 5045 |
-
<use ns4:href="#m9b8c54d372" x="114.286231" y="
|
| 5046 |
-
<use ns4:href="#m9b8c54d372" x="145.247268" y="
|
| 5047 |
-
<use ns4:href="#m9b8c54d372" x="176.208306" y="
|
| 5048 |
-
<use ns4:href="#m9b8c54d372" x="207.169343" y="
|
| 5049 |
-
<use ns4:href="#m9b8c54d372" x="238.130381" y="
|
| 5050 |
-
<use ns4:href="#m9b8c54d372" x="269.091418" y="390.
|
| 5051 |
-
<use ns4:href="#m9b8c54d372" x="300.052455" y="
|
| 5052 |
-
<use ns4:href="#m9b8c54d372" x="331.013493" y="
|
| 5053 |
-
<use ns4:href="#m9b8c54d372" x="361.97453" y="
|
| 5054 |
-
<use ns4:href="#m9b8c54d372" x="392.935568" y="
|
| 5055 |
-
<use ns4:href="#m9b8c54d372" x="423.896605" y="
|
| 5056 |
-
<use ns4:href="#m9b8c54d372" x="454.857643" y="
|
| 5057 |
-
<use ns4:href="#m9b8c54d372" x="485.81868" y="
|
| 5058 |
-
<use ns4:href="#m9b8c54d372" x="516.779718" y="
|
| 5059 |
-
<use ns4:href="#m9b8c54d372" x="547.740755" y="
|
| 5060 |
-
<use ns4:href="#m9b8c54d372" x="578.701793" y="
|
| 5061 |
-
<use ns4:href="#m9b8c54d372" x="609.66283" y="
|
| 5062 |
-
<use ns4:href="#m9b8c54d372" x="640.623868" y="
|
| 5063 |
-
<use ns4:href="#m9b8c54d372" x="671.584905" y="
|
| 5064 |
-
<use ns4:href="#m9b8c54d372" x="702.545943" y="
|
| 5065 |
-
<use ns4:href="#m9b8c54d372" x="733.50698" y="
|
| 5066 |
-
<use ns4:href="#m9b8c54d372" x="764.468018" y="
|
| 5067 |
<use ns4:href="#m9b8c54d372" x="795.429055" y="45.608899" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5068 |
</g>
|
| 5069 |
</g>
|
|
|
|
| 3889 |
<rdf:RDF>
|
| 3890 |
<ns2:Work>
|
| 3891 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 3892 |
+
<dc:date>2025-12-19T23:02:31.637981</dc:date>
|
| 3893 |
<dc:format>image/svg+xml</dc:format>
|
| 3894 |
<dc:creator>
|
| 3895 |
<ns2:Agent>
|
|
|
|
| 4233 |
<g id="matplotlib.axis_2">
|
| 4234 |
<g id="ytick_1">
|
| 4235 |
<g id="grid-y--2" class="grid grid-y">
|
| 4236 |
+
<path d="M 47.72 375.771468 L 831.034248 375.771468 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4237 |
</g>
|
| 4238 |
<g id="line2d_25">
|
| 4239 |
<defs>
|
| 4240 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4241 |
</defs>
|
| 4242 |
<g>
|
| 4243 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="375.771468" style="stroke: #000000; stroke-width: 0.8" />
|
| 4244 |
</g>
|
| 4245 |
</g>
|
| 4246 |
<g id="text_25">
|
| 4247 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="379.570687" transform="rotate(-0 40.72 379.570687)">0.1</text>
|
| 4248 |
</g>
|
| 4249 |
</g>
|
| 4250 |
<g id="ytick_2">
|
| 4251 |
<g id="grid-y--3" class="grid grid-y">
|
| 4252 |
+
<path d="M 47.72 293.090475 L 831.034248 293.090475 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4253 |
</g>
|
| 4254 |
<g id="line2d_26">
|
| 4255 |
<g>
|
| 4256 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="293.090475" style="stroke: #000000; stroke-width: 0.8" />
|
| 4257 |
</g>
|
| 4258 |
</g>
|
| 4259 |
<g id="text_26">
|
| 4260 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="296.889693" transform="rotate(-0 40.72 296.889693)">0.2</text>
|
| 4261 |
</g>
|
| 4262 |
</g>
|
| 4263 |
<g id="ytick_3">
|
| 4264 |
<g id="grid-y--4" class="grid grid-y">
|
| 4265 |
+
<path d="M 47.72 210.409481 L 831.034248 210.409481 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4266 |
</g>
|
| 4267 |
<g id="line2d_27">
|
| 4268 |
<g>
|
| 4269 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="210.409481" style="stroke: #000000; stroke-width: 0.8" />
|
| 4270 |
</g>
|
| 4271 |
</g>
|
| 4272 |
<g id="text_27">
|
| 4273 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="214.2087" transform="rotate(-0 40.72 214.2087)">0.3</text>
|
| 4274 |
</g>
|
| 4275 |
</g>
|
| 4276 |
<g id="ytick_4">
|
| 4277 |
<g id="grid-y--5" class="grid grid-y">
|
| 4278 |
+
<path d="M 47.72 127.728488 L 831.034248 127.728488 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4279 |
</g>
|
| 4280 |
<g id="line2d_28">
|
| 4281 |
<g>
|
| 4282 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="127.728488" style="stroke: #000000; stroke-width: 0.8" />
|
| 4283 |
</g>
|
| 4284 |
</g>
|
| 4285 |
<g id="text_28">
|
| 4286 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="131.527707" transform="rotate(-0 40.72 131.527707)">0.4</text>
|
| 4287 |
</g>
|
| 4288 |
</g>
|
| 4289 |
<g id="ytick_5">
|
| 4290 |
<g id="grid-y--6" class="grid grid-y">
|
| 4291 |
+
<path d="M 47.72 45.047495 L 831.034248 45.047495 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4292 |
</g>
|
| 4293 |
<g id="line2d_29">
|
| 4294 |
<g>
|
| 4295 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="45.047495" style="stroke: #000000; stroke-width: 0.8" />
|
| 4296 |
</g>
|
| 4297 |
</g>
|
| 4298 |
<g id="text_29">
|
| 4299 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="48.846713" transform="rotate(-0 40.72 48.846713)">0.5</text>
|
| 4300 |
</g>
|
| 4301 |
</g>
|
| 4302 |
<g id="label--y" class="ylabel">
|
|
|
|
| 4304 |
</g>
|
| 4305 |
</g>
|
| 4306 |
<g id="series--hf-kernels-causal-conv1d" class="series">
|
| 4307 |
+
<path d="M 83.325193 420.186871 L 114.286231 415.788242 L 145.247268 415.192112 L 176.208306 416.325668 L 207.169343 417.499739 L 238.130381 417.458398 L 269.091418 417.259137 L 300.052455 417.251695 L 331.013493 417.656832 L 361.97453 417.400521 L 392.935568 418.152918 L 423.896605 417.49147 L 454.857643 417.541079 L 485.81868 417.425325 L 516.779718 416.656392 L 547.740755 416.028017 L 578.701793 417.391426 L 609.66283 417.293036 L 640.623868 416.912703 L 671.584905 417.367449 L 702.545943 417.425325 L 733.50698 417.400521 L 764.468018 417.904875 L 795.429055 417.466666 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 4308 |
<defs>
|
| 4309 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4310 |
</defs>
|
| 4311 |
<g clip-path="url(#pb49fc4c8d2)">
|
| 4312 |
<use ns4:href="#md7efaf3aec" x="83.325193" y="420.186871" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4313 |
+
<use ns4:href="#md7efaf3aec" x="114.286231" y="415.788242" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4314 |
+
<use ns4:href="#md7efaf3aec" x="145.247268" y="415.192112" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4315 |
+
<use ns4:href="#md7efaf3aec" x="176.208306" y="416.325668" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4316 |
+
<use ns4:href="#md7efaf3aec" x="207.169343" y="417.499739" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4317 |
+
<use ns4:href="#md7efaf3aec" x="238.130381" y="417.458398" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4318 |
+
<use ns4:href="#md7efaf3aec" x="269.091418" y="417.259137" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4319 |
+
<use ns4:href="#md7efaf3aec" x="300.052455" y="417.251695" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4320 |
+
<use ns4:href="#md7efaf3aec" x="331.013493" y="417.656832" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4321 |
+
<use ns4:href="#md7efaf3aec" x="361.97453" y="417.400521" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4322 |
+
<use ns4:href="#md7efaf3aec" x="392.935568" y="418.152918" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4323 |
+
<use ns4:href="#md7efaf3aec" x="423.896605" y="417.49147" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4324 |
+
<use ns4:href="#md7efaf3aec" x="454.857643" y="417.541079" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4325 |
+
<use ns4:href="#md7efaf3aec" x="485.81868" y="417.425325" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4326 |
+
<use ns4:href="#md7efaf3aec" x="516.779718" y="416.656392" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4327 |
+
<use ns4:href="#md7efaf3aec" x="547.740755" y="416.028017" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4328 |
+
<use ns4:href="#md7efaf3aec" x="578.701793" y="417.391426" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4329 |
+
<use ns4:href="#md7efaf3aec" x="609.66283" y="417.293036" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4330 |
+
<use ns4:href="#md7efaf3aec" x="640.623868" y="416.912703" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4331 |
+
<use ns4:href="#md7efaf3aec" x="671.584905" y="417.367449" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4332 |
+
<use ns4:href="#md7efaf3aec" x="702.545943" y="417.425325" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4333 |
+
<use ns4:href="#md7efaf3aec" x="733.50698" y="417.400521" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4334 |
+
<use ns4:href="#md7efaf3aec" x="764.468018" y="417.904875" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4335 |
+
<use ns4:href="#md7efaf3aec" x="795.429055" y="417.466666" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4336 |
</g>
|
| 4337 |
</g>
|
| 4338 |
<g id="series--torch-eager" class="series">
|
| 4339 |
+
<path d="M 83.325193 401.186778 L 114.286231 389.759438 L 145.247268 389.88346 L 176.208306 391.429594 L 207.169343 391.437862 L 238.130381 391.3064 L 269.091418 390.148866 L 300.052455 391.892608 L 331.013493 391.718978 L 361.97453 391.488298 L 392.935568 327.44112 L 423.896605 323.861033 L 454.857643 393.09975 L 485.81868 392.802099 L 516.779718 391.967848 L 547.740755 391.330377 L 578.701793 391.702442 L 609.66283 391.388254 L 640.623868 392.347353 L 671.584905 392.611933 L 702.545943 382.05357 L 733.50698 376.74545 L 764.468018 59.053655 L 795.429055 45.608899 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 4340 |
<defs>
|
| 4341 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4342 |
</defs>
|
| 4343 |
<g clip-path="url(#pb49fc4c8d2)">
|
| 4344 |
+
<use ns4:href="#m9b8c54d372" x="83.325193" y="401.186778" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4345 |
+
<use ns4:href="#m9b8c54d372" x="114.286231" y="389.759438" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4346 |
+
<use ns4:href="#m9b8c54d372" x="145.247268" y="389.88346" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4347 |
+
<use ns4:href="#m9b8c54d372" x="176.208306" y="391.429594" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4348 |
+
<use ns4:href="#m9b8c54d372" x="207.169343" y="391.437862" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4349 |
+
<use ns4:href="#m9b8c54d372" x="238.130381" y="391.3064" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4350 |
+
<use ns4:href="#m9b8c54d372" x="269.091418" y="390.148866" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4351 |
+
<use ns4:href="#m9b8c54d372" x="300.052455" y="391.892608" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4352 |
+
<use ns4:href="#m9b8c54d372" x="331.013493" y="391.718978" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4353 |
+
<use ns4:href="#m9b8c54d372" x="361.97453" y="391.488298" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4354 |
+
<use ns4:href="#m9b8c54d372" x="392.935568" y="327.44112" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4355 |
+
<use ns4:href="#m9b8c54d372" x="423.896605" y="323.861033" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4356 |
+
<use ns4:href="#m9b8c54d372" x="454.857643" y="393.09975" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4357 |
+
<use ns4:href="#m9b8c54d372" x="485.81868" y="392.802099" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4358 |
+
<use ns4:href="#m9b8c54d372" x="516.779718" y="391.967848" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4359 |
+
<use ns4:href="#m9b8c54d372" x="547.740755" y="391.330377" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4360 |
+
<use ns4:href="#m9b8c54d372" x="578.701793" y="391.702442" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4361 |
+
<use ns4:href="#m9b8c54d372" x="609.66283" y="391.388254" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4362 |
+
<use ns4:href="#m9b8c54d372" x="640.623868" y="392.347353" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4363 |
+
<use ns4:href="#m9b8c54d372" x="671.584905" y="392.611933" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4364 |
+
<use ns4:href="#m9b8c54d372" x="702.545943" y="382.05357" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4365 |
+
<use ns4:href="#m9b8c54d372" x="733.50698" y="376.74545" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4366 |
+
<use ns4:href="#m9b8c54d372" x="764.468018" y="59.053655" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4367 |
<use ns4:href="#m9b8c54d372" x="795.429055" y="45.608899" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4368 |
</g>
|
| 4369 |
</g>
|
|
|
|
| 4422 |
<span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
|
| 4423 |
<span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4424 |
</span> |
|
| 4425 |
+
Cell: combine | 4.67s
|
| 4426 |
| <button class="run-btn" onclick="runCell('combine')">▶ run</button>
|
| 4427 |
<button class="copy-btn" onclick="copyCell('combine')">Copy</button>
|
| 4428 |
<a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 4547 |
torch_eager cuda_B2_D64_S512_W4 0.08 True
|
| 4548 |
torch_eager cuda_B4_D2048_S128_W2 0.08 True
|
| 4549 |
torch_eager cuda_B4_D2048_S128_W4 0.08 True
|
| 4550 |
+
torch_eager cuda_B4_D2048_S2048_W2 0.48 True
|
| 4551 |
torch_eager cuda_B4_D2048_S2048_W4 0.50 True
|
| 4552 |
torch_eager cuda_B4_D2048_S512_W2 0.09 True
|
| 4553 |
torch_eager cuda_B4_D2048_S512_W4 0.10 True
|
|
|
|
| 4576 |
<div class="uv-install-logs" id="uv-logs-combine">
|
| 4577 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4578 |
<div class="uv-logs-content" style="display: none;">
|
| 4579 |
+
Installed 37 packages in 343ms
|
| 4580 |
</div>
|
| 4581 |
</div>
|
| 4582 |
<div class="cell-artifacts">
|
|
|
|
| 4589 |
<rdf:RDF>
|
| 4590 |
<ns2:Work>
|
| 4591 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 4592 |
+
<dc:date>2025-12-19T23:02:31.637981</dc:date>
|
| 4593 |
<dc:format>image/svg+xml</dc:format>
|
| 4594 |
<dc:creator>
|
| 4595 |
<ns2:Agent>
|
|
|
|
| 4933 |
<g id="matplotlib.axis_2">
|
| 4934 |
<g id="ytick_1">
|
| 4935 |
<g id="grid-y--2" class="grid grid-y">
|
| 4936 |
+
<path d="M 47.72 375.771468 L 831.034248 375.771468 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4937 |
</g>
|
| 4938 |
<g id="line2d_25">
|
| 4939 |
<defs>
|
| 4940 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4941 |
</defs>
|
| 4942 |
<g>
|
| 4943 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="375.771468" style="stroke: #000000; stroke-width: 0.8" />
|
| 4944 |
</g>
|
| 4945 |
</g>
|
| 4946 |
<g id="text_25">
|
| 4947 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="379.570687" transform="rotate(-0 40.72 379.570687)">0.1</text>
|
| 4948 |
</g>
|
| 4949 |
</g>
|
| 4950 |
<g id="ytick_2">
|
| 4951 |
<g id="grid-y--3" class="grid grid-y">
|
| 4952 |
+
<path d="M 47.72 293.090475 L 831.034248 293.090475 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4953 |
</g>
|
| 4954 |
<g id="line2d_26">
|
| 4955 |
<g>
|
| 4956 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="293.090475" style="stroke: #000000; stroke-width: 0.8" />
|
| 4957 |
</g>
|
| 4958 |
</g>
|
| 4959 |
<g id="text_26">
|
| 4960 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="296.889693" transform="rotate(-0 40.72 296.889693)">0.2</text>
|
| 4961 |
</g>
|
| 4962 |
</g>
|
| 4963 |
<g id="ytick_3">
|
| 4964 |
<g id="grid-y--4" class="grid grid-y">
|
| 4965 |
+
<path d="M 47.72 210.409481 L 831.034248 210.409481 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4966 |
</g>
|
| 4967 |
<g id="line2d_27">
|
| 4968 |
<g>
|
| 4969 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="210.409481" style="stroke: #000000; stroke-width: 0.8" />
|
| 4970 |
</g>
|
| 4971 |
</g>
|
| 4972 |
<g id="text_27">
|
| 4973 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="214.2087" transform="rotate(-0 40.72 214.2087)">0.3</text>
|
| 4974 |
</g>
|
| 4975 |
</g>
|
| 4976 |
<g id="ytick_4">
|
| 4977 |
<g id="grid-y--5" class="grid grid-y">
|
| 4978 |
+
<path d="M 47.72 127.728488 L 831.034248 127.728488 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4979 |
</g>
|
| 4980 |
<g id="line2d_28">
|
| 4981 |
<g>
|
| 4982 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="127.728488" style="stroke: #000000; stroke-width: 0.8" />
|
| 4983 |
</g>
|
| 4984 |
</g>
|
| 4985 |
<g id="text_28">
|
| 4986 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="131.527707" transform="rotate(-0 40.72 131.527707)">0.4</text>
|
| 4987 |
</g>
|
| 4988 |
</g>
|
| 4989 |
<g id="ytick_5">
|
| 4990 |
<g id="grid-y--6" class="grid grid-y">
|
| 4991 |
+
<path d="M 47.72 45.047495 L 831.034248 45.047495 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4992 |
</g>
|
| 4993 |
<g id="line2d_29">
|
| 4994 |
<g>
|
| 4995 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="45.047495" style="stroke: #000000; stroke-width: 0.8" />
|
| 4996 |
</g>
|
| 4997 |
</g>
|
| 4998 |
<g id="text_29">
|
| 4999 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="48.846713" transform="rotate(-0 40.72 48.846713)">0.5</text>
|
| 5000 |
</g>
|
| 5001 |
</g>
|
| 5002 |
<g id="label--y" class="ylabel">
|
|
|
|
| 5004 |
</g>
|
| 5005 |
</g>
|
| 5006 |
<g id="series--hf-kernels-causal-conv1d" class="series">
|
| 5007 |
+
<path d="M 83.325193 420.186871 L 114.286231 415.788242 L 145.247268 415.192112 L 176.208306 416.325668 L 207.169343 417.499739 L 238.130381 417.458398 L 269.091418 417.259137 L 300.052455 417.251695 L 331.013493 417.656832 L 361.97453 417.400521 L 392.935568 418.152918 L 423.896605 417.49147 L 454.857643 417.541079 L 485.81868 417.425325 L 516.779718 416.656392 L 547.740755 416.028017 L 578.701793 417.391426 L 609.66283 417.293036 L 640.623868 416.912703 L 671.584905 417.367449 L 702.545943 417.425325 L 733.50698 417.400521 L 764.468018 417.904875 L 795.429055 417.466666 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 5008 |
<defs>
|
| 5009 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 5010 |
</defs>
|
| 5011 |
<g clip-path="url(#pb49fc4c8d2)">
|
| 5012 |
<use ns4:href="#md7efaf3aec" x="83.325193" y="420.186871" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5013 |
+
<use ns4:href="#md7efaf3aec" x="114.286231" y="415.788242" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5014 |
+
<use ns4:href="#md7efaf3aec" x="145.247268" y="415.192112" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5015 |
+
<use ns4:href="#md7efaf3aec" x="176.208306" y="416.325668" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5016 |
+
<use ns4:href="#md7efaf3aec" x="207.169343" y="417.499739" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5017 |
+
<use ns4:href="#md7efaf3aec" x="238.130381" y="417.458398" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5018 |
+
<use ns4:href="#md7efaf3aec" x="269.091418" y="417.259137" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5019 |
+
<use ns4:href="#md7efaf3aec" x="300.052455" y="417.251695" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5020 |
+
<use ns4:href="#md7efaf3aec" x="331.013493" y="417.656832" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5021 |
+
<use ns4:href="#md7efaf3aec" x="361.97453" y="417.400521" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5022 |
+
<use ns4:href="#md7efaf3aec" x="392.935568" y="418.152918" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5023 |
+
<use ns4:href="#md7efaf3aec" x="423.896605" y="417.49147" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5024 |
+
<use ns4:href="#md7efaf3aec" x="454.857643" y="417.541079" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5025 |
+
<use ns4:href="#md7efaf3aec" x="485.81868" y="417.425325" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5026 |
+
<use ns4:href="#md7efaf3aec" x="516.779718" y="416.656392" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5027 |
+
<use ns4:href="#md7efaf3aec" x="547.740755" y="416.028017" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5028 |
+
<use ns4:href="#md7efaf3aec" x="578.701793" y="417.391426" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5029 |
+
<use ns4:href="#md7efaf3aec" x="609.66283" y="417.293036" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5030 |
+
<use ns4:href="#md7efaf3aec" x="640.623868" y="416.912703" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5031 |
+
<use ns4:href="#md7efaf3aec" x="671.584905" y="417.367449" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5032 |
+
<use ns4:href="#md7efaf3aec" x="702.545943" y="417.425325" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5033 |
+
<use ns4:href="#md7efaf3aec" x="733.50698" y="417.400521" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5034 |
+
<use ns4:href="#md7efaf3aec" x="764.468018" y="417.904875" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5035 |
+
<use ns4:href="#md7efaf3aec" x="795.429055" y="417.466666" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5036 |
</g>
|
| 5037 |
</g>
|
| 5038 |
<g id="series--torch-eager" class="series">
|
| 5039 |
+
<path d="M 83.325193 401.186778 L 114.286231 389.759438 L 145.247268 389.88346 L 176.208306 391.429594 L 207.169343 391.437862 L 238.130381 391.3064 L 269.091418 390.148866 L 300.052455 391.892608 L 331.013493 391.718978 L 361.97453 391.488298 L 392.935568 327.44112 L 423.896605 323.861033 L 454.857643 393.09975 L 485.81868 392.802099 L 516.779718 391.967848 L 547.740755 391.330377 L 578.701793 391.702442 L 609.66283 391.388254 L 640.623868 392.347353 L 671.584905 392.611933 L 702.545943 382.05357 L 733.50698 376.74545 L 764.468018 59.053655 L 795.429055 45.608899 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 5040 |
<defs>
|
| 5041 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 5042 |
</defs>
|
| 5043 |
<g clip-path="url(#pb49fc4c8d2)">
|
| 5044 |
+
<use ns4:href="#m9b8c54d372" x="83.325193" y="401.186778" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5045 |
+
<use ns4:href="#m9b8c54d372" x="114.286231" y="389.759438" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5046 |
+
<use ns4:href="#m9b8c54d372" x="145.247268" y="389.88346" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5047 |
+
<use ns4:href="#m9b8c54d372" x="176.208306" y="391.429594" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5048 |
+
<use ns4:href="#m9b8c54d372" x="207.169343" y="391.437862" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5049 |
+
<use ns4:href="#m9b8c54d372" x="238.130381" y="391.3064" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5050 |
+
<use ns4:href="#m9b8c54d372" x="269.091418" y="390.148866" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5051 |
+
<use ns4:href="#m9b8c54d372" x="300.052455" y="391.892608" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5052 |
+
<use ns4:href="#m9b8c54d372" x="331.013493" y="391.718978" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5053 |
+
<use ns4:href="#m9b8c54d372" x="361.97453" y="391.488298" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5054 |
+
<use ns4:href="#m9b8c54d372" x="392.935568" y="327.44112" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5055 |
+
<use ns4:href="#m9b8c54d372" x="423.896605" y="323.861033" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5056 |
+
<use ns4:href="#m9b8c54d372" x="454.857643" y="393.09975" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5057 |
+
<use ns4:href="#m9b8c54d372" x="485.81868" y="392.802099" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5058 |
+
<use ns4:href="#m9b8c54d372" x="516.779718" y="391.967848" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5059 |
+
<use ns4:href="#m9b8c54d372" x="547.740755" y="391.330377" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5060 |
+
<use ns4:href="#m9b8c54d372" x="578.701793" y="391.702442" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5061 |
+
<use ns4:href="#m9b8c54d372" x="609.66283" y="391.388254" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5062 |
+
<use ns4:href="#m9b8c54d372" x="640.623868" y="392.347353" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5063 |
+
<use ns4:href="#m9b8c54d372" x="671.584905" y="392.611933" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5064 |
+
<use ns4:href="#m9b8c54d372" x="702.545943" y="382.05357" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5065 |
+
<use ns4:href="#m9b8c54d372" x="733.50698" y="376.74545" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5066 |
+
<use ns4:href="#m9b8c54d372" x="764.468018" y="59.053655" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5067 |
<use ns4:href="#m9b8c54d372" x="795.429055" y="45.608899" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5068 |
</g>
|
| 5069 |
</g>
|
deformable_detr/impls/artifacts/benchmark/deformable_detr.jsonl
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
{"ts": "2025-12-
|
| 2 |
-
{"ts": "2025-12-
|
| 3 |
-
{"ts": "2025-12-
|
| 4 |
-
{"ts": "2025-12-
|
|
|
|
| 1 |
+
{"ts": "2025-12-19T23:02:21Z", "run": "bd3674eb0704484693460041fd14f59b", "impl": "hf_kernels_deformable_detr", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_Q100_H8_E256_L4_P4", "batch_size": 1, "num_queries": 100, "num_heads": 8, "embed_dim": 256, "num_levels": 4, "num_points": 4, "spatial_shapes": [[32, 32], [16, 16], [8, 8], [4, 4]], "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.03611099987210764, "p50": 0.037491000057343626, "p90": 0.038670999856549315, "mean": 0.03807699995377334, "iqr": 0.0014299998838396277, "raw_times": [0.04087100001015642, 0.038670999856549315, 0.03724099997270969, 0.037491000057343626, 0.03611099987210764], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04587100011121947, "peak_bytes": 2264064, "ok": true, "absmax": 7.152557373046875e-07, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax": 7.152557373046875e-07, "mae": 5.539113701047427e-08, "mse": 6.418638644407112e-15, "ref": "deformable_detr_torch"}, "err": null}
|
| 2 |
+
{"ts": "2025-12-19T23:02:21Z", "run": "bd3674eb0704484693460041fd14f59b", "impl": "hf_kernels_deformable_detr", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_Q300_H8_E256_L4_P4", "batch_size": 1, "num_queries": 300, "num_heads": 8, "embed_dim": 256, "num_levels": 4, "num_points": 4, "spatial_shapes": [[32, 32], [16, 16], [8, 8], [4, 4]], "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0427610000315326, "p50": 0.04391099992062664, "p90": 0.04453099995771481, "mean": 0.043983000023217755, "iqr": 0.0007099997674231417, "raw_times": [0.0427610000315326, 0.04453099995771481, 0.044891000015923055, 0.04382100019029167, 0.04391099992062664], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04723100005321612, "peak_bytes": 4004864, "ok": true, "absmax": 7.152557373046875e-07, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax": 7.152557373046875e-07, "mae": 5.559346050176828e-08, "mse": 6.4289483059246175e-15, "ref": "deformable_detr_torch"}, "err": null}
|
| 3 |
+
{"ts": "2025-12-19T23:02:21Z", "run": "bd3674eb0704484693460041fd14f59b", "impl": "hf_kernels_deformable_detr", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_Q100_H8_E256_L4_P4", "batch_size": 2, "num_queries": 100, "num_heads": 8, "embed_dim": 256, "num_levels": 4, "num_points": 4, "spatial_shapes": [[32, 32], [16, 16], [8, 8], [4, 4]], "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04207100005260145, "p50": 0.04387099988889531, "p90": 0.044481000031737494, "mean": 0.04371499999251682, "iqr": 0.0019200001588615123, "raw_times": [0.04387099988889531, 0.044481000031737494, 0.04559100011647388, 0.04256099987287598, 0.04207100005260145], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04643099987333699, "peak_bytes": 5459968, "ok": true, "absmax": 7.152557373046875e-07, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax": 7.152557373046875e-07, "mae": 5.555110149657594e-08, "mse": 6.418781369458724e-15, "ref": "deformable_detr_torch"}, "err": null}
|
| 4 |
+
{"ts": "2025-12-19T23:02:21Z", "run": "bd3674eb0704484693460041fd14f59b", "impl": "hf_kernels_deformable_detr", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_Q300_H8_E256_L4_P4", "batch_size": 2, "num_queries": 300, "num_heads": 8, "embed_dim": 256, "num_levels": 4, "num_points": 4, "spatial_shapes": [[32, 32], [16, 16], [8, 8], [4, 4]], "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04534100003183994, "p50": 0.04615100010596507, "p90": 0.04615100010596507, "mean": 0.045852800030843355, "iqr": 0.0007410001217067474, "raw_times": [0.04615100010596507, 0.04540999998425832, 0.04615100010596507, 0.04534100003183994, 0.04621099992618838], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04675100012718758, "peak_bytes": 8008704, "ok": true, "absmax": 7.152557373046875e-07, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax": 7.152557373046875e-07, "mae": 5.5905669427147586e-08, "mse": 6.485184940875199e-15, "ref": "deformable_detr_torch"}, "err": null}
|
deformable_detr/impls/cells/benchmark.py
CHANGED
|
@@ -4,6 +4,7 @@
|
|
| 4 |
# "numpy",
|
| 5 |
# "torch==2.8.0",
|
| 6 |
# "kernels-benchmark-tools",
|
|
|
|
| 7 |
# ]
|
| 8 |
#
|
| 9 |
# [tool.uv.sources]
|
|
@@ -12,107 +13,30 @@
|
|
| 12 |
import torch
|
| 13 |
import sys
|
| 14 |
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
|
|
|
|
| 15 |
|
|
|
|
|
|
|
| 16 |
|
| 17 |
-
|
|
|
|
| 18 |
value, spatial_shapes, level_start_index, sampling_locations, attention_weights, im2col_step=64
|
| 19 |
):
|
| 20 |
-
"""
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
# Split value tensor by levels
|
| 30 |
-
value_list = value.split([int(h * w) for h, w in spatial_shapes.tolist()], dim=1)
|
| 31 |
-
|
| 32 |
-
# Iterate through each level (can't avoid this loop easily)
|
| 33 |
-
for level_idx in range(num_levels):
|
| 34 |
-
h, w = spatial_shapes[level_idx].tolist()
|
| 35 |
-
value_level = value_list[level_idx] # (bs, h*w, num_heads, channels)
|
| 36 |
-
|
| 37 |
-
# Reshape to spatial grid: (bs, num_heads, channels, h, w)
|
| 38 |
-
value_spatial = value_level.reshape(bs, h, w, num_heads, channels).permute(0, 3, 4, 1, 2)
|
| 39 |
-
|
| 40 |
-
# Get sampling locations and weights for this level
|
| 41 |
-
# loc: (bs, num_queries, num_heads, num_points, 2)
|
| 42 |
-
loc = sampling_locations[:, :, :, level_idx, :, :]
|
| 43 |
-
# weight: (bs, num_queries, num_heads, num_points)
|
| 44 |
-
weight = attention_weights[:, :, :, level_idx, :]
|
| 45 |
-
|
| 46 |
-
# Convert normalized coordinates to pixel coordinates
|
| 47 |
-
# loc[..., 0] is x (width), loc[..., 1] is y (height)
|
| 48 |
-
x = loc[..., 0] * w - 0.5 # (bs, num_queries, num_heads, num_points)
|
| 49 |
-
y = loc[..., 1] * h - 0.5
|
| 50 |
-
|
| 51 |
-
# Get integer coordinates for bilinear interpolation
|
| 52 |
-
x0 = torch.floor(x).long()
|
| 53 |
-
y0 = torch.floor(y).long()
|
| 54 |
-
x1 = x0 + 1
|
| 55 |
-
y1 = y0 + 1
|
| 56 |
-
|
| 57 |
-
# Compute interpolation weights BEFORE clamping (important!)
|
| 58 |
-
lw = x - x0.float() # weight for x direction
|
| 59 |
-
lh = y - y0.float() # weight for y direction
|
| 60 |
-
hw = 1 - lw
|
| 61 |
-
hh = 1 - lh
|
| 62 |
-
|
| 63 |
-
# Create mask for valid sample locations
|
| 64 |
-
valid = (y > -1) & (x > -1) & (y < h) & (x < w)
|
| 65 |
-
|
| 66 |
-
# Create masks for each corner being in bounds
|
| 67 |
-
mask_tl = ((y0 >= 0) & (x0 >= 0)).unsqueeze(-1).float()
|
| 68 |
-
mask_tr = ((y0 >= 0) & (x1 <= w - 1)).unsqueeze(-1).float()
|
| 69 |
-
mask_bl = ((y1 <= h - 1) & (x0 >= 0)).unsqueeze(-1).float()
|
| 70 |
-
mask_br = ((y1 <= h - 1) & (x1 <= w - 1)).unsqueeze(-1).float()
|
| 71 |
-
|
| 72 |
-
# Clamp coordinates for safe indexing
|
| 73 |
-
x0_clamped = torch.clamp(x0, 0, w - 1)
|
| 74 |
-
x1_clamped = torch.clamp(x1, 0, w - 1)
|
| 75 |
-
y0_clamped = torch.clamp(y0, 0, h - 1)
|
| 76 |
-
y1_clamped = torch.clamp(y1, 0, h - 1)
|
| 77 |
-
|
| 78 |
-
# Bilinear interpolation weights for all 4 corners
|
| 79 |
-
w_tl = (hh * hw).unsqueeze(-1) # top-left: (bs, num_queries, num_heads, num_points, 1)
|
| 80 |
-
w_tr = (hh * lw).unsqueeze(-1) # top-right
|
| 81 |
-
w_bl = (lh * hw).unsqueeze(-1) # bottom-left
|
| 82 |
-
w_br = (lh * lw).unsqueeze(-1) # bottom-right
|
| 83 |
-
|
| 84 |
-
# Gather values from the 4 corners using advanced indexing
|
| 85 |
-
batch_idx = torch.arange(bs, device=value.device).view(bs, 1, 1, 1).expand(bs, num_queries, num_heads, num_points)
|
| 86 |
-
head_idx = torch.arange(num_heads, device=value.device).view(1, 1, num_heads, 1).expand(bs, num_queries, num_heads, num_points)
|
| 87 |
-
|
| 88 |
-
# Gather corner values with clamped indices, then apply corner masks
|
| 89 |
-
v_tl = value_spatial[batch_idx, head_idx, :, y0_clamped, x0_clamped] * mask_tl
|
| 90 |
-
v_tr = value_spatial[batch_idx, head_idx, :, y0_clamped, x1_clamped] * mask_tr
|
| 91 |
-
v_bl = value_spatial[batch_idx, head_idx, :, y1_clamped, x0_clamped] * mask_bl
|
| 92 |
-
v_br = value_spatial[batch_idx, head_idx, :, y1_clamped, x1_clamped] * mask_br
|
| 93 |
-
|
| 94 |
-
# Bilinear interpolation
|
| 95 |
-
sampled = w_tl * v_tl + w_tr * v_tr + w_bl * v_bl + w_br * v_br
|
| 96 |
-
|
| 97 |
-
# Apply valid mask (only accumulate if entire sample location is valid)
|
| 98 |
-
sampled = sampled * valid.unsqueeze(-1).float()
|
| 99 |
-
|
| 100 |
-
# Apply attention weights and sum over points
|
| 101 |
-
# weight: (bs, num_queries, num_heads, num_points)
|
| 102 |
-
# Expand weight: (bs, num_queries, num_heads, num_points, 1)
|
| 103 |
-
weighted_sampled = sampled * weight.unsqueeze(-1)
|
| 104 |
-
|
| 105 |
-
# Sum over points: (bs, num_queries, num_heads, channels)
|
| 106 |
-
output += weighted_sampled.sum(dim=3)
|
| 107 |
-
|
| 108 |
-
# Flatten last two dimensions to match kernel output
|
| 109 |
-
return output.reshape(bs, num_queries, num_heads * channels)
|
| 110 |
|
| 111 |
|
| 112 |
run_benchmark(
|
| 113 |
kernel_type=KernelTypeEnum.DEFORMABLE_DETR,
|
| 114 |
-
impl_name="
|
| 115 |
-
impl_tags={"family": "
|
| 116 |
-
impl_func=
|
| 117 |
dtype="float32",
|
| 118 |
)
|
|
|
|
| 4 |
# "numpy",
|
| 5 |
# "torch==2.8.0",
|
| 6 |
# "kernels-benchmark-tools",
|
| 7 |
+
# "kernels",
|
| 8 |
# ]
|
| 9 |
#
|
| 10 |
# [tool.uv.sources]
|
|
|
|
| 13 |
import torch
|
| 14 |
import sys
|
| 15 |
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
|
| 16 |
+
from kernels import get_kernel
|
| 17 |
|
| 18 |
+
# Load the deformable DETR kernel
|
| 19 |
+
deformable_detr = get_kernel("kernels-community/deformable-detr")
|
| 20 |
|
| 21 |
+
|
| 22 |
+
def hf_kernels_deformable_detr(
|
| 23 |
value, spatial_shapes, level_start_index, sampling_locations, attention_weights, im2col_step=64
|
| 24 |
):
|
| 25 |
+
"""HuggingFace Kernels Deformable DETR Multi-Scale Deformable Attention"""
|
| 26 |
+
return deformable_detr.ms_deform_attn_forward(
|
| 27 |
+
value=value,
|
| 28 |
+
spatial_shapes=spatial_shapes,
|
| 29 |
+
level_start_index=level_start_index,
|
| 30 |
+
sampling_loc=sampling_locations,
|
| 31 |
+
attn_weight=attention_weights,
|
| 32 |
+
im2col_step=im2col_step
|
| 33 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
|
| 35 |
|
| 36 |
run_benchmark(
|
| 37 |
kernel_type=KernelTypeEnum.DEFORMABLE_DETR,
|
| 38 |
+
impl_name="hf_kernels_deformable_detr",
|
| 39 |
+
impl_tags={"family": "hf-kernels", "backend": "cuda"},
|
| 40 |
+
impl_func=hf_kernels_deformable_detr,
|
| 41 |
dtype="float32",
|
| 42 |
)
|
deformable_detr/impls/hf_kernels_deformable_detr.html
CHANGED
|
@@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 3888 |
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3889 |
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3890 |
</span> |
|
| 3891 |
-
Cell: nv | 0.
|
| 3892 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3893 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3894 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3905,7 +3905,7 @@ Cell: nv | 0.28s
|
|
| 3905 |
</div>
|
| 3906 |
</div>
|
| 3907 |
<div id="output-nv" class="cell-output">
|
| 3908 |
-
<div class="cell-stdout"><pre class="stdout-text">Fri Dec 19
|
| 3909 |
+-----------------------------------------------------------------------------------------+
|
| 3910 |
| NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 |
|
| 3911 |
+-----------------------------------------+------------------------+----------------------+
|
|
@@ -3914,7 +3914,7 @@ Cell: nv | 0.28s
|
|
| 3914 |
| | | MIG M. |
|
| 3915 |
|=========================================+========================+======================|
|
| 3916 |
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
|
| 3917 |
-
| N/A
|
| 3918 |
| | | N/A |
|
| 3919 |
+-----------------------------------------+------------------------+----------------------+
|
| 3920 |
|
|
@@ -3938,7 +3938,7 @@ Cell: nv | 0.28s
|
|
| 3938 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3939 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3940 |
</span> |
|
| 3941 |
-
Cell: benchmark |
|
| 3942 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3943 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3944 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -4003,24 +4003,24 @@ PROFILE TRACE: hf_kernels_deformable_detr | cuda_B1_Q100_H8_E256_L4_P4
|
|
| 4003 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4004 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4005 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4006 |
-
hf_kernels_deformable_detr 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4007 |
-
hf_kernels_deformable_detr 6.
|
| 4008 |
-
_deformable_detr_57c3d32::ms_deform_attn_forward
|
| 4009 |
-
void ms_deformable_im2col_gpu_kernel<float>(int, flo... 0.00% 0.000us 0.00% 0.000us 0.000us 22.
|
| 4010 |
-
aten::zeros 0.
|
| 4011 |
-
aten::zero_ 0.
|
| 4012 |
-
aten::fill_ 1.
|
| 4013 |
-
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 4014 |
-
Activity Buffer Request
|
| 4015 |
-
aten::empty 1.
|
| 4016 |
-
cudaLaunchKernel 2.
|
| 4017 |
-
aten::view 0.
|
| 4018 |
-
aten::select
|
| 4019 |
-
aten::as_strided 0.
|
| 4020 |
-
cudaDeviceSynchronize 0.
|
| 4021 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4022 |
-
Self CPU time total: 2.
|
| 4023 |
-
Self CUDA time total: 25.
|
| 4024 |
|
| 4025 |
|
| 4026 |
|
|
@@ -4030,24 +4030,24 @@ PROFILE TRACE: hf_kernels_deformable_detr | cuda_B1_Q300_H8_E256_L4_P4
|
|
| 4030 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4031 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4032 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4033 |
-
hf_kernels_deformable_detr 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4034 |
-
hf_kernels_deformable_detr
|
| 4035 |
-
_deformable_detr_57c3d32::ms_deform_attn_forward 1.
|
| 4036 |
-
void ms_deformable_im2col_gpu_kernel<float>(int, flo... 0.00% 0.000us 0.00% 0.000us 0.000us 23.
|
| 4037 |
-
aten::zeros 0.
|
| 4038 |
-
aten::zero_ 0.
|
| 4039 |
-
aten::fill_ 1.
|
| 4040 |
-
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 4041 |
-
Activity Buffer Request
|
| 4042 |
-
aten::empty 0.
|
| 4043 |
-
cudaLaunchKernel 2.
|
| 4044 |
-
aten::view 0.
|
| 4045 |
-
aten::select 0.
|
| 4046 |
-
aten::as_strided 0.
|
| 4047 |
-
cudaDeviceSynchronize 0.27% 5.
|
| 4048 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4049 |
-
Self CPU time total: 1.
|
| 4050 |
-
Self CUDA time total: 26.
|
| 4051 |
|
| 4052 |
|
| 4053 |
|
|
@@ -4057,24 +4057,24 @@ PROFILE TRACE: hf_kernels_deformable_detr | cuda_B2_Q100_H8_E256_L4_P4
|
|
| 4057 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4058 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4059 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4060 |
-
hf_kernels_deformable_detr 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4061 |
-
hf_kernels_deformable_detr
|
| 4062 |
-
_deformable_detr_57c3d32::ms_deform_attn_forward 1.
|
| 4063 |
-
void ms_deformable_im2col_gpu_kernel<float>(int, flo... 0.00% 0.000us 0.00% 0.000us 0.000us 22.
|
| 4064 |
-
aten::zeros 0.
|
| 4065 |
-
aten::zero_ 0.
|
| 4066 |
-
aten::fill_ 1.
|
| 4067 |
-
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 4068 |
-
Activity Buffer Request
|
| 4069 |
-
aten::empty 0.
|
| 4070 |
-
cudaLaunchKernel 2.
|
| 4071 |
-
aten::view 0.
|
| 4072 |
-
aten::select 0.
|
| 4073 |
-
aten::as_strided 0.
|
| 4074 |
-
cudaDeviceSynchronize 0.
|
| 4075 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4076 |
-
Self CPU time total: 1.
|
| 4077 |
-
Self CUDA time total: 25.
|
| 4078 |
|
| 4079 |
|
| 4080 |
|
|
@@ -4084,42 +4084,42 @@ PROFILE TRACE: hf_kernels_deformable_detr | cuda_B2_Q300_H8_E256_L4_P4
|
|
| 4084 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4085 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4086 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4087 |
-
hf_kernels_deformable_detr 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4088 |
-
hf_kernels_deformable_detr
|
| 4089 |
-
_deformable_detr_57c3d32::ms_deform_attn_forward 1.
|
| 4090 |
-
void ms_deformable_im2col_gpu_kernel<float>(int, flo... 0.00% 0.000us 0.00% 0.000us 0.000us 43.
|
| 4091 |
-
aten::zeros 0.
|
| 4092 |
-
aten::zero_ 0.
|
| 4093 |
-
aten::fill_ 1.
|
| 4094 |
-
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4095 |
-
Activity Buffer Request 79.
|
| 4096 |
-
aten::empty 0.
|
| 4097 |
-
cudaLaunchKernel
|
| 4098 |
-
aten::view 0.
|
| 4099 |
-
aten::select 0.
|
| 4100 |
-
aten::as_strided 0.
|
| 4101 |
-
cudaDeviceSynchronize 0.23%
|
| 4102 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4103 |
-
Self CPU time total: 2.
|
| 4104 |
-
Self CUDA time total: 46.
|
| 4105 |
|
| 4106 |
|
| 4107 |
impl wl p50(ms) ok
|
| 4108 |
hf_kernels_deformable_detr cuda_B1_Q100_H8_E256_L4_P4 0.04 True
|
| 4109 |
-
hf_kernels_deformable_detr cuda_B1_Q300_H8_E256_L4_P4 0.
|
| 4110 |
-
hf_kernels_deformable_detr cuda_B2_Q100_H8_E256_L4_P4 0.
|
| 4111 |
hf_kernels_deformable_detr cuda_B2_Q300_H8_E256_L4_P4 0.05 True
|
| 4112 |
</pre></div>
|
| 4113 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 4114 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4115 |
<div class="uv-logs-content" style="display: none;">
|
| 4116 |
-
Installed
|
| 4117 |
</div>
|
| 4118 |
</div>
|
| 4119 |
<div class="cell-stderr">Fetching 7 files: 0%| | 0/7 [00:00<?, ?it/s]
|
| 4120 |
-
Fetching 7 files:
|
| 4121 |
-
Fetching 7 files: 71%|███████▏ | 5/7 [00:00<00:00,
|
| 4122 |
-
Fetching 7 files: 100%|██████████| 7/7 [00:00<00:00,
|
| 4123 |
<div class="cell-artifacts">
|
| 4124 |
<h4>Artifacts:</h4>
|
| 4125 |
<a href="artifacts/benchmark/deformable_detr.jsonl" class="artifact" target="_blank">deformable_detr.jsonl</a>
|
|
|
|
| 3888 |
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3889 |
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3890 |
</span> |
|
| 3891 |
+
Cell: nv | 0.25s
|
| 3892 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3893 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3894 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3905 |
</div>
|
| 3906 |
</div>
|
| 3907 |
<div id="output-nv" class="cell-output">
|
| 3908 |
+
<div class="cell-stdout"><pre class="stdout-text">Fri Dec 19 23:02:11 2025
|
| 3909 |
+-----------------------------------------------------------------------------------------+
|
| 3910 |
| NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 |
|
| 3911 |
+-----------------------------------------+------------------------+----------------------+
|
|
|
|
| 3914 |
| | | MIG M. |
|
| 3915 |
|=========================================+========================+======================|
|
| 3916 |
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
|
| 3917 |
+
| N/A 42C P0 83W / 350W | 0MiB / 46068MiB | 12% Default |
|
| 3918 |
| | | N/A |
|
| 3919 |
+-----------------------------------------+------------------------+----------------------+
|
| 3920 |
|
|
|
|
| 3938 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3939 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3940 |
</span> |
|
| 3941 |
+
Cell: benchmark | 4.69s
|
| 3942 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3943 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3944 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 4003 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4004 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4005 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4006 |
+
hf_kernels_deformable_detr 0.00% 0.000us 0.00% 0.000us 0.000us 179.167us 708.76% 179.167us 179.167us 1
|
| 4007 |
+
hf_kernels_deformable_detr 6.05% 126.291us 99.56% 2.078ms 2.078ms 0.000us 0.00% 26.335us 26.335us 1
|
| 4008 |
+
_deformable_detr_57c3d32::ms_deform_attn_forward 2.99% 62.312us 93.50% 1.951ms 650.448us 22.366us 88.48% 26.335us 8.778us 3
|
| 4009 |
+
void ms_deformable_im2col_gpu_kernel<float>(int, flo... 0.00% 0.000us 0.00% 0.000us 0.000us 22.366us 88.48% 22.366us 7.455us 3
|
| 4010 |
+
aten::zeros 0.88% 18.443us 87.81% 1.832ms 610.824us 0.000us 0.00% 3.969us 1.323us 3
|
| 4011 |
+
aten::zero_ 0.60% 12.470us 85.41% 1.782ms 594.116us 0.000us 0.00% 3.969us 1.323us 3
|
| 4012 |
+
aten::fill_ 1.40% 29.180us 84.81% 1.770ms 589.959us 2.913us 11.52% 3.969us 1.323us 3
|
| 4013 |
+
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.913us 11.52% 2.913us 0.971us 3
|
| 4014 |
+
Activity Buffer Request 81.41% 1.699ms 81.41% 1.699ms 1.699ms 1.056us 4.18% 1.056us 1.056us 1
|
| 4015 |
+
aten::empty 1.52% 31.680us 1.52% 31.680us 10.560us 0.000us 0.00% 0.000us 0.000us 3
|
| 4016 |
+
cudaLaunchKernel 2.90% 60.481us 2.90% 60.481us 10.080us 0.000us 0.00% 0.000us 0.000us 6
|
| 4017 |
+
aten::view 0.77% 16.170us 0.77% 16.170us 2.695us 0.000us 0.00% 0.000us 0.000us 6
|
| 4018 |
+
aten::select 0.87% 18.140us 1.04% 21.670us 7.223us 0.000us 0.00% 0.000us 0.000us 3
|
| 4019 |
+
aten::as_strided 0.17% 3.530us 0.17% 3.530us 1.177us 0.000us 0.00% 0.000us 0.000us 3
|
| 4020 |
+
cudaDeviceSynchronize 0.44% 9.280us 0.44% 9.280us 9.280us 0.000us 0.00% 0.000us 0.000us 1
|
| 4021 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4022 |
+
Self CPU time total: 2.087ms
|
| 4023 |
+
Self CUDA time total: 25.279us
|
| 4024 |
|
| 4025 |
|
| 4026 |
|
|
|
|
| 4030 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4031 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4032 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4033 |
+
hf_kernels_deformable_detr 0.00% 0.000us 0.00% 0.000us 0.000us 134.144us 513.10% 134.144us 134.144us 1
|
| 4034 |
+
hf_kernels_deformable_detr 4.75% 94.541us 99.73% 1.985ms 1.985ms 0.000us 0.00% 27.072us 27.072us 1
|
| 4035 |
+
_deformable_detr_57c3d32::ms_deform_attn_forward 1.59% 31.632us 94.98% 1.890ms 630.031us 23.360us 89.35% 27.072us 9.024us 3
|
| 4036 |
+
void ms_deformable_im2col_gpu_kernel<float>(int, flo... 0.00% 0.000us 0.00% 0.000us 0.000us 23.360us 89.35% 23.360us 7.787us 3
|
| 4037 |
+
aten::zeros 0.38% 7.548us 91.51% 1.821ms 607.010us 0.000us 0.00% 3.712us 1.237us 3
|
| 4038 |
+
aten::zero_ 0.42% 8.279us 90.32% 1.797ms 599.120us 0.000us 0.00% 3.712us 1.237us 3
|
| 4039 |
+
aten::fill_ 1.23% 24.533us 89.90% 1.789ms 596.360us 2.784us 10.65% 3.712us 1.237us 3
|
| 4040 |
+
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.784us 10.65% 2.784us 0.928us 3
|
| 4041 |
+
Activity Buffer Request 87.33% 1.738ms 87.33% 1.738ms 1.738ms 0.928us 3.55% 0.928us 0.928us 1
|
| 4042 |
+
aten::empty 0.81% 16.122us 0.81% 16.122us 5.374us 0.000us 0.00% 0.000us 0.000us 3
|
| 4043 |
+
cudaLaunchKernel 2.12% 42.110us 2.12% 42.110us 7.018us 0.000us 0.00% 0.000us 0.000us 6
|
| 4044 |
+
aten::view 0.47% 9.440us 0.47% 9.440us 1.573us 0.000us 0.00% 0.000us 0.000us 6
|
| 4045 |
+
aten::select 0.52% 10.420us 0.63% 12.530us 4.177us 0.000us 0.00% 0.000us 0.000us 3
|
| 4046 |
+
aten::as_strided 0.11% 2.110us 0.11% 2.110us 0.703us 0.000us 0.00% 0.000us 0.000us 3
|
| 4047 |
+
cudaDeviceSynchronize 0.27% 5.390us 0.27% 5.390us 5.390us 0.000us 0.00% 0.000us 0.000us 1
|
| 4048 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4049 |
+
Self CPU time total: 1.990ms
|
| 4050 |
+
Self CUDA time total: 26.144us
|
| 4051 |
|
| 4052 |
|
| 4053 |
|
|
|
|
| 4057 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4058 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4059 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4060 |
+
hf_kernels_deformable_detr 0.00% 0.000us 0.00% 0.000us 0.000us 132.927us 521.86% 132.927us 132.927us 1
|
| 4061 |
+
hf_kernels_deformable_detr 4.64% 88.002us 99.69% 1.889ms 1.889ms 0.000us 0.00% 26.432us 26.432us 1
|
| 4062 |
+
_deformable_detr_57c3d32::ms_deform_attn_forward 1.65% 31.271us 95.05% 1.801ms 600.270us 22.624us 88.82% 26.432us 8.811us 3
|
| 4063 |
+
void ms_deformable_im2col_gpu_kernel<float>(int, flo... 0.00% 0.000us 0.00% 0.000us 0.000us 22.624us 88.82% 22.624us 7.541us 3
|
| 4064 |
+
aten::zeros 0.45% 8.600us 91.43% 1.732ms 577.433us 0.000us 0.00% 3.808us 1.269us 3
|
| 4065 |
+
aten::zero_ 0.42% 7.879us 90.13% 1.708ms 569.182us 0.000us 0.00% 3.808us 1.269us 3
|
| 4066 |
+
aten::fill_ 1.34% 25.390us 89.71% 1.700ms 566.556us 2.848us 11.18% 3.808us 1.269us 3
|
| 4067 |
+
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.848us 11.18% 2.848us 0.949us 3
|
| 4068 |
+
Activity Buffer Request 87.00% 1.648ms 87.00% 1.648ms 1.648ms 0.960us 3.77% 0.960us 0.960us 1
|
| 4069 |
+
aten::empty 0.85% 16.152us 0.85% 16.152us 5.384us 0.000us 0.00% 0.000us 0.000us 3
|
| 4070 |
+
cudaLaunchKernel 2.16% 40.982us 2.16% 40.982us 6.830us 0.000us 0.00% 0.000us 0.000us 6
|
| 4071 |
+
aten::view 0.49% 9.259us 0.49% 9.259us 1.543us 0.000us 0.00% 0.000us 0.000us 6
|
| 4072 |
+
aten::select 0.57% 10.851us 0.68% 12.901us 4.300us 0.000us 0.00% 0.000us 0.000us 3
|
| 4073 |
+
aten::as_strided 0.11% 2.050us 0.11% 2.050us 0.683us 0.000us 0.00% 0.000us 0.000us 3
|
| 4074 |
+
cudaDeviceSynchronize 0.31% 5.790us 0.31% 5.790us 5.790us 0.000us 0.00% 0.000us 0.000us 1
|
| 4075 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4076 |
+
Self CPU time total: 1.895ms
|
| 4077 |
+
Self CUDA time total: 25.472us
|
| 4078 |
|
| 4079 |
|
| 4080 |
|
|
|
|
| 4084 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4085 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4086 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4087 |
+
hf_kernels_deformable_detr 0.00% 0.000us 0.00% 0.000us 0.000us 141.952us 303.01% 141.952us 141.952us 1
|
| 4088 |
+
hf_kernels_deformable_detr 4.29% 94.562us 99.77% 2.200ms 2.200ms 0.000us 0.00% 47.871us 47.871us 1
|
| 4089 |
+
_deformable_detr_57c3d32::ms_deform_attn_forward 1.45% 32.013us 95.49% 2.106ms 701.872us 43.744us 93.38% 47.871us 15.957us 3
|
| 4090 |
+
void ms_deformable_im2col_gpu_kernel<float>(int, flo... 0.00% 0.000us 0.00% 0.000us 0.000us 43.744us 93.38% 43.744us 14.581us 3
|
| 4091 |
+
aten::zeros 0.35% 7.690us 92.40% 2.038ms 679.194us 0.000us 0.00% 4.127us 1.376us 3
|
| 4092 |
+
aten::zero_ 0.37% 8.230us 91.34% 2.014ms 671.361us 0.000us 0.00% 4.127us 1.376us 3
|
| 4093 |
+
aten::fill_ 1.11% 24.520us 90.96% 2.006ms 668.618us 3.103us 6.62% 4.127us 1.376us 3
|
| 4094 |
+
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 3.103us 6.62% 3.103us 1.034us 3
|
| 4095 |
+
Activity Buffer Request 79.72% 1.758ms 79.72% 1.758ms 1.758ms 1.024us 2.19% 1.024us 1.024us 1
|
| 4096 |
+
aten::empty 0.72% 15.810us 0.72% 15.810us 5.270us 0.000us 0.00% 0.000us 0.000us 3
|
| 4097 |
+
cudaLaunchKernel 10.76% 237.325us 10.76% 237.325us 39.554us 0.000us 0.00% 0.000us 0.000us 6
|
| 4098 |
+
aten::view 0.42% 9.159us 0.42% 9.159us 1.527us 0.000us 0.00% 0.000us 0.000us 6
|
| 4099 |
+
aten::select 0.49% 10.790us 0.58% 12.870us 4.290us 0.000us 0.00% 0.000us 0.000us 3
|
| 4100 |
+
aten::as_strided 0.09% 2.080us 0.09% 2.080us 0.693us 0.000us 0.00% 0.000us 0.000us 3
|
| 4101 |
+
cudaDeviceSynchronize 0.23% 4.980us 0.23% 4.980us 4.980us 0.000us 0.00% 0.000us 0.000us 1
|
| 4102 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4103 |
+
Self CPU time total: 2.205ms
|
| 4104 |
+
Self CUDA time total: 46.847us
|
| 4105 |
|
| 4106 |
|
| 4107 |
impl wl p50(ms) ok
|
| 4108 |
hf_kernels_deformable_detr cuda_B1_Q100_H8_E256_L4_P4 0.04 True
|
| 4109 |
+
hf_kernels_deformable_detr cuda_B1_Q300_H8_E256_L4_P4 0.04 True
|
| 4110 |
+
hf_kernels_deformable_detr cuda_B2_Q100_H8_E256_L4_P4 0.04 True
|
| 4111 |
hf_kernels_deformable_detr cuda_B2_Q300_H8_E256_L4_P4 0.05 True
|
| 4112 |
</pre></div>
|
| 4113 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 4114 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4115 |
<div class="uv-logs-content" style="display: none;">
|
| 4116 |
+
Installed 14 packages in 12ms
|
| 4117 |
</div>
|
| 4118 |
</div>
|
| 4119 |
<div class="cell-stderr">Fetching 7 files: 0%| | 0/7 [00:00<?, ?it/s]
|
| 4120 |
+
Fetching 7 files: 29%|██▊ | 2/7 [00:00<00:00, 16.18it/s]
|
| 4121 |
+
Fetching 7 files: 71%|███████▏ | 5/7 [00:00<00:00, 9.41it/s]
|
| 4122 |
+
Fetching 7 files: 100%|██████████| 7/7 [00:00<00:00, 13.86it/s]</div>
|
| 4123 |
<div class="cell-artifacts">
|
| 4124 |
<h4>Artifacts:</h4>
|
| 4125 |
<a href="artifacts/benchmark/deformable_detr.jsonl" class="artifact" target="_blank">deformable_detr.jsonl</a>
|
deformable_detr/impls/torch_deformable_detr.html
CHANGED
|
@@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 3888 |
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3889 |
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3890 |
</span> |
|
| 3891 |
-
Cell: nv | 0.
|
| 3892 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3893 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3894 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3904,7 +3904,7 @@ Cell: nv | 0.28s
|
|
| 3904 |
</div>
|
| 3905 |
</div>
|
| 3906 |
<div id="output-nv" class="cell-output">
|
| 3907 |
-
<div class="cell-stdout"><pre class="stdout-text">Fri Dec 19
|
| 3908 |
+-----------------------------------------------------------------------------------------+
|
| 3909 |
| NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 |
|
| 3910 |
+-----------------------------------------+------------------------+----------------------+
|
|
@@ -3913,7 +3913,7 @@ Cell: nv | 0.28s
|
|
| 3913 |
| | | MIG M. |
|
| 3914 |
|=========================================+========================+======================|
|
| 3915 |
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
|
| 3916 |
-
| N/A
|
| 3917 |
| | | N/A |
|
| 3918 |
+-----------------------------------------+------------------------+----------------------+
|
| 3919 |
|
|
@@ -3935,9 +3935,9 @@ Cell: nv | 0.28s
|
|
| 3935 |
<span class="collapse-indicators">
|
| 3936 |
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
|
| 3937 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3938 |
-
<span id="uv-indicator-benchmark" style="cursor:
|
| 3939 |
</span> |
|
| 3940 |
-
Cell: benchmark |
|
| 3941 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3942 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3943 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -4077,29 +4077,29 @@ PROFILE TRACE: torch_eager | cuda_B1_Q100_H8_E256_L4_P4
|
|
| 4077 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4078 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4079 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4080 |
-
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4081 |
-
torch_eager 20.
|
| 4082 |
-
aten::index 4.
|
| 4083 |
-
aten::copy_ 4.
|
| 4084 |
-
aten::mul 5.
|
| 4085 |
-
void at::native::index_elementwise_kernel<128, 4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 236.
|
| 4086 |
-
aten::to 0.58%
|
| 4087 |
-
aten::_to_copy
|
| 4088 |
-
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4089 |
-
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 167.
|
| 4090 |
-
aten::contiguous 0.
|
| 4091 |
-
aten::clone 0.
|
| 4092 |
-
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 133.
|
| 4093 |
-
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 115.
|
| 4094 |
-
aten::__and__ 0.
|
| 4095 |
-
aten::bitwise_and 2.
|
| 4096 |
-
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 99.
|
| 4097 |
-
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 86.
|
| 4098 |
-
aten::sub 2.
|
| 4099 |
-
aten::add 1.
|
| 4100 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4101 |
-
Self CPU time total:
|
| 4102 |
-
Self CUDA time total: 1.
|
| 4103 |
|
| 4104 |
|
| 4105 |
|
|
@@ -4109,29 +4109,29 @@ PROFILE TRACE: torch_eager | cuda_B1_Q300_H8_E256_L4_P4
|
|
| 4109 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4110 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4111 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4112 |
-
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 19.
|
| 4113 |
-
torch_eager 19.
|
| 4114 |
-
aten::index 4.
|
| 4115 |
-
aten::copy_ 4.
|
| 4116 |
-
aten::mul
|
| 4117 |
-
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 267.
|
| 4118 |
-
void at::native::index_elementwise_kernel<128, 4, at... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4119 |
-
aten::to 0.
|
| 4120 |
-
aten::_to_copy 1.87%
|
| 4121 |
-
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 169.
|
| 4122 |
-
aten::contiguous 0.
|
| 4123 |
-
aten::clone 0.
|
| 4124 |
-
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 132.
|
| 4125 |
-
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 117.
|
| 4126 |
-
aten::__and__ 0.
|
| 4127 |
-
aten::bitwise_and 2.
|
| 4128 |
-
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 105.
|
| 4129 |
-
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 104.
|
| 4130 |
-
aten::add 1.62%
|
| 4131 |
-
aten::sub 2.
|
| 4132 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4133 |
-
Self CPU time total:
|
| 4134 |
-
Self CUDA time total: 1.
|
| 4135 |
|
| 4136 |
|
| 4137 |
|
|
@@ -4141,29 +4141,29 @@ PROFILE TRACE: torch_eager | cuda_B2_Q100_H8_E256_L4_P4
|
|
| 4141 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4142 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4143 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4144 |
-
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 19.
|
| 4145 |
-
torch_eager
|
| 4146 |
-
aten::index 4.
|
| 4147 |
-
aten::copy_ 4.
|
| 4148 |
-
aten::mul
|
| 4149 |
-
void at::native::index_elementwise_kernel<128, 4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 243.
|
| 4150 |
-
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4151 |
-
aten::to 0.
|
| 4152 |
-
aten::_to_copy 1.93%
|
| 4153 |
-
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4154 |
-
aten::contiguous 0.37%
|
| 4155 |
-
aten::clone 0.
|
| 4156 |
-
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 134.
|
| 4157 |
-
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 115.
|
| 4158 |
-
aten::__and__ 0.
|
| 4159 |
-
aten::bitwise_and 2.
|
| 4160 |
-
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 104.
|
| 4161 |
-
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 95.
|
| 4162 |
-
aten::add 1.
|
| 4163 |
-
aten::sub 2.
|
| 4164 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4165 |
-
Self CPU time total: 21.
|
| 4166 |
-
Self CUDA time total: 1.
|
| 4167 |
|
| 4168 |
|
| 4169 |
|
|
@@ -4173,37 +4173,43 @@ PROFILE TRACE: torch_eager | cuda_B2_Q300_H8_E256_L4_P4
|
|
| 4173 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4174 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4175 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4176 |
-
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4177 |
-
torch_eager
|
| 4178 |
-
aten::mul
|
| 4179 |
-
aten::index 4.
|
| 4180 |
-
aten::copy_
|
| 4181 |
-
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4182 |
-
void at::native::index_elementwise_kernel<128, 4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 281.
|
| 4183 |
-
aten::to 0.
|
| 4184 |
-
aten::_to_copy
|
| 4185 |
-
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4186 |
-
aten::contiguous 0.
|
| 4187 |
-
aten::clone 0.
|
| 4188 |
-
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 137.
|
| 4189 |
-
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4190 |
-
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4191 |
-
aten::add 1.
|
| 4192 |
-
aten::__and__ 0.
|
| 4193 |
-
aten::bitwise_and 2.
|
| 4194 |
-
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 108.
|
| 4195 |
-
aten::sub 2.
|
| 4196 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4197 |
-
Self CPU time total: 21.
|
| 4198 |
-
Self CUDA time total: 1.
|
| 4199 |
|
| 4200 |
|
| 4201 |
impl wl p50(ms) ok
|
| 4202 |
-
torch_eager cuda_B1_Q100_H8_E256_L4_P4 3.
|
| 4203 |
-
torch_eager cuda_B1_Q300_H8_E256_L4_P4 4.
|
| 4204 |
-
torch_eager cuda_B2_Q100_H8_E256_L4_P4 4.
|
| 4205 |
-
torch_eager cuda_B2_Q300_H8_E256_L4_P4 4.
|
| 4206 |
</pre></div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4207 |
<div class="cell-artifacts">
|
| 4208 |
<h4>Artifacts:</h4>
|
| 4209 |
<a href="artifacts/benchmark/deformable_detr.jsonl" class="artifact" target="_blank">deformable_detr.jsonl</a>
|
|
|
|
| 3888 |
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3889 |
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3890 |
</span> |
|
| 3891 |
+
Cell: nv | 0.25s
|
| 3892 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3893 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3894 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3904 |
</div>
|
| 3905 |
</div>
|
| 3906 |
<div id="output-nv" class="cell-output">
|
| 3907 |
+
<div class="cell-stdout"><pre class="stdout-text">Fri Dec 19 23:02:11 2025
|
| 3908 |
+-----------------------------------------------------------------------------------------+
|
| 3909 |
| NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 |
|
| 3910 |
+-----------------------------------------+------------------------+----------------------+
|
|
|
|
| 3913 |
| | | MIG M. |
|
| 3914 |
|=========================================+========================+======================|
|
| 3915 |
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
|
| 3916 |
+
| N/A 42C P0 83W / 350W | 0MiB / 46068MiB | 12% Default |
|
| 3917 |
| | | N/A |
|
| 3918 |
+-----------------------------------------+------------------------+----------------------+
|
| 3919 |
|
|
|
|
| 3935 |
<span class="collapse-indicators">
|
| 3936 |
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
|
| 3937 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3938 |
+
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3939 |
</span> |
|
| 3940 |
+
Cell: benchmark | 9.26s
|
| 3941 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3942 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3943 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 4077 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4078 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4079 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4080 |
+
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 19.976ms 1348.57% 19.976ms 19.976ms 1
|
| 4081 |
+
torch_eager 20.04% 4.395ms 99.96% 21.929ms 21.929ms 0.000us 0.00% 1.482ms 1.482ms 1
|
| 4082 |
+
aten::index 4.53% 992.766us 16.58% 3.638ms 75.786us 236.544us 15.97% 370.336us 7.715us 48
|
| 4083 |
+
aten::copy_ 4.69% 1.028ms 11.56% 2.535ms 11.576us 366.053us 24.71% 366.053us 1.671us 219
|
| 4084 |
+
aten::mul 5.90% 1.295ms 10.04% 2.203ms 11.474us 293.531us 19.82% 293.531us 1.529us 192
|
| 4085 |
+
void at::native::index_elementwise_kernel<128, 4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 236.544us 15.97% 236.544us 4.928us 48
|
| 4086 |
+
aten::to 0.58% 126.843us 11.27% 2.473ms 14.461us 0.000us 0.00% 232.261us 1.358us 171
|
| 4087 |
+
aten::_to_copy 1.95% 426.950us 10.69% 2.346ms 19.073us 0.000us 0.00% 232.261us 1.888us 123
|
| 4088 |
+
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 201.821us 13.62% 201.821us 1.682us 120
|
| 4089 |
+
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 167.778us 11.33% 167.778us 1.997us 84
|
| 4090 |
+
aten::contiguous 0.36% 78.966us 8.52% 1.869ms 19.471us 0.000us 0.00% 133.792us 1.394us 96
|
| 4091 |
+
aten::clone 0.74% 161.750us 8.16% 1.790ms 18.648us 0.000us 0.00% 133.792us 1.394us 96
|
| 4092 |
+
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 133.792us 9.03% 133.792us 1.394us 96
|
| 4093 |
+
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 115.553us 7.80% 115.553us 1.204us 96
|
| 4094 |
+
aten::__and__ 0.42% 91.609us 4.49% 984.808us 11.724us 0.000us 0.00% 99.041us 1.179us 84
|
| 4095 |
+
aten::bitwise_and 2.54% 557.575us 4.07% 893.199us 10.633us 99.041us 6.69% 99.041us 1.179us 84
|
| 4096 |
+
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 99.041us 6.69% 99.041us 1.179us 84
|
| 4097 |
+
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 86.140us 5.82% 86.140us 1.196us 72
|
| 4098 |
+
aten::sub 2.17% 475.165us 3.61% 791.992us 11.000us 79.197us 5.35% 79.197us 1.100us 72
|
| 4099 |
+
aten::add 1.62% 354.490us 2.70% 592.103us 9.868us 74.334us 5.02% 74.334us 1.239us 60
|
| 4100 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4101 |
+
Self CPU time total: 21.937ms
|
| 4102 |
+
Self CUDA time total: 1.481ms
|
| 4103 |
|
| 4104 |
|
| 4105 |
|
|
|
|
| 4109 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4110 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4111 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4112 |
+
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 19.069ms 1196.67% 19.069ms 19.069ms 1
|
| 4113 |
+
torch_eager 19.87% 4.152ms 99.97% 20.886ms 20.886ms 0.000us 0.00% 1.594ms 1.594ms 1
|
| 4114 |
+
aten::index 4.48% 935.232us 16.67% 3.483ms 72.569us 249.668us 15.67% 382.147us 7.961us 48
|
| 4115 |
+
aten::copy_ 4.80% 1.003ms 11.85% 2.477ms 11.308us 366.556us 23.00% 366.556us 1.674us 219
|
| 4116 |
+
aten::mul 6.04% 1.262ms 10.39% 2.170ms 11.304us 358.714us 22.51% 358.714us 1.868us 192
|
| 4117 |
+
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 267.167us 16.77% 267.167us 2.226us 120
|
| 4118 |
+
void at::native::index_elementwise_kernel<128, 4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 249.668us 15.67% 249.668us 5.201us 48
|
| 4119 |
+
aten::to 0.60% 125.408us 11.23% 2.347ms 13.724us 0.000us 0.00% 234.077us 1.369us 171
|
| 4120 |
+
aten::_to_copy 1.87% 389.897us 10.63% 2.221ms 18.060us 0.000us 0.00% 234.077us 1.903us 123
|
| 4121 |
+
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 169.728us 10.65% 169.728us 2.021us 84
|
| 4122 |
+
aten::contiguous 0.35% 74.120us 8.81% 1.840ms 19.167us 0.000us 0.00% 132.479us 1.380us 96
|
| 4123 |
+
aten::clone 0.79% 164.425us 8.45% 1.766ms 18.395us 0.000us 0.00% 132.479us 1.380us 96
|
| 4124 |
+
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 132.479us 8.31% 132.479us 1.380us 96
|
| 4125 |
+
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 117.475us 7.37% 117.475us 1.224us 96
|
| 4126 |
+
aten::__and__ 0.44% 90.959us 4.50% 941.006us 11.202us 0.000us 0.00% 105.476us 1.256us 84
|
| 4127 |
+
aten::bitwise_and 2.49% 520.216us 4.07% 850.047us 10.120us 105.476us 6.62% 105.476us 1.256us 84
|
| 4128 |
+
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 105.476us 6.62% 105.476us 1.256us 84
|
| 4129 |
+
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 104.197us 6.54% 104.197us 1.447us 72
|
| 4130 |
+
aten::add 1.62% 338.151us 2.73% 570.998us 9.517us 91.678us 5.75% 91.678us 1.528us 60
|
| 4131 |
+
aten::sub 2.14% 447.777us 3.61% 754.447us 10.478us 80.286us 5.04% 80.286us 1.115us 72
|
| 4132 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4133 |
+
Self CPU time total: 20.891ms
|
| 4134 |
+
Self CUDA time total: 1.593ms
|
| 4135 |
|
| 4136 |
|
| 4137 |
|
|
|
|
| 4141 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4142 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4143 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4144 |
+
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 19.677ms 1279.16% 19.677ms 19.677ms 1
|
| 4145 |
+
torch_eager 19.82% 4.280ms 99.97% 21.590ms 21.590ms 0.000us 0.00% 1.539ms 1.539ms 1
|
| 4146 |
+
aten::index 4.49% 970.701us 16.56% 3.576ms 74.506us 243.261us 15.81% 377.688us 7.868us 48
|
| 4147 |
+
aten::copy_ 4.67% 1.008ms 11.52% 2.487ms 11.356us 367.898us 23.92% 367.898us 1.680us 219
|
| 4148 |
+
aten::mul 5.96% 1.287ms 10.22% 2.207ms 11.495us 324.384us 21.09% 324.384us 1.690us 192
|
| 4149 |
+
void at::native::index_elementwise_kernel<128, 4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 243.261us 15.81% 243.261us 5.068us 48
|
| 4150 |
+
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 233.533us 15.18% 233.533us 1.946us 120
|
| 4151 |
+
aten::to 0.57% 122.968us 11.17% 2.413ms 14.109us 0.000us 0.00% 233.471us 1.365us 171
|
| 4152 |
+
aten::_to_copy 1.93% 415.801us 10.60% 2.290ms 18.615us 0.000us 0.00% 233.471us 1.898us 123
|
| 4153 |
+
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 169.053us 10.99% 169.053us 2.013us 84
|
| 4154 |
+
aten::contiguous 0.37% 80.833us 8.61% 1.859ms 19.360us 0.000us 0.00% 134.427us 1.400us 96
|
| 4155 |
+
aten::clone 0.74% 159.128us 8.23% 1.778ms 18.518us 0.000us 0.00% 134.427us 1.400us 96
|
| 4156 |
+
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 134.427us 8.74% 134.427us 1.400us 96
|
| 4157 |
+
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 115.871us 7.53% 115.871us 1.207us 96
|
| 4158 |
+
aten::__and__ 0.43% 92.507us 4.50% 971.781us 11.569us 0.000us 0.00% 104.160us 1.240us 84
|
| 4159 |
+
aten::bitwise_and 2.49% 538.828us 4.07% 879.274us 10.468us 104.160us 6.77% 104.160us 1.240us 84
|
| 4160 |
+
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 104.160us 6.77% 104.160us 1.240us 84
|
| 4161 |
+
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 95.908us 6.23% 95.908us 1.332us 72
|
| 4162 |
+
aten::add 1.64% 354.089us 2.75% 594.321us 9.905us 83.684us 5.44% 83.684us 1.395us 60
|
| 4163 |
+
aten::sub 2.17% 468.302us 3.66% 789.975us 10.972us 79.297us 5.15% 79.297us 1.101us 72
|
| 4164 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4165 |
+
Self CPU time total: 21.596ms
|
| 4166 |
+
Self CUDA time total: 1.538ms
|
| 4167 |
|
| 4168 |
|
| 4169 |
|
|
|
|
| 4173 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4174 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4175 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4176 |
+
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 19.412ms 1097.11% 19.412ms 19.412ms 1
|
| 4177 |
+
torch_eager 19.43% 4.188ms 99.97% 21.544ms 21.544ms 0.000us 0.00% 1.770ms 1.770ms 1
|
| 4178 |
+
aten::mul 5.88% 1.267ms 10.26% 2.212ms 11.521us 450.496us 25.46% 450.496us 2.346us 192
|
| 4179 |
+
aten::index 4.35% 938.379us 16.41% 3.536ms 73.661us 281.281us 15.90% 418.917us 8.727us 48
|
| 4180 |
+
aten::copy_ 4.72% 1.017ms 12.00% 2.587ms 11.811us 371.333us 20.99% 371.333us 1.696us 219
|
| 4181 |
+
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 355.809us 20.11% 355.809us 2.965us 120
|
| 4182 |
+
void at::native::index_elementwise_kernel<128, 4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 281.281us 15.90% 281.281us 5.860us 48
|
| 4183 |
+
aten::to 0.57% 122.376us 11.15% 2.403ms 14.050us 0.000us 0.00% 233.697us 1.367us 171
|
| 4184 |
+
aten::_to_copy 1.79% 386.738us 10.58% 2.280ms 18.538us 0.000us 0.00% 233.697us 1.900us 123
|
| 4185 |
+
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 167.937us 9.49% 167.937us 1.999us 84
|
| 4186 |
+
aten::contiguous 0.36% 77.297us 8.74% 1.884ms 19.624us 0.000us 0.00% 137.636us 1.434us 96
|
| 4187 |
+
aten::clone 0.72% 155.217us 8.38% 1.807ms 18.819us 0.000us 0.00% 137.636us 1.434us 96
|
| 4188 |
+
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 137.636us 7.78% 137.636us 1.434us 96
|
| 4189 |
+
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 130.211us 7.36% 130.211us 1.808us 72
|
| 4190 |
+
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 119.940us 6.78% 119.940us 1.249us 96
|
| 4191 |
+
aten::add 1.56% 336.953us 2.72% 585.265us 9.754us 114.431us 6.47% 114.431us 1.907us 60
|
| 4192 |
+
aten::__and__ 0.41% 88.309us 4.45% 959.250us 11.420us 0.000us 0.00% 108.994us 1.298us 84
|
| 4193 |
+
aten::bitwise_and 2.40% 517.417us 4.04% 870.941us 10.368us 108.994us 6.16% 108.994us 1.298us 84
|
| 4194 |
+
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 108.994us 6.16% 108.994us 1.298us 84
|
| 4195 |
+
aten::sub 2.15% 464.219us 3.68% 792.358us 11.005us 84.546us 4.78% 84.546us 1.174us 72
|
| 4196 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4197 |
+
Self CPU time total: 21.550ms
|
| 4198 |
+
Self CUDA time total: 1.769ms
|
| 4199 |
|
| 4200 |
|
| 4201 |
impl wl p50(ms) ok
|
| 4202 |
+
torch_eager cuda_B1_Q100_H8_E256_L4_P4 3.38 True
|
| 4203 |
+
torch_eager cuda_B1_Q300_H8_E256_L4_P4 4.08 True
|
| 4204 |
+
torch_eager cuda_B2_Q100_H8_E256_L4_P4 4.16 True
|
| 4205 |
+
torch_eager cuda_B2_Q300_H8_E256_L4_P4 4.17 True
|
| 4206 |
</pre></div>
|
| 4207 |
+
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 4208 |
+
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4209 |
+
<div class="uv-logs-content" style="display: none;">
|
| 4210 |
+
Installed 37 packages in 280ms
|
| 4211 |
+
</div>
|
| 4212 |
+
</div>
|
| 4213 |
<div class="cell-artifacts">
|
| 4214 |
<h4>Artifacts:</h4>
|
| 4215 |
<a href="artifacts/benchmark/deformable_detr.jsonl" class="artifact" target="_blank">deformable_detr.jsonl</a>
|
deformable_detr/results/artifacts/combine/latency.svg
CHANGED
|
|
Git LFS Details
|
|
|
Git LFS Details
|
deformable_detr/results/combined_results.html
CHANGED
|
@@ -3889,7 +3889,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 3889 |
<rdf:RDF>
|
| 3890 |
<ns2:Work>
|
| 3891 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 3892 |
-
<dc:date>2025-12-
|
| 3893 |
<dc:format>image/svg+xml</dc:format>
|
| 3894 |
<dc:creator>
|
| 3895 |
<ns2:Agent>
|
|
@@ -3973,70 +3973,70 @@ body[data-tool="eraser"] .main-content {
|
|
| 3973 |
<g id="matplotlib.axis_2">
|
| 3974 |
<g id="ytick_1">
|
| 3975 |
<g id="grid-y--2" class="grid grid-y">
|
| 3976 |
-
<path d="M 39.870649 410.
|
| 3977 |
</g>
|
| 3978 |
<g id="line2d_5">
|
| 3979 |
<defs>
|
| 3980 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 3981 |
</defs>
|
| 3982 |
<g>
|
| 3983 |
-
<use ns4:href="#m0fca2865ba" x="39.870649" y="410.
|
| 3984 |
</g>
|
| 3985 |
</g>
|
| 3986 |
<g id="text_5">
|
| 3987 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="32.870649" y="
|
| 3988 |
</g>
|
| 3989 |
</g>
|
| 3990 |
<g id="ytick_2">
|
| 3991 |
<g id="grid-y--3" class="grid grid-y">
|
| 3992 |
-
<path d="M 39.870649
|
| 3993 |
</g>
|
| 3994 |
<g id="line2d_6">
|
| 3995 |
<g>
|
| 3996 |
-
<use ns4:href="#m0fca2865ba" x="39.870649" y="
|
| 3997 |
</g>
|
| 3998 |
</g>
|
| 3999 |
<g id="text_6">
|
| 4000 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="32.870649" y="
|
| 4001 |
</g>
|
| 4002 |
</g>
|
| 4003 |
<g id="ytick_3">
|
| 4004 |
<g id="grid-y--4" class="grid grid-y">
|
| 4005 |
-
<path d="M 39.870649
|
| 4006 |
</g>
|
| 4007 |
<g id="line2d_7">
|
| 4008 |
<g>
|
| 4009 |
-
<use ns4:href="#m0fca2865ba" x="39.870649" y="
|
| 4010 |
</g>
|
| 4011 |
</g>
|
| 4012 |
<g id="text_7">
|
| 4013 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="32.870649" y="
|
| 4014 |
</g>
|
| 4015 |
</g>
|
| 4016 |
<g id="ytick_4">
|
| 4017 |
<g id="grid-y--5" class="grid grid-y">
|
| 4018 |
-
<path d="M 39.870649
|
| 4019 |
</g>
|
| 4020 |
<g id="line2d_8">
|
| 4021 |
<g>
|
| 4022 |
-
<use ns4:href="#m0fca2865ba" x="39.870649" y="
|
| 4023 |
</g>
|
| 4024 |
</g>
|
| 4025 |
<g id="text_8">
|
| 4026 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="32.870649" y="
|
| 4027 |
</g>
|
| 4028 |
</g>
|
| 4029 |
<g id="ytick_5">
|
| 4030 |
<g id="grid-y--6" class="grid grid-y">
|
| 4031 |
-
<path d="M 39.870649
|
| 4032 |
</g>
|
| 4033 |
<g id="line2d_9">
|
| 4034 |
<g>
|
| 4035 |
-
<use ns4:href="#m0fca2865ba" x="39.870649" y="
|
| 4036 |
</g>
|
| 4037 |
</g>
|
| 4038 |
<g id="text_9">
|
| 4039 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="32.870649" y="
|
| 4040 |
</g>
|
| 4041 |
</g>
|
| 4042 |
<g id="label--y" class="ylabel">
|
|
@@ -4044,26 +4044,26 @@ body[data-tool="eraser"] .main-content {
|
|
| 4044 |
</g>
|
| 4045 |
</g>
|
| 4046 |
<g id="series--hf-kernels-deformable-detr" class="series">
|
| 4047 |
-
<path d="M 75.521665 407.004793 L 313.195102 406.
|
| 4048 |
<defs>
|
| 4049 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4050 |
</defs>
|
| 4051 |
<g clip-path="url(#pbac879f81a)">
|
| 4052 |
<use ns4:href="#md7efaf3aec" x="75.521665" y="407.004793" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4053 |
-
<use ns4:href="#md7efaf3aec" x="313.195102" y="406.
|
| 4054 |
-
<use ns4:href="#md7efaf3aec" x="550.868538" y="406.
|
| 4055 |
-
<use ns4:href="#md7efaf3aec" x="788.541975" y="406.
|
| 4056 |
</g>
|
| 4057 |
</g>
|
| 4058 |
<g id="series--torch-eager" class="series">
|
| 4059 |
-
<path d="M 75.521665
|
| 4060 |
<defs>
|
| 4061 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4062 |
</defs>
|
| 4063 |
<g clip-path="url(#pbac879f81a)">
|
| 4064 |
-
<use ns4:href="#m9b8c54d372" x="75.521665" y="
|
| 4065 |
-
<use ns4:href="#m9b8c54d372" x="313.195102" y="
|
| 4066 |
-
<use ns4:href="#m9b8c54d372" x="550.868538" y="
|
| 4067 |
<use ns4:href="#m9b8c54d372" x="788.541975" y="44.981181" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4068 |
</g>
|
| 4069 |
</g>
|
|
@@ -4122,7 +4122,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 4122 |
<span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
|
| 4123 |
<span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4124 |
</span> |
|
| 4125 |
-
Cell: combine | 4.
|
| 4126 |
| <button class="run-btn" onclick="runCell('combine')">▶ run</button>
|
| 4127 |
<button class="copy-btn" onclick="copyCell('combine')">Copy</button>
|
| 4128 |
<a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -4210,13 +4210,13 @@ COMBINED BENCHMARK SUMMARY
|
|
| 4210 |
|
| 4211 |
impl wl p50(ms) ok
|
| 4212 |
hf_kernels_deformable_detr cuda_B1_Q100_H8_E256_L4_P4 0.04 True
|
| 4213 |
-
hf_kernels_deformable_detr cuda_B1_Q300_H8_E256_L4_P4 0.
|
| 4214 |
-
hf_kernels_deformable_detr cuda_B2_Q100_H8_E256_L4_P4 0.
|
| 4215 |
hf_kernels_deformable_detr cuda_B2_Q300_H8_E256_L4_P4 0.05 True
|
| 4216 |
-
torch_eager cuda_B1_Q100_H8_E256_L4_P4 3.
|
| 4217 |
-
torch_eager cuda_B1_Q300_H8_E256_L4_P4 4.
|
| 4218 |
-
torch_eager cuda_B2_Q100_H8_E256_L4_P4 4.
|
| 4219 |
-
torch_eager cuda_B2_Q300_H8_E256_L4_P4 4.
|
| 4220 |
|
| 4221 |
GENERATING COMBINED VISUALIZATION
|
| 4222 |
|
|
@@ -4236,7 +4236,7 @@ Implementations included:
|
|
| 4236 |
<div class="uv-install-logs" id="uv-logs-combine">
|
| 4237 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4238 |
<div class="uv-logs-content" style="display: none;">
|
| 4239 |
-
Installed 37 packages in
|
| 4240 |
</div>
|
| 4241 |
</div>
|
| 4242 |
<div class="cell-artifacts">
|
|
@@ -4249,7 +4249,7 @@ Installed 37 packages in 311ms
|
|
| 4249 |
<rdf:RDF>
|
| 4250 |
<ns2:Work>
|
| 4251 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 4252 |
-
<dc:date>2025-12-
|
| 4253 |
<dc:format>image/svg+xml</dc:format>
|
| 4254 |
<dc:creator>
|
| 4255 |
<ns2:Agent>
|
|
@@ -4333,70 +4333,70 @@ Installed 37 packages in 311ms
|
|
| 4333 |
<g id="matplotlib.axis_2">
|
| 4334 |
<g id="ytick_1">
|
| 4335 |
<g id="grid-y--2" class="grid grid-y">
|
| 4336 |
-
<path d="M 39.870649 410.
|
| 4337 |
</g>
|
| 4338 |
<g id="line2d_5">
|
| 4339 |
<defs>
|
| 4340 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4341 |
</defs>
|
| 4342 |
<g>
|
| 4343 |
-
<use ns4:href="#m0fca2865ba" x="39.870649" y="410.
|
| 4344 |
</g>
|
| 4345 |
</g>
|
| 4346 |
<g id="text_5">
|
| 4347 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="32.870649" y="
|
| 4348 |
</g>
|
| 4349 |
</g>
|
| 4350 |
<g id="ytick_2">
|
| 4351 |
<g id="grid-y--3" class="grid grid-y">
|
| 4352 |
-
<path d="M 39.870649
|
| 4353 |
</g>
|
| 4354 |
<g id="line2d_6">
|
| 4355 |
<g>
|
| 4356 |
-
<use ns4:href="#m0fca2865ba" x="39.870649" y="
|
| 4357 |
</g>
|
| 4358 |
</g>
|
| 4359 |
<g id="text_6">
|
| 4360 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="32.870649" y="
|
| 4361 |
</g>
|
| 4362 |
</g>
|
| 4363 |
<g id="ytick_3">
|
| 4364 |
<g id="grid-y--4" class="grid grid-y">
|
| 4365 |
-
<path d="M 39.870649
|
| 4366 |
</g>
|
| 4367 |
<g id="line2d_7">
|
| 4368 |
<g>
|
| 4369 |
-
<use ns4:href="#m0fca2865ba" x="39.870649" y="
|
| 4370 |
</g>
|
| 4371 |
</g>
|
| 4372 |
<g id="text_7">
|
| 4373 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="32.870649" y="
|
| 4374 |
</g>
|
| 4375 |
</g>
|
| 4376 |
<g id="ytick_4">
|
| 4377 |
<g id="grid-y--5" class="grid grid-y">
|
| 4378 |
-
<path d="M 39.870649
|
| 4379 |
</g>
|
| 4380 |
<g id="line2d_8">
|
| 4381 |
<g>
|
| 4382 |
-
<use ns4:href="#m0fca2865ba" x="39.870649" y="
|
| 4383 |
</g>
|
| 4384 |
</g>
|
| 4385 |
<g id="text_8">
|
| 4386 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="32.870649" y="
|
| 4387 |
</g>
|
| 4388 |
</g>
|
| 4389 |
<g id="ytick_5">
|
| 4390 |
<g id="grid-y--6" class="grid grid-y">
|
| 4391 |
-
<path d="M 39.870649
|
| 4392 |
</g>
|
| 4393 |
<g id="line2d_9">
|
| 4394 |
<g>
|
| 4395 |
-
<use ns4:href="#m0fca2865ba" x="39.870649" y="
|
| 4396 |
</g>
|
| 4397 |
</g>
|
| 4398 |
<g id="text_9">
|
| 4399 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="32.870649" y="
|
| 4400 |
</g>
|
| 4401 |
</g>
|
| 4402 |
<g id="label--y" class="ylabel">
|
|
@@ -4404,26 +4404,26 @@ Installed 37 packages in 311ms
|
|
| 4404 |
</g>
|
| 4405 |
</g>
|
| 4406 |
<g id="series--hf-kernels-deformable-detr" class="series">
|
| 4407 |
-
<path d="M 75.521665 407.004793 L 313.195102 406.
|
| 4408 |
<defs>
|
| 4409 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4410 |
</defs>
|
| 4411 |
<g clip-path="url(#pbac879f81a)">
|
| 4412 |
<use ns4:href="#md7efaf3aec" x="75.521665" y="407.004793" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4413 |
-
<use ns4:href="#md7efaf3aec" x="313.195102" y="406.
|
| 4414 |
-
<use ns4:href="#md7efaf3aec" x="550.868538" y="406.
|
| 4415 |
-
<use ns4:href="#md7efaf3aec" x="788.541975" y="406.
|
| 4416 |
</g>
|
| 4417 |
</g>
|
| 4418 |
<g id="series--torch-eager" class="series">
|
| 4419 |
-
<path d="M 75.521665
|
| 4420 |
<defs>
|
| 4421 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4422 |
</defs>
|
| 4423 |
<g clip-path="url(#pbac879f81a)">
|
| 4424 |
-
<use ns4:href="#m9b8c54d372" x="75.521665" y="
|
| 4425 |
-
<use ns4:href="#m9b8c54d372" x="313.195102" y="
|
| 4426 |
-
<use ns4:href="#m9b8c54d372" x="550.868538" y="
|
| 4427 |
<use ns4:href="#m9b8c54d372" x="788.541975" y="44.981181" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4428 |
</g>
|
| 4429 |
</g>
|
|
|
|
| 3889 |
<rdf:RDF>
|
| 3890 |
<ns2:Work>
|
| 3891 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 3892 |
+
<dc:date>2025-12-19T23:02:54.345828</dc:date>
|
| 3893 |
<dc:format>image/svg+xml</dc:format>
|
| 3894 |
<dc:creator>
|
| 3895 |
<ns2:Agent>
|
|
|
|
| 3973 |
<g id="matplotlib.axis_2">
|
| 3974 |
<g id="ytick_1">
|
| 3975 |
<g id="grid-y--2" class="grid grid-y">
|
| 3976 |
+
<path d="M 39.870649 410.286782 L 824.19299 410.286782 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 3977 |
</g>
|
| 3978 |
<g id="line2d_5">
|
| 3979 |
<defs>
|
| 3980 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 3981 |
</defs>
|
| 3982 |
<g>
|
| 3983 |
+
<use ns4:href="#m0fca2865ba" x="39.870649" y="410.286782" style="stroke: #000000; stroke-width: 0.8" />
|
| 3984 |
</g>
|
| 3985 |
</g>
|
| 3986 |
<g id="text_5">
|
| 3987 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="32.870649" y="414.086" transform="rotate(-0 32.870649 414.086)">0</text>
|
| 3988 |
</g>
|
| 3989 |
</g>
|
| 3990 |
<g id="ytick_2">
|
| 3991 |
<g id="grid-y--3" class="grid grid-y">
|
| 3992 |
+
<path d="M 39.870649 322.746079 L 824.19299 322.746079 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 3993 |
</g>
|
| 3994 |
<g id="line2d_6">
|
| 3995 |
<g>
|
| 3996 |
+
<use ns4:href="#m0fca2865ba" x="39.870649" y="322.746079" style="stroke: #000000; stroke-width: 0.8" />
|
| 3997 |
</g>
|
| 3998 |
</g>
|
| 3999 |
<g id="text_6">
|
| 4000 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="32.870649" y="326.545298" transform="rotate(-0 32.870649 326.545298)">1</text>
|
| 4001 |
</g>
|
| 4002 |
</g>
|
| 4003 |
<g id="ytick_3">
|
| 4004 |
<g id="grid-y--4" class="grid grid-y">
|
| 4005 |
+
<path d="M 39.870649 235.205376 L 824.19299 235.205376 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4006 |
</g>
|
| 4007 |
<g id="line2d_7">
|
| 4008 |
<g>
|
| 4009 |
+
<use ns4:href="#m0fca2865ba" x="39.870649" y="235.205376" style="stroke: #000000; stroke-width: 0.8" />
|
| 4010 |
</g>
|
| 4011 |
</g>
|
| 4012 |
<g id="text_7">
|
| 4013 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="32.870649" y="239.004595" transform="rotate(-0 32.870649 239.004595)">2</text>
|
| 4014 |
</g>
|
| 4015 |
</g>
|
| 4016 |
<g id="ytick_4">
|
| 4017 |
<g id="grid-y--5" class="grid grid-y">
|
| 4018 |
+
<path d="M 39.870649 147.664674 L 824.19299 147.664674 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4019 |
</g>
|
| 4020 |
<g id="line2d_8">
|
| 4021 |
<g>
|
| 4022 |
+
<use ns4:href="#m0fca2865ba" x="39.870649" y="147.664674" style="stroke: #000000; stroke-width: 0.8" />
|
| 4023 |
</g>
|
| 4024 |
</g>
|
| 4025 |
<g id="text_8">
|
| 4026 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="32.870649" y="151.463893" transform="rotate(-0 32.870649 151.463893)">3</text>
|
| 4027 |
</g>
|
| 4028 |
</g>
|
| 4029 |
<g id="ytick_5">
|
| 4030 |
<g id="grid-y--6" class="grid grid-y">
|
| 4031 |
+
<path d="M 39.870649 60.123971 L 824.19299 60.123971 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4032 |
</g>
|
| 4033 |
<g id="line2d_9">
|
| 4034 |
<g>
|
| 4035 |
+
<use ns4:href="#m0fca2865ba" x="39.870649" y="60.123971" style="stroke: #000000; stroke-width: 0.8" />
|
| 4036 |
</g>
|
| 4037 |
</g>
|
| 4038 |
<g id="text_9">
|
| 4039 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="32.870649" y="63.92319" transform="rotate(-0 32.870649 63.92319)">4</text>
|
| 4040 |
</g>
|
| 4041 |
</g>
|
| 4042 |
<g id="label--y" class="ylabel">
|
|
|
|
| 4044 |
</g>
|
| 4045 |
</g>
|
| 4046 |
<g id="series--hf-kernels-deformable-detr" class="series">
|
| 4047 |
+
<path d="M 75.521665 407.004793 L 313.195102 406.442782 L 550.868538 406.446283 L 788.541975 406.246691 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 4048 |
<defs>
|
| 4049 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4050 |
</defs>
|
| 4051 |
<g clip-path="url(#pbac879f81a)">
|
| 4052 |
<use ns4:href="#md7efaf3aec" x="75.521665" y="407.004793" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4053 |
+
<use ns4:href="#md7efaf3aec" x="313.195102" y="406.442782" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4054 |
+
<use ns4:href="#md7efaf3aec" x="550.868538" y="406.446283" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4055 |
+
<use ns4:href="#md7efaf3aec" x="788.541975" y="406.246691" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4056 |
</g>
|
| 4057 |
</g>
|
| 4058 |
<g id="series--torch-eager" class="series">
|
| 4059 |
+
<path d="M 75.521665 114.041778 L 313.195102 52.888032 L 550.868538 46.287288 L 788.541975 44.981181 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 4060 |
<defs>
|
| 4061 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4062 |
</defs>
|
| 4063 |
<g clip-path="url(#pbac879f81a)">
|
| 4064 |
+
<use ns4:href="#m9b8c54d372" x="75.521665" y="114.041778" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4065 |
+
<use ns4:href="#m9b8c54d372" x="313.195102" y="52.888032" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4066 |
+
<use ns4:href="#m9b8c54d372" x="550.868538" y="46.287288" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4067 |
<use ns4:href="#m9b8c54d372" x="788.541975" y="44.981181" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4068 |
</g>
|
| 4069 |
</g>
|
|
|
|
| 4122 |
<span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
|
| 4123 |
<span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4124 |
</span> |
|
| 4125 |
+
Cell: combine | 4.41s
|
| 4126 |
| <button class="run-btn" onclick="runCell('combine')">▶ run</button>
|
| 4127 |
<button class="copy-btn" onclick="copyCell('combine')">Copy</button>
|
| 4128 |
<a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 4210 |
|
| 4211 |
impl wl p50(ms) ok
|
| 4212 |
hf_kernels_deformable_detr cuda_B1_Q100_H8_E256_L4_P4 0.04 True
|
| 4213 |
+
hf_kernels_deformable_detr cuda_B1_Q300_H8_E256_L4_P4 0.04 True
|
| 4214 |
+
hf_kernels_deformable_detr cuda_B2_Q100_H8_E256_L4_P4 0.04 True
|
| 4215 |
hf_kernels_deformable_detr cuda_B2_Q300_H8_E256_L4_P4 0.05 True
|
| 4216 |
+
torch_eager cuda_B1_Q100_H8_E256_L4_P4 3.38 True
|
| 4217 |
+
torch_eager cuda_B1_Q300_H8_E256_L4_P4 4.08 True
|
| 4218 |
+
torch_eager cuda_B2_Q100_H8_E256_L4_P4 4.16 True
|
| 4219 |
+
torch_eager cuda_B2_Q300_H8_E256_L4_P4 4.17 True
|
| 4220 |
|
| 4221 |
GENERATING COMBINED VISUALIZATION
|
| 4222 |
|
|
|
|
| 4236 |
<div class="uv-install-logs" id="uv-logs-combine">
|
| 4237 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4238 |
<div class="uv-logs-content" style="display: none;">
|
| 4239 |
+
Installed 37 packages in 297ms
|
| 4240 |
</div>
|
| 4241 |
</div>
|
| 4242 |
<div class="cell-artifacts">
|
|
|
|
| 4249 |
<rdf:RDF>
|
| 4250 |
<ns2:Work>
|
| 4251 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 4252 |
+
<dc:date>2025-12-19T23:02:54.345828</dc:date>
|
| 4253 |
<dc:format>image/svg+xml</dc:format>
|
| 4254 |
<dc:creator>
|
| 4255 |
<ns2:Agent>
|
|
|
|
| 4333 |
<g id="matplotlib.axis_2">
|
| 4334 |
<g id="ytick_1">
|
| 4335 |
<g id="grid-y--2" class="grid grid-y">
|
| 4336 |
+
<path d="M 39.870649 410.286782 L 824.19299 410.286782 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4337 |
</g>
|
| 4338 |
<g id="line2d_5">
|
| 4339 |
<defs>
|
| 4340 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4341 |
</defs>
|
| 4342 |
<g>
|
| 4343 |
+
<use ns4:href="#m0fca2865ba" x="39.870649" y="410.286782" style="stroke: #000000; stroke-width: 0.8" />
|
| 4344 |
</g>
|
| 4345 |
</g>
|
| 4346 |
<g id="text_5">
|
| 4347 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="32.870649" y="414.086" transform="rotate(-0 32.870649 414.086)">0</text>
|
| 4348 |
</g>
|
| 4349 |
</g>
|
| 4350 |
<g id="ytick_2">
|
| 4351 |
<g id="grid-y--3" class="grid grid-y">
|
| 4352 |
+
<path d="M 39.870649 322.746079 L 824.19299 322.746079 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4353 |
</g>
|
| 4354 |
<g id="line2d_6">
|
| 4355 |
<g>
|
| 4356 |
+
<use ns4:href="#m0fca2865ba" x="39.870649" y="322.746079" style="stroke: #000000; stroke-width: 0.8" />
|
| 4357 |
</g>
|
| 4358 |
</g>
|
| 4359 |
<g id="text_6">
|
| 4360 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="32.870649" y="326.545298" transform="rotate(-0 32.870649 326.545298)">1</text>
|
| 4361 |
</g>
|
| 4362 |
</g>
|
| 4363 |
<g id="ytick_3">
|
| 4364 |
<g id="grid-y--4" class="grid grid-y">
|
| 4365 |
+
<path d="M 39.870649 235.205376 L 824.19299 235.205376 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4366 |
</g>
|
| 4367 |
<g id="line2d_7">
|
| 4368 |
<g>
|
| 4369 |
+
<use ns4:href="#m0fca2865ba" x="39.870649" y="235.205376" style="stroke: #000000; stroke-width: 0.8" />
|
| 4370 |
</g>
|
| 4371 |
</g>
|
| 4372 |
<g id="text_7">
|
| 4373 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="32.870649" y="239.004595" transform="rotate(-0 32.870649 239.004595)">2</text>
|
| 4374 |
</g>
|
| 4375 |
</g>
|
| 4376 |
<g id="ytick_4">
|
| 4377 |
<g id="grid-y--5" class="grid grid-y">
|
| 4378 |
+
<path d="M 39.870649 147.664674 L 824.19299 147.664674 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4379 |
</g>
|
| 4380 |
<g id="line2d_8">
|
| 4381 |
<g>
|
| 4382 |
+
<use ns4:href="#m0fca2865ba" x="39.870649" y="147.664674" style="stroke: #000000; stroke-width: 0.8" />
|
| 4383 |
</g>
|
| 4384 |
</g>
|
| 4385 |
<g id="text_8">
|
| 4386 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="32.870649" y="151.463893" transform="rotate(-0 32.870649 151.463893)">3</text>
|
| 4387 |
</g>
|
| 4388 |
</g>
|
| 4389 |
<g id="ytick_5">
|
| 4390 |
<g id="grid-y--6" class="grid grid-y">
|
| 4391 |
+
<path d="M 39.870649 60.123971 L 824.19299 60.123971 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4392 |
</g>
|
| 4393 |
<g id="line2d_9">
|
| 4394 |
<g>
|
| 4395 |
+
<use ns4:href="#m0fca2865ba" x="39.870649" y="60.123971" style="stroke: #000000; stroke-width: 0.8" />
|
| 4396 |
</g>
|
| 4397 |
</g>
|
| 4398 |
<g id="text_9">
|
| 4399 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="32.870649" y="63.92319" transform="rotate(-0 32.870649 63.92319)">4</text>
|
| 4400 |
</g>
|
| 4401 |
</g>
|
| 4402 |
<g id="label--y" class="ylabel">
|
|
|
|
| 4404 |
</g>
|
| 4405 |
</g>
|
| 4406 |
<g id="series--hf-kernels-deformable-detr" class="series">
|
| 4407 |
+
<path d="M 75.521665 407.004793 L 313.195102 406.442782 L 550.868538 406.446283 L 788.541975 406.246691 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 4408 |
<defs>
|
| 4409 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4410 |
</defs>
|
| 4411 |
<g clip-path="url(#pbac879f81a)">
|
| 4412 |
<use ns4:href="#md7efaf3aec" x="75.521665" y="407.004793" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4413 |
+
<use ns4:href="#md7efaf3aec" x="313.195102" y="406.442782" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4414 |
+
<use ns4:href="#md7efaf3aec" x="550.868538" y="406.446283" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4415 |
+
<use ns4:href="#md7efaf3aec" x="788.541975" y="406.246691" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4416 |
</g>
|
| 4417 |
</g>
|
| 4418 |
<g id="series--torch-eager" class="series">
|
| 4419 |
+
<path d="M 75.521665 114.041778 L 313.195102 52.888032 L 550.868538 46.287288 L 788.541975 44.981181 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 4420 |
<defs>
|
| 4421 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4422 |
</defs>
|
| 4423 |
<g clip-path="url(#pbac879f81a)">
|
| 4424 |
+
<use ns4:href="#m9b8c54d372" x="75.521665" y="114.041778" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4425 |
+
<use ns4:href="#m9b8c54d372" x="313.195102" y="52.888032" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4426 |
+
<use ns4:href="#m9b8c54d372" x="550.868538" y="46.287288" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4427 |
<use ns4:href="#m9b8c54d372" x="788.541975" y="44.981181" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4428 |
</g>
|
| 4429 |
</g>
|
flash_attn/impls/artifacts/benchmark/attention.jsonl
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
-
{"ts": "2025-12-
|
| 2 |
-
{"ts": "2025-12-
|
| 3 |
-
{"ts": "2025-12-
|
| 4 |
-
{"ts": "2025-12-
|
| 5 |
-
{"ts": "2025-12-
|
| 6 |
-
{"ts": "2025-12-
|
|
|
|
| 1 |
+
{"ts": "2025-12-19T23:02:00Z", "run": "d08cdddcbd814f0a98850e99a3cc8f3c", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "cuda_attn_L128_bfloat16", "batch": 1, "seq_len": 4224, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.2105559999326942, "p50": 1.2135660001604265, "p90": 1.214856999922631, "mean": 1.213000200004899, "iqr": 0.0038309999581542797, "raw_times": [1.2149960000442661, 1.2110259999644768, 1.2105559999326942, 1.2135660001604265, 1.214856999922631], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.2067360000855842, "peak_bytes": 295567360, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000354766845703125, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
| 2 |
+
{"ts": "2025-12-19T23:02:00Z", "run": "d08cdddcbd814f0a98850e99a3cc8f3c", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "cuda_attn_L256_bfloat16", "batch": 1, "seq_len": 4352, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.2594280001394509, "p50": 1.2752780000937491, "p90": 1.2771070000781037, "mean": 1.2731776000691752, "iqr": 0.010640000027706265, "raw_times": [1.2752780000937491, 1.2664670000503975, 1.2771070000781037, 1.2594280001394509, 1.287607999984175], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.2718570001197804, "peak_bytes": 304742400, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000354766845703125, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
| 3 |
+
{"ts": "2025-12-19T23:02:00Z", "run": "d08cdddcbd814f0a98850e99a3cc8f3c", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "cuda_attn_L320_bfloat16", "batch": 1, "seq_len": 4416, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.2881479999578005, "p50": 1.2985280000066268, "p90": 1.2987470001917245, "mean": 1.2992600000416132, "iqr": 0.008449000233667903, "raw_times": [1.2902979999580566, 1.2881479999578005, 1.2985280000066268, 1.2987470001917245, 1.3205790000938578], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.2919179998789332, "peak_bytes": 307494912, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000354766845703125, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
| 4 |
+
{"ts": "2025-12-19T23:02:00Z", "run": "d08cdddcbd814f0a98850e99a3cc8f3c", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "cuda_attn_L384_bfloat16", "batch": 1, "seq_len": 4480, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.32487900009437, "p50": 1.3346000000638014, "p90": 1.337429000159318, "mean": 1.3341430000764376, "iqr": 0.006821000170020852, "raw_times": [1.32487900009437, 1.337429000159318, 1.3346000000638014, 1.3306079999892972, 1.3431990000754013], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.327048999883118, "peak_bytes": 311296000, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000354766845703125, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
| 5 |
+
{"ts": "2025-12-19T23:02:00Z", "run": "d08cdddcbd814f0a98850e99a3cc8f3c", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "cuda_attn_L448_bfloat16", "batch": 1, "seq_len": 4544, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.4795820000017557, "p50": 1.4878020001560799, "p90": 1.4919819998340245, "mean": 1.4892582000356924, "iqr": 0.004879999778495403, "raw_times": [1.4795820000017557, 1.4919819998340245, 1.487102000055529, 1.499823000131073, 1.4878020001560799], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.4706619999742543, "peak_bytes": 315621376, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000354766845703125, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
| 6 |
+
{"ts": "2025-12-19T23:02:00Z", "run": "d08cdddcbd814f0a98850e99a3cc8f3c", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "cuda_attn_L512_bfloat16", "batch": 1, "seq_len": 4608, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.507972999888807, "p50": 1.5174029999798222, "p90": 1.518043000032776, "mean": 1.5156109999679757, "iqr": 0.005300000111674308, "raw_times": [1.518043000032776, 1.5174029999798222, 1.5218930000173714, 1.507972999888807, 1.5127429999211017], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.517042999921614, "peak_bytes": 319946752, "ok": true, "absmax": 0.125, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.125, "mae": 0.0003566741943359375, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
flash_attn/impls/cells/benchmark.py
CHANGED
|
@@ -4,7 +4,6 @@
|
|
| 4 |
# "numpy",
|
| 5 |
# "torch==2.8.0",
|
| 6 |
# "kernels-benchmark-tools",
|
| 7 |
-
# "xformers",
|
| 8 |
# ]
|
| 9 |
#
|
| 10 |
# [tool.uv.sources]
|
|
@@ -13,18 +12,18 @@
|
|
| 13 |
import torch
|
| 14 |
import sys
|
| 15 |
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
|
| 16 |
-
import xformers.ops as xops
|
| 17 |
|
| 18 |
|
| 19 |
-
def
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
|
|
|
| 23 |
|
| 24 |
|
| 25 |
run_benchmark(
|
| 26 |
kernel_type=KernelTypeEnum.ATTENTION,
|
| 27 |
-
impl_name="
|
| 28 |
-
impl_tags={"family": "
|
| 29 |
-
impl_func=
|
| 30 |
)
|
|
|
|
| 4 |
# "numpy",
|
| 5 |
# "torch==2.8.0",
|
| 6 |
# "kernels-benchmark-tools",
|
|
|
|
| 7 |
# ]
|
| 8 |
#
|
| 9 |
# [tool.uv.sources]
|
|
|
|
| 12 |
import torch
|
| 13 |
import sys
|
| 14 |
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
|
|
|
|
| 15 |
|
| 16 |
|
| 17 |
+
def torch_flash(q, k, v):
|
| 18 |
+
qt, kt, vt = (x.transpose(1, 2).contiguous() for x in (q, k, v))
|
| 19 |
+
with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.FLASH_ATTENTION):
|
| 20 |
+
o = torch.nn.functional.scaled_dot_product_attention(qt, kt, vt)
|
| 21 |
+
return o.transpose(1, 2).contiguous()
|
| 22 |
|
| 23 |
|
| 24 |
run_benchmark(
|
| 25 |
kernel_type=KernelTypeEnum.ATTENTION,
|
| 26 |
+
impl_name="torch_flash_ma",
|
| 27 |
+
impl_tags={"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"},
|
| 28 |
+
impl_func=torch_flash,
|
| 29 |
)
|
flash_attn/impls/flash_attention.html
CHANGED
|
@@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 3888 |
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3889 |
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3890 |
</span> |
|
| 3891 |
-
Cell: nv | 0.
|
| 3892 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3893 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3894 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3905,7 +3905,7 @@ Cell: nv | 0.25s
|
|
| 3905 |
</div>
|
| 3906 |
</div>
|
| 3907 |
<div id="output-nv" class="cell-output">
|
| 3908 |
-
<div class="cell-stdout"><pre class="stdout-text">Fri Dec 19
|
| 3909 |
+-----------------------------------------------------------------------------------------+
|
| 3910 |
| NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 |
|
| 3911 |
+-----------------------------------------+------------------------+----------------------+
|
|
@@ -3914,7 +3914,7 @@ Cell: nv | 0.25s
|
|
| 3914 |
| | | MIG M. |
|
| 3915 |
|=========================================+========================+======================|
|
| 3916 |
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
|
| 3917 |
-
| N/A
|
| 3918 |
| | | N/A |
|
| 3919 |
+-----------------------------------------+------------------------+----------------------+
|
| 3920 |
|
|
@@ -3938,7 +3938,7 @@ Cell: nv | 0.25s
|
|
| 3938 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3939 |
<span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3940 |
</span> |
|
| 3941 |
-
Cell: benchmark | 4.
|
| 3942 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3943 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3944 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3989,29 +3989,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L128_bfloat16
|
|
| 3989 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3990 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3991 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3992 |
-
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 3993 |
-
torch_flash_ma 6.
|
| 3994 |
-
aten::scaled_dot_product_attention 0.
|
| 3995 |
-
aten::_scaled_dot_product_flash_attention 0.
|
| 3996 |
-
aten::_flash_attention_forward 0.
|
| 3997 |
-
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3998 |
-
aten::contiguous 0.
|
| 3999 |
-
aten::clone 0.
|
| 4000 |
-
aten::copy_ 1.
|
| 4001 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4002 |
-
Activity Buffer Request 31.
|
| 4003 |
-
aten::transpose 1.
|
| 4004 |
-
aten::as_strided 0.
|
| 4005 |
-
aten::empty_like 0.
|
| 4006 |
-
aten::empty 1.
|
| 4007 |
-
cudaLaunchKernel 2.
|
| 4008 |
-
aten::empty_strided 0.
|
| 4009 |
-
cudaDeviceGetAttribute 0.04% 2.
|
| 4010 |
-
cudaFuncSetAttribute 0.
|
| 4011 |
-
cudaDeviceSynchronize 50.
|
| 4012 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4013 |
-
Self CPU time total: 5.
|
| 4014 |
-
Self CUDA time total: 3.
|
| 4015 |
|
| 4016 |
|
| 4017 |
|
|
@@ -4021,29 +4021,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L256_bfloat16
|
|
| 4021 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4022 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4023 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4024 |
-
torch_flash_ma 4.
|
| 4025 |
-
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4026 |
-
aten::scaled_dot_product_attention 0.
|
| 4027 |
-
aten::_scaled_dot_product_flash_attention 0.
|
| 4028 |
-
aten::_flash_attention_forward 0.
|
| 4029 |
-
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 4030 |
-
aten::contiguous 0.
|
| 4031 |
-
aten::clone 0.
|
| 4032 |
-
aten::copy_ 1.
|
| 4033 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4034 |
-
Activity Buffer Request
|
| 4035 |
-
aten::transpose 0.
|
| 4036 |
-
aten::as_strided 0.
|
| 4037 |
-
aten::empty_like 0.
|
| 4038 |
-
aten::empty 1.
|
| 4039 |
-
cudaLaunchKernel 1.
|
| 4040 |
-
aten::empty_strided 0.25% 13.
|
| 4041 |
-
cudaDeviceGetAttribute 0.03% 1.
|
| 4042 |
-
cudaFuncSetAttribute 0.
|
| 4043 |
-
cudaDeviceSynchronize 55.
|
| 4044 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4045 |
Self CPU time total: 5.534ms
|
| 4046 |
-
Self CUDA time total: 3.
|
| 4047 |
|
| 4048 |
|
| 4049 |
|
|
@@ -4053,29 +4053,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L320_bfloat16
|
|
| 4053 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4054 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4055 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4056 |
-
torch_flash_ma 4.
|
| 4057 |
-
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4058 |
-
aten::scaled_dot_product_attention 0.
|
| 4059 |
-
aten::_scaled_dot_product_flash_attention 0.
|
| 4060 |
-
aten::_flash_attention_forward 0.
|
| 4061 |
-
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4062 |
-
aten::contiguous 0.
|
| 4063 |
-
aten::clone 0.
|
| 4064 |
-
aten::copy_ 1.
|
| 4065 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4066 |
-
Activity Buffer Request 30.
|
| 4067 |
-
aten::transpose 0.
|
| 4068 |
-
aten::as_strided 0.
|
| 4069 |
-
aten::empty_like 0.
|
| 4070 |
-
aten::empty 1.
|
| 4071 |
-
cudaLaunchKernel 1.
|
| 4072 |
-
aten::empty_strided 0.28% 15.
|
| 4073 |
-
cudaDeviceGetAttribute 0.03% 1.
|
| 4074 |
-
cudaFuncSetAttribute 0.07% 3.
|
| 4075 |
-
cudaDeviceSynchronize
|
| 4076 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4077 |
-
Self CPU time total: 5.
|
| 4078 |
-
Self CUDA time total: 3.
|
| 4079 |
|
| 4080 |
|
| 4081 |
|
|
@@ -4085,29 +4085,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L384_bfloat16
|
|
| 4085 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4086 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4087 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4088 |
-
torch_flash_ma 4.
|
| 4089 |
-
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4090 |
-
aten::scaled_dot_product_attention 0.
|
| 4091 |
-
aten::_scaled_dot_product_flash_attention 0.
|
| 4092 |
-
aten::_flash_attention_forward 0.
|
| 4093 |
-
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4094 |
-
aten::contiguous 0.
|
| 4095 |
-
aten::clone 0.
|
| 4096 |
-
aten::copy_ 1.
|
| 4097 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4098 |
-
Activity Buffer Request
|
| 4099 |
-
aten::transpose 0.
|
| 4100 |
-
aten::as_strided 0.
|
| 4101 |
-
aten::empty_like 0.
|
| 4102 |
-
aten::empty 1.
|
| 4103 |
-
cudaLaunchKernel
|
| 4104 |
-
aten::empty_strided 0.
|
| 4105 |
-
cudaDeviceGetAttribute 0.03% 1.
|
| 4106 |
-
cudaFuncSetAttribute 0.
|
| 4107 |
-
cudaDeviceSynchronize
|
| 4108 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4109 |
-
Self CPU time total: 5.
|
| 4110 |
-
Self CUDA time total: 3.
|
| 4111 |
|
| 4112 |
|
| 4113 |
|
|
@@ -4117,29 +4117,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L448_bfloat16
|
|
| 4117 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4118 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4119 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4120 |
-
torch_flash_ma 4.
|
| 4121 |
-
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.
|
| 4122 |
-
aten::scaled_dot_product_attention 0.
|
| 4123 |
-
aten::_scaled_dot_product_flash_attention 0.
|
| 4124 |
-
aten::_flash_attention_forward 0.
|
| 4125 |
-
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4126 |
-
aten::contiguous 0.
|
| 4127 |
-
aten::clone 0.
|
| 4128 |
-
aten::copy_ 1.
|
| 4129 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4130 |
-
Activity Buffer Request
|
| 4131 |
-
aten::transpose 0.
|
| 4132 |
-
aten::as_strided 0.
|
| 4133 |
-
aten::empty_like 0.34% 21.
|
| 4134 |
-
aten::empty 1.
|
| 4135 |
-
cudaLaunchKernel 4.
|
| 4136 |
-
aten::empty_strided 0.
|
| 4137 |
-
cudaDeviceGetAttribute 0.
|
| 4138 |
-
cudaFuncSetAttribute 0.
|
| 4139 |
-
cudaDeviceSynchronize
|
| 4140 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4141 |
-
Self CPU time total: 6.
|
| 4142 |
-
Self CUDA time total: 4.
|
| 4143 |
|
| 4144 |
|
| 4145 |
|
|
@@ -4149,38 +4149,38 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L512_bfloat16
|
|
| 4149 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4150 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4151 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4152 |
-
torch_flash_ma
|
| 4153 |
-
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.
|
| 4154 |
-
aten::scaled_dot_product_attention 0.38%
|
| 4155 |
-
aten::_scaled_dot_product_flash_attention 0.
|
| 4156 |
-
aten::_flash_attention_forward 0.
|
| 4157 |
-
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4158 |
-
aten::contiguous 0.
|
| 4159 |
-
aten::clone 0.
|
| 4160 |
-
aten::copy_ 1.
|
| 4161 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 811.
|
| 4162 |
-
Activity Buffer Request 26.
|
| 4163 |
-
aten::transpose 0.
|
| 4164 |
-
aten::as_strided 0.
|
| 4165 |
-
aten::empty_like 0.
|
| 4166 |
-
aten::empty 1.
|
| 4167 |
-
cudaLaunchKernel 4.
|
| 4168 |
-
aten::empty_strided 0.23% 14.
|
| 4169 |
-
cudaDeviceGetAttribute 0.03% 1.
|
| 4170 |
-
cudaFuncSetAttribute 0.
|
| 4171 |
-
cudaDeviceSynchronize
|
| 4172 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4173 |
-
Self CPU time total: 6.
|
| 4174 |
-
Self CUDA time total: 4.
|
| 4175 |
|
| 4176 |
|
| 4177 |
impl wl p50(ms) ok
|
| 4178 |
torch_flash_ma cuda_attn_L128_bfloat16 1.21 True
|
| 4179 |
-
torch_flash_ma cuda_attn_L256_bfloat16 1.
|
| 4180 |
-
torch_flash_ma cuda_attn_L320_bfloat16 1.
|
| 4181 |
-
torch_flash_ma cuda_attn_L384_bfloat16 1.
|
| 4182 |
-
torch_flash_ma cuda_attn_L448_bfloat16 1.
|
| 4183 |
-
torch_flash_ma cuda_attn_L512_bfloat16 1.
|
| 4184 |
</pre></div>
|
| 4185 |
<div class="cell-artifacts">
|
| 4186 |
<h4>Artifacts:</h4>
|
|
|
|
| 3888 |
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3889 |
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3890 |
</span> |
|
| 3891 |
+
Cell: nv | 0.28s
|
| 3892 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3893 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3894 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3905 |
</div>
|
| 3906 |
</div>
|
| 3907 |
<div id="output-nv" class="cell-output">
|
| 3908 |
+
<div class="cell-stdout"><pre class="stdout-text">Fri Dec 19 23:02:01 2025
|
| 3909 |
+-----------------------------------------------------------------------------------------+
|
| 3910 |
| NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 |
|
| 3911 |
+-----------------------------------------+------------------------+----------------------+
|
|
|
|
| 3914 |
| | | MIG M. |
|
| 3915 |
|=========================================+========================+======================|
|
| 3916 |
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
|
| 3917 |
+
| N/A 42C P0 86W / 350W | 0MiB / 46068MiB | 20% Default |
|
| 3918 |
| | | N/A |
|
| 3919 |
+-----------------------------------------+------------------------+----------------------+
|
| 3920 |
|
|
|
|
| 3938 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3939 |
<span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3940 |
</span> |
|
| 3941 |
+
Cell: benchmark | 4.27s
|
| 3942 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3943 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3944 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3989 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3990 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3991 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3992 |
+
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.630ms 102.63% 3.630ms 3.630ms 1
|
| 3993 |
+
torch_flash_ma 6.38% 352.556us 49.11% 2.714ms 2.714ms 0.000us 0.00% 3.576ms 3.576ms 1
|
| 3994 |
+
aten::scaled_dot_product_attention 0.73% 40.491us 3.98% 220.075us 73.358us 0.000us 0.00% 2.821ms 940.462us 3
|
| 3995 |
+
aten::_scaled_dot_product_flash_attention 0.47% 25.779us 3.25% 179.584us 59.861us 0.000us 0.00% 2.821ms 940.462us 3
|
| 3996 |
+
aten::_flash_attention_forward 0.70% 38.829us 2.35% 129.692us 43.231us 2.821ms 79.77% 2.821ms 940.462us 3
|
| 3997 |
+
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.821ms 79.77% 2.821ms 940.462us 3
|
| 3998 |
+
aten::contiguous 0.22% 12.191us 37.53% 2.074ms 172.866us 0.000us 0.00% 755.108us 62.926us 12
|
| 3999 |
+
aten::clone 0.60% 33.381us 37.31% 2.062ms 171.850us 0.000us 0.00% 755.108us 62.926us 12
|
| 4000 |
+
aten::copy_ 1.61% 89.181us 35.26% 1.949ms 162.385us 715.299us 20.23% 755.108us 62.926us 12
|
| 4001 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 715.299us 20.23% 715.299us 59.608us 12
|
| 4002 |
+
Activity Buffer Request 31.81% 1.758ms 31.81% 1.758ms 1.758ms 39.809us 1.13% 39.809us 39.809us 1
|
| 4003 |
+
aten::transpose 1.21% 66.774us 1.65% 91.006us 3.792us 0.000us 0.00% 0.000us 0.000us 24
|
| 4004 |
+
aten::as_strided 0.44% 24.232us 0.44% 24.232us 1.010us 0.000us 0.00% 0.000us 0.000us 24
|
| 4005 |
+
aten::empty_like 0.44% 24.459us 1.87% 103.512us 6.901us 0.000us 0.00% 0.000us 0.000us 15
|
| 4006 |
+
aten::empty 1.67% 92.213us 1.67% 92.213us 3.842us 0.000us 0.00% 0.000us 0.000us 24
|
| 4007 |
+
cudaLaunchKernel 2.28% 126.282us 2.28% 126.282us 8.419us 0.000us 0.00% 0.000us 0.000us 15
|
| 4008 |
+
aten::empty_strided 0.31% 16.960us 0.31% 16.960us 5.653us 0.000us 0.00% 0.000us 0.000us 3
|
| 4009 |
+
cudaDeviceGetAttribute 0.04% 2.141us 0.04% 2.141us 0.357us 0.000us 0.00% 0.000us 0.000us 6
|
| 4010 |
+
cudaFuncSetAttribute 0.19% 10.441us 0.19% 10.441us 3.480us 0.000us 0.00% 0.000us 0.000us 3
|
| 4011 |
+
cudaDeviceSynchronize 50.89% 2.813ms 50.89% 2.813ms 2.813ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4012 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4013 |
+
Self CPU time total: 5.527ms
|
| 4014 |
+
Self CUDA time total: 3.537ms
|
| 4015 |
|
| 4016 |
|
| 4017 |
|
|
|
|
| 4021 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4022 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4023 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4024 |
+
torch_flash_ma 4.56% 252.356us 44.41% 2.457ms 2.457ms 0.000us 0.00% 3.793ms 3.793ms 1
|
| 4025 |
+
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.748ms 100.29% 3.748ms 3.748ms 1
|
| 4026 |
+
aten::scaled_dot_product_attention 0.44% 24.090us 3.37% 186.293us 62.098us 0.000us 0.00% 2.975ms 991.820us 3
|
| 4027 |
+
aten::_scaled_dot_product_flash_attention 0.34% 18.721us 2.93% 162.203us 54.068us 0.000us 0.00% 2.975ms 991.820us 3
|
| 4028 |
+
aten::_flash_attention_forward 0.77% 42.568us 2.18% 120.522us 40.174us 2.975ms 79.63% 2.975ms 991.820us 3
|
| 4029 |
+
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.975ms 79.63% 2.975ms 991.820us 3
|
| 4030 |
+
aten::contiguous 0.18% 9.899us 35.65% 1.973ms 164.423us 0.000us 0.00% 817.633us 68.136us 12
|
| 4031 |
+
aten::clone 0.53% 29.604us 35.48% 1.963ms 163.598us 0.000us 0.00% 817.633us 68.136us 12
|
| 4032 |
+
aten::copy_ 1.46% 80.732us 33.77% 1.869ms 155.723us 761.377us 20.37% 817.633us 68.136us 12
|
| 4033 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 761.377us 20.37% 761.377us 63.448us 12
|
| 4034 |
+
Activity Buffer Request 30.82% 1.705ms 30.82% 1.705ms 1.705ms 56.256us 1.51% 56.256us 56.256us 1
|
| 4035 |
+
aten::transpose 0.91% 50.232us 1.24% 68.680us 2.862us 0.000us 0.00% 0.000us 0.000us 24
|
| 4036 |
+
aten::as_strided 0.33% 18.448us 0.33% 18.448us 0.769us 0.000us 0.00% 0.000us 0.000us 24
|
| 4037 |
+
aten::empty_like 0.42% 23.239us 1.52% 84.240us 5.616us 0.000us 0.00% 0.000us 0.000us 15
|
| 4038 |
+
aten::empty 1.37% 76.011us 1.37% 76.011us 3.167us 0.000us 0.00% 0.000us 0.000us 24
|
| 4039 |
+
cudaLaunchKernel 1.93% 106.693us 1.93% 106.693us 7.113us 0.000us 0.00% 0.000us 0.000us 15
|
| 4040 |
+
aten::empty_strided 0.25% 13.951us 0.25% 13.951us 4.650us 0.000us 0.00% 0.000us 0.000us 3
|
| 4041 |
+
cudaDeviceGetAttribute 0.03% 1.720us 0.03% 1.720us 0.287us 0.000us 0.00% 0.000us 0.000us 6
|
| 4042 |
+
cudaFuncSetAttribute 0.07% 3.701us 0.07% 3.701us 1.234us 0.000us 0.00% 0.000us 0.000us 3
|
| 4043 |
+
cudaDeviceSynchronize 55.59% 3.076ms 55.59% 3.076ms 3.076ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4044 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4045 |
Self CPU time total: 5.534ms
|
| 4046 |
+
Self CUDA time total: 3.737ms
|
| 4047 |
|
| 4048 |
|
| 4049 |
|
|
|
|
| 4053 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4054 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4055 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4056 |
+
torch_flash_ma 4.60% 257.767us 43.91% 2.459ms 2.459ms 0.000us 0.00% 3.868ms 3.868ms 1
|
| 4057 |
+
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.820ms 100.28% 3.820ms 3.820ms 1
|
| 4058 |
+
aten::scaled_dot_product_attention 0.42% 23.451us 3.31% 185.194us 61.731us 0.000us 0.00% 3.025ms 1.008ms 3
|
| 4059 |
+
aten::_scaled_dot_product_flash_attention 0.35% 19.728us 2.89% 161.743us 53.914us 0.000us 0.00% 3.025ms 1.008ms 3
|
| 4060 |
+
aten::_flash_attention_forward 0.72% 40.171us 2.13% 119.133us 39.711us 3.025ms 79.42% 3.025ms 1.008ms 3
|
| 4061 |
+
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.025ms 79.42% 3.025ms 1.008ms 3
|
| 4062 |
+
aten::contiguous 0.17% 9.680us 35.16% 1.969ms 164.068us 0.000us 0.00% 843.394us 70.283us 12
|
| 4063 |
+
aten::clone 0.57% 32.118us 34.99% 1.959ms 163.261us 0.000us 0.00% 843.394us 70.283us 12
|
| 4064 |
+
aten::copy_ 1.44% 80.682us 33.24% 1.861ms 155.084us 783.938us 20.58% 843.394us 70.283us 12
|
| 4065 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 783.938us 20.58% 783.938us 65.328us 12
|
| 4066 |
+
Activity Buffer Request 30.29% 1.696ms 30.29% 1.696ms 1.696ms 59.456us 1.56% 59.456us 59.456us 1
|
| 4067 |
+
aten::transpose 0.92% 51.272us 1.25% 69.843us 2.910us 0.000us 0.00% 0.000us 0.000us 24
|
| 4068 |
+
aten::as_strided 0.33% 18.571us 0.33% 18.571us 0.774us 0.000us 0.00% 0.000us 0.000us 24
|
| 4069 |
+
aten::empty_like 0.37% 20.823us 1.56% 87.172us 5.811us 0.000us 0.00% 0.000us 0.000us 15
|
| 4070 |
+
aten::empty 1.42% 79.691us 1.42% 79.691us 3.320us 0.000us 0.00% 0.000us 0.000us 24
|
| 4071 |
+
cudaLaunchKernel 1.92% 107.532us 1.92% 107.532us 7.169us 0.000us 0.00% 0.000us 0.000us 15
|
| 4072 |
+
aten::empty_strided 0.28% 15.890us 0.28% 15.890us 5.297us 0.000us 0.00% 0.000us 0.000us 3
|
| 4073 |
+
cudaDeviceGetAttribute 0.03% 1.700us 0.03% 1.700us 0.283us 0.000us 0.00% 0.000us 0.000us 6
|
| 4074 |
+
cudaFuncSetAttribute 0.07% 3.820us 0.07% 3.820us 1.273us 0.000us 0.00% 0.000us 0.000us 3
|
| 4075 |
+
cudaDeviceSynchronize 56.09% 3.140ms 56.09% 3.140ms 3.140ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4076 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4077 |
+
Self CPU time total: 5.599ms
|
| 4078 |
+
Self CUDA time total: 3.809ms
|
| 4079 |
|
| 4080 |
|
| 4081 |
|
|
|
|
| 4085 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4086 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4087 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4088 |
+
torch_flash_ma 4.31% 257.497us 46.49% 2.779ms 2.779ms 0.000us 0.00% 3.937ms 3.937ms 1
|
| 4089 |
+
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.890ms 100.28% 3.890ms 3.890ms 1
|
| 4090 |
+
aten::scaled_dot_product_attention 0.42% 25.301us 3.15% 188.584us 62.861us 0.000us 0.00% 3.098ms 1.033ms 3
|
| 4091 |
+
aten::_scaled_dot_product_flash_attention 0.34% 20.249us 2.73% 163.283us 54.428us 0.000us 0.00% 3.098ms 1.033ms 3
|
| 4092 |
+
aten::_flash_attention_forward 0.67% 40.000us 1.99% 118.763us 39.588us 3.098ms 79.85% 3.098ms 1.033ms 3
|
| 4093 |
+
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.098ms 79.85% 3.098ms 1.033ms 3
|
| 4094 |
+
aten::contiguous 0.17% 10.243us 38.20% 2.284ms 190.292us 0.000us 0.00% 838.882us 69.907us 12
|
| 4095 |
+
aten::clone 0.53% 31.478us 38.03% 2.273ms 189.439us 0.000us 0.00% 838.882us 69.907us 12
|
| 4096 |
+
aten::copy_ 1.35% 80.860us 36.38% 2.175ms 181.246us 781.730us 20.15% 838.882us 69.907us 12
|
| 4097 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 781.730us 20.15% 781.730us 65.144us 12
|
| 4098 |
+
Activity Buffer Request 30.50% 1.823ms 30.50% 1.823ms 1.823ms 57.152us 1.47% 57.152us 57.152us 1
|
| 4099 |
+
aten::transpose 0.90% 53.920us 1.24% 74.061us 3.086us 0.000us 0.00% 0.000us 0.000us 24
|
| 4100 |
+
aten::as_strided 0.34% 20.141us 0.34% 20.141us 0.839us 0.000us 0.00% 0.000us 0.000us 24
|
| 4101 |
+
aten::empty_like 0.36% 21.362us 1.47% 87.614us 5.841us 0.000us 0.00% 0.000us 0.000us 15
|
| 4102 |
+
aten::empty 1.34% 79.813us 1.34% 79.813us 3.326us 0.000us 0.00% 0.000us 0.000us 24
|
| 4103 |
+
cudaLaunchKernel 4.91% 293.806us 4.91% 293.806us 19.587us 0.000us 0.00% 0.000us 0.000us 15
|
| 4104 |
+
aten::empty_strided 0.26% 15.670us 0.26% 15.670us 5.223us 0.000us 0.00% 0.000us 0.000us 3
|
| 4105 |
+
cudaDeviceGetAttribute 0.03% 1.659us 0.03% 1.659us 0.276us 0.000us 0.00% 0.000us 0.000us 6
|
| 4106 |
+
cudaFuncSetAttribute 0.07% 3.921us 0.07% 3.921us 1.307us 0.000us 0.00% 0.000us 0.000us 3
|
| 4107 |
+
cudaDeviceSynchronize 53.51% 3.199ms 53.51% 3.199ms 3.199ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4108 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4109 |
+
Self CPU time total: 5.978ms
|
| 4110 |
+
Self CUDA time total: 3.880ms
|
| 4111 |
|
| 4112 |
|
| 4113 |
|
|
|
|
| 4117 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4118 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4119 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4120 |
+
torch_flash_ma 4.81% 305.765us 42.59% 2.710ms 2.710ms 0.000us 0.00% 4.451ms 4.451ms 1
|
| 4121 |
+
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.400ms 100.25% 4.400ms 4.400ms 1
|
| 4122 |
+
aten::scaled_dot_product_attention 0.38% 24.020us 2.97% 188.924us 62.975us 0.000us 0.00% 3.579ms 1.193ms 3
|
| 4123 |
+
aten::_scaled_dot_product_flash_attention 0.31% 19.571us 2.59% 164.904us 54.968us 0.000us 0.00% 3.579ms 1.193ms 3
|
| 4124 |
+
aten::_flash_attention_forward 0.68% 43.108us 1.92% 122.012us 40.671us 3.579ms 81.54% 3.579ms 1.193ms 3
|
| 4125 |
+
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.579ms 81.54% 3.579ms 1.193ms 3
|
| 4126 |
+
aten::contiguous 0.15% 9.589us 34.07% 2.168ms 180.670us 0.000us 0.00% 871.616us 72.635us 12
|
| 4127 |
+
aten::clone 0.54% 34.360us 33.92% 2.158ms 179.871us 0.000us 0.00% 871.616us 72.635us 12
|
| 4128 |
+
aten::copy_ 1.33% 84.914us 32.32% 2.057ms 171.390us 810.495us 18.46% 871.616us 72.635us 12
|
| 4129 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 810.495us 18.46% 810.495us 67.541us 12
|
| 4130 |
+
Activity Buffer Request 26.57% 1.691ms 26.57% 1.691ms 1.691ms 61.121us 1.39% 61.121us 61.121us 1
|
| 4131 |
+
aten::transpose 0.82% 51.874us 1.12% 70.963us 2.957us 0.000us 0.00% 0.000us 0.000us 24
|
| 4132 |
+
aten::as_strided 0.30% 19.089us 0.30% 19.089us 0.795us 0.000us 0.00% 0.000us 0.000us 24
|
| 4133 |
+
aten::empty_like 0.34% 21.431us 1.39% 88.502us 5.900us 0.000us 0.00% 0.000us 0.000us 15
|
| 4134 |
+
aten::empty 1.27% 80.674us 1.27% 80.674us 3.361us 0.000us 0.00% 0.000us 0.000us 24
|
| 4135 |
+
cudaLaunchKernel 4.78% 304.046us 4.78% 304.046us 20.270us 0.000us 0.00% 0.000us 0.000us 15
|
| 4136 |
+
aten::empty_strided 0.25% 15.780us 0.25% 15.780us 5.260us 0.000us 0.00% 0.000us 0.000us 3
|
| 4137 |
+
cudaDeviceGetAttribute 0.02% 1.550us 0.02% 1.550us 0.258us 0.000us 0.00% 0.000us 0.000us 6
|
| 4138 |
+
cudaFuncSetAttribute 0.06% 3.750us 0.06% 3.750us 1.250us 0.000us 0.00% 0.000us 0.000us 3
|
| 4139 |
+
cudaDeviceSynchronize 57.41% 3.653ms 57.41% 3.653ms 3.653ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4140 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4141 |
+
Self CPU time total: 6.363ms
|
| 4142 |
+
Self CUDA time total: 4.389ms
|
| 4143 |
|
| 4144 |
|
| 4145 |
|
|
|
|
| 4149 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4150 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4151 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4152 |
+
torch_flash_ma 3.57% 230.352us 40.90% 2.641ms 2.641ms 0.000us 0.00% 4.540ms 4.540ms 1
|
| 4153 |
+
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.489ms 100.24% 4.489ms 4.489ms 1
|
| 4154 |
+
aten::scaled_dot_product_attention 0.38% 24.551us 2.77% 178.785us 59.595us 0.000us 0.00% 3.667ms 1.222ms 3
|
| 4155 |
+
aten::_scaled_dot_product_flash_attention 0.30% 19.129us 2.39% 154.234us 51.411us 0.000us 0.00% 3.667ms 1.222ms 3
|
| 4156 |
+
aten::_flash_attention_forward 0.55% 35.197us 1.71% 110.631us 36.877us 3.667ms 81.88% 3.667ms 1.222ms 3
|
| 4157 |
+
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.667ms 81.88% 3.667ms 1.222ms 3
|
| 4158 |
+
aten::contiguous 0.16% 10.271us 33.78% 2.181ms 181.772us 0.000us 0.00% 873.057us 72.755us 12
|
| 4159 |
+
aten::clone 0.44% 28.652us 33.62% 2.171ms 180.916us 0.000us 0.00% 873.057us 72.755us 12
|
| 4160 |
+
aten::copy_ 1.30% 83.713us 32.19% 2.078ms 173.208us 811.457us 18.12% 873.057us 72.755us 12
|
| 4161 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 811.457us 18.12% 811.457us 67.621us 12
|
| 4162 |
+
Activity Buffer Request 26.83% 1.733ms 26.83% 1.733ms 1.733ms 61.600us 1.38% 61.600us 61.600us 1
|
| 4163 |
+
aten::transpose 0.85% 55.123us 1.17% 75.416us 3.142us 0.000us 0.00% 0.000us 0.000us 24
|
| 4164 |
+
aten::as_strided 0.31% 20.293us 0.31% 20.293us 0.846us 0.000us 0.00% 0.000us 0.000us 24
|
| 4165 |
+
aten::empty_like 0.30% 19.350us 1.29% 83.431us 5.562us 0.000us 0.00% 0.000us 0.000us 15
|
| 4166 |
+
aten::empty 1.21% 78.153us 1.21% 78.153us 3.256us 0.000us 0.00% 0.000us 0.000us 24
|
| 4167 |
+
cudaLaunchKernel 4.40% 284.286us 4.40% 284.286us 18.952us 0.000us 0.00% 0.000us 0.000us 15
|
| 4168 |
+
aten::empty_strided 0.23% 14.650us 0.23% 14.650us 4.883us 0.000us 0.00% 0.000us 0.000us 3
|
| 4169 |
+
cudaDeviceGetAttribute 0.03% 1.640us 0.03% 1.640us 0.273us 0.000us 0.00% 0.000us 0.000us 6
|
| 4170 |
+
cudaFuncSetAttribute 0.05% 3.450us 0.05% 3.450us 1.150us 0.000us 0.00% 0.000us 0.000us 3
|
| 4171 |
+
cudaDeviceSynchronize 59.10% 3.816ms 59.10% 3.816ms 3.816ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4172 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4173 |
+
Self CPU time total: 6.458ms
|
| 4174 |
+
Self CUDA time total: 4.478ms
|
| 4175 |
|
| 4176 |
|
| 4177 |
impl wl p50(ms) ok
|
| 4178 |
torch_flash_ma cuda_attn_L128_bfloat16 1.21 True
|
| 4179 |
+
torch_flash_ma cuda_attn_L256_bfloat16 1.28 True
|
| 4180 |
+
torch_flash_ma cuda_attn_L320_bfloat16 1.30 True
|
| 4181 |
+
torch_flash_ma cuda_attn_L384_bfloat16 1.33 True
|
| 4182 |
+
torch_flash_ma cuda_attn_L448_bfloat16 1.49 True
|
| 4183 |
+
torch_flash_ma cuda_attn_L512_bfloat16 1.52 True
|
| 4184 |
</pre></div>
|
| 4185 |
<div class="cell-artifacts">
|
| 4186 |
<h4>Artifacts:</h4>
|
flash_attn/impls/hf_kernels_flash_attn.html
CHANGED
|
@@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 3888 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3889 |
<span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3890 |
</span> |
|
| 3891 |
-
Cell: benchmark | 5.
|
| 3892 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3893 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3894 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3943,21 +3943,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L128_bfloat16
|
|
| 3943 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3944 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3945 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3946 |
-
hf_kernels_flash_attn 3.
|
| 3947 |
-
_flash_attn_9e27194::fwd 1.
|
| 3948 |
-
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3949 |
-
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3950 |
-
Activity Buffer Request
|
| 3951 |
-
cudaDeviceGetAttribute 0.
|
| 3952 |
-
aten::empty_like 0.
|
| 3953 |
-
aten::empty_strided 0.
|
| 3954 |
-
aten::empty 0.
|
| 3955 |
-
cudaFuncSetAttribute 0.
|
| 3956 |
-
cudaLaunchKernel 0.
|
| 3957 |
-
cudaDeviceSynchronize 55.
|
| 3958 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3959 |
-
Self CPU time total: 4.
|
| 3960 |
-
Self CUDA time total: 2.
|
| 3961 |
|
| 3962 |
|
| 3963 |
|
|
@@ -3967,21 +3967,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L256_bfloat16
|
|
| 3967 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3968 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3969 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3970 |
-
hf_kernels_flash_attn 1.
|
| 3971 |
-
_flash_attn_9e27194::fwd
|
| 3972 |
-
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3973 |
-
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3974 |
-
Activity Buffer Request 36.
|
| 3975 |
-
cudaDeviceGetAttribute 0.
|
| 3976 |
-
aten::empty_like 0.
|
| 3977 |
-
aten::empty_strided 0.
|
| 3978 |
-
aten::empty 0.
|
| 3979 |
-
cudaFuncSetAttribute 0.
|
| 3980 |
-
cudaLaunchKernel 0.
|
| 3981 |
-
cudaDeviceSynchronize 58.
|
| 3982 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3983 |
-
Self CPU time total: 4.
|
| 3984 |
-
Self CUDA time total: 2.
|
| 3985 |
|
| 3986 |
|
| 3987 |
|
|
@@ -3991,21 +3991,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L320_bfloat16
|
|
| 3991 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3992 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3993 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3994 |
-
hf_kernels_flash_attn 2.
|
| 3995 |
-
_flash_attn_9e27194::fwd 0.
|
| 3996 |
-
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 3997 |
-
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 3998 |
-
Activity Buffer Request 35.
|
| 3999 |
-
cudaDeviceGetAttribute 0.
|
| 4000 |
-
aten::empty_like 0.
|
| 4001 |
-
aten::empty_strided 0.
|
| 4002 |
-
aten::empty 0.
|
| 4003 |
-
cudaFuncSetAttribute 0.
|
| 4004 |
-
cudaLaunchKernel 0.
|
| 4005 |
-
cudaDeviceSynchronize 59.
|
| 4006 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4007 |
-
Self CPU time total: 4.
|
| 4008 |
-
Self CUDA time total: 3.
|
| 4009 |
|
| 4010 |
|
| 4011 |
|
|
@@ -4015,21 +4015,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L384_bfloat16
|
|
| 4015 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4016 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4017 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4018 |
-
hf_kernels_flash_attn
|
| 4019 |
-
_flash_attn_9e27194::fwd
|
| 4020 |
-
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4021 |
-
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4022 |
-
Activity Buffer Request 33.
|
| 4023 |
-
cudaDeviceGetAttribute 0.
|
| 4024 |
-
aten::empty_like 0.
|
| 4025 |
-
aten::empty_strided 0.
|
| 4026 |
-
aten::empty 0.
|
| 4027 |
-
cudaFuncSetAttribute 0.08%
|
| 4028 |
-
cudaLaunchKernel 4.
|
| 4029 |
-
cudaDeviceSynchronize 58.
|
| 4030 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4031 |
-
Self CPU time total: 5.
|
| 4032 |
-
Self CUDA time total: 3.
|
| 4033 |
|
| 4034 |
|
| 4035 |
|
|
@@ -4039,21 +4039,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L448_bfloat16
|
|
| 4039 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4040 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4041 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4042 |
-
hf_kernels_flash_attn 1.
|
| 4043 |
-
_flash_attn_9e27194::fwd 0.
|
| 4044 |
-
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4045 |
-
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4046 |
-
Activity Buffer Request
|
| 4047 |
-
cudaDeviceGetAttribute 0.07% 3.
|
| 4048 |
-
aten::empty_like 0.
|
| 4049 |
-
aten::empty_strided 0.
|
| 4050 |
-
aten::empty 0.
|
| 4051 |
-
cudaFuncSetAttribute 0.07% 3.
|
| 4052 |
-
cudaLaunchKernel 3.
|
| 4053 |
-
cudaDeviceSynchronize
|
| 4054 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4055 |
-
Self CPU time total: 5.
|
| 4056 |
-
Self CUDA time total: 3.
|
| 4057 |
|
| 4058 |
|
| 4059 |
|
|
@@ -4063,36 +4063,36 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L512_bfloat16
|
|
| 4063 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4064 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4065 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4066 |
-
hf_kernels_flash_attn 1.
|
| 4067 |
-
_flash_attn_9e27194::fwd 0.
|
| 4068 |
-
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4069 |
-
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4070 |
-
Activity Buffer Request 29.
|
| 4071 |
-
cudaDeviceGetAttribute 0.
|
| 4072 |
-
aten::empty_like 0.
|
| 4073 |
-
aten::empty_strided 0.
|
| 4074 |
-
aten::empty 0.
|
| 4075 |
-
cudaFuncSetAttribute 0.
|
| 4076 |
-
cudaLaunchKernel 3.
|
| 4077 |
-
cudaDeviceSynchronize 63.
|
| 4078 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4079 |
-
Self CPU time total: 5.
|
| 4080 |
-
Self CUDA time total: 3.
|
| 4081 |
|
| 4082 |
|
| 4083 |
impl wl p50(ms) ok
|
| 4084 |
-
hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.
|
| 4085 |
-
hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.
|
| 4086 |
-
hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.
|
| 4087 |
-
hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.
|
| 4088 |
-
hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.
|
| 4089 |
-
hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.
|
| 4090 |
</pre></div>
|
| 4091 |
<div class="cell-stderr">
|
| 4092 |
Fetching 20 files: 0%| | 0/20 [00:00<?, ?it/s]Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.
|
| 4093 |
|
| 4094 |
-
Fetching 20 files: 10%|█ | 2/20 [00:01<00:
|
| 4095 |
-
Fetching 20 files: 100%|██████████| 20/20 [00:01<00:00,
|
| 4096 |
</div>
|
| 4097 |
<div class="cell-artifacts">
|
| 4098 |
<h4>Artifacts:</h4>
|
|
|
|
| 3888 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3889 |
<span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3890 |
</span> |
|
| 3891 |
+
Cell: benchmark | 5.83s
|
| 3892 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3893 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3894 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3943 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3944 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3945 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3946 |
+
hf_kernels_flash_attn 3.32% 153.894us 44.44% 2.062ms 2.062ms 0.000us 0.00% 3.741ms 3.741ms 1
|
| 3947 |
+
_flash_attn_9e27194::fwd 1.40% 65.047us 41.12% 1.908ms 636.067us 2.793ms 100.00% 3.741ms 1.247ms 3
|
| 3948 |
+
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.795ms 100.05% 2.795ms 2.795ms 1
|
| 3949 |
+
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.793ms 100.00% 2.793ms 931.053us 3
|
| 3950 |
+
Activity Buffer Request 36.76% 1.706ms 36.76% 1.706ms 1.706ms 947.811us 33.93% 947.811us 947.811us 1
|
| 3951 |
+
cudaDeviceGetAttribute 0.09% 4.281us 0.09% 4.281us 0.285us 0.000us 0.00% 0.000us 0.000us 15
|
| 3952 |
+
aten::empty_like 0.44% 20.280us 1.17% 54.161us 18.054us 0.000us 0.00% 0.000us 0.000us 3
|
| 3953 |
+
aten::empty_strided 0.73% 33.881us 0.73% 33.881us 11.294us 0.000us 0.00% 0.000us 0.000us 3
|
| 3954 |
+
aten::empty 0.53% 24.740us 0.53% 24.740us 2.749us 0.000us 0.00% 0.000us 0.000us 9
|
| 3955 |
+
cudaFuncSetAttribute 0.29% 13.452us 0.29% 13.452us 4.484us 0.000us 0.00% 0.000us 0.000us 3
|
| 3956 |
+
cudaLaunchKernel 0.87% 40.582us 0.87% 40.582us 13.527us 0.000us 0.00% 0.000us 0.000us 3
|
| 3957 |
+
cudaDeviceSynchronize 55.56% 2.579ms 55.56% 2.579ms 2.579ms 0.000us 0.00% 0.000us 0.000us 1
|
| 3958 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3959 |
+
Self CPU time total: 4.641ms
|
| 3960 |
+
Self CUDA time total: 2.793ms
|
| 3961 |
|
| 3962 |
|
| 3963 |
|
|
|
|
| 3967 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3968 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3969 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3970 |
+
hf_kernels_flash_attn 1.87% 88.452us 41.15% 1.950ms 1.950ms 0.000us 0.00% 3.925ms 3.925ms 1
|
| 3971 |
+
_flash_attn_9e27194::fwd 0.93% 44.030us 39.28% 1.861ms 620.420us 2.932ms 100.00% 3.925ms 1.308ms 3
|
| 3972 |
+
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.933ms 100.05% 2.933ms 2.933ms 1
|
| 3973 |
+
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.932ms 100.00% 2.932ms 977.209us 3
|
| 3974 |
+
Activity Buffer Request 36.67% 1.738ms 36.67% 1.738ms 1.738ms 993.604us 33.89% 993.604us 993.604us 1
|
| 3975 |
+
cudaDeviceGetAttribute 0.08% 3.589us 0.08% 3.589us 0.239us 0.000us 0.00% 0.000us 0.000us 15
|
| 3976 |
+
aten::empty_like 0.16% 7.361us 0.48% 22.851us 7.617us 0.000us 0.00% 0.000us 0.000us 3
|
| 3977 |
+
aten::empty_strided 0.33% 15.490us 0.33% 15.490us 5.163us 0.000us 0.00% 0.000us 0.000us 3
|
| 3978 |
+
aten::empty 0.44% 21.020us 0.44% 21.020us 2.336us 0.000us 0.00% 0.000us 0.000us 9
|
| 3979 |
+
cudaFuncSetAttribute 0.07% 3.450us 0.07% 3.450us 1.150us 0.000us 0.00% 0.000us 0.000us 3
|
| 3980 |
+
cudaLaunchKernel 0.60% 28.443us 0.60% 28.443us 9.481us 0.000us 0.00% 0.000us 0.000us 3
|
| 3981 |
+
cudaDeviceSynchronize 58.85% 2.789ms 58.85% 2.789ms 2.789ms 0.000us 0.00% 0.000us 0.000us 1
|
| 3982 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3983 |
+
Self CPU time total: 4.739ms
|
| 3984 |
+
Self CUDA time total: 2.932ms
|
| 3985 |
|
| 3986 |
|
| 3987 |
|
|
|
|
| 3991 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3992 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3993 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3994 |
+
hf_kernels_flash_attn 2.16% 105.271us 40.16% 1.954ms 1.954ms 0.000us 0.00% 4.088ms 4.088ms 1
|
| 3995 |
+
_flash_attn_9e27194::fwd 0.92% 44.671us 38.00% 1.849ms 616.384us 3.054ms 100.00% 4.088ms 1.363ms 3
|
| 3996 |
+
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.056ms 100.05% 3.056ms 3.056ms 1
|
| 3997 |
+
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.054ms 100.00% 3.054ms 1.018ms 3
|
| 3998 |
+
Activity Buffer Request 35.43% 1.724ms 35.43% 1.724ms 1.724ms 1.034ms 33.84% 1.034ms 1.034ms 1
|
| 3999 |
+
cudaDeviceGetAttribute 0.08% 3.741us 0.08% 3.741us 0.249us 0.000us 0.00% 0.000us 0.000us 15
|
| 4000 |
+
aten::empty_like 0.15% 7.380us 0.46% 22.580us 7.527us 0.000us 0.00% 0.000us 0.000us 3
|
| 4001 |
+
aten::empty_strided 0.31% 15.200us 0.31% 15.200us 5.067us 0.000us 0.00% 0.000us 0.000us 3
|
| 4002 |
+
aten::empty 0.43% 20.900us 0.43% 20.900us 2.322us 0.000us 0.00% 0.000us 0.000us 9
|
| 4003 |
+
cudaFuncSetAttribute 0.07% 3.441us 0.07% 3.441us 1.147us 0.000us 0.00% 0.000us 0.000us 3
|
| 4004 |
+
cudaLaunchKernel 0.61% 29.670us 0.61% 29.670us 9.890us 0.000us 0.00% 0.000us 0.000us 3
|
| 4005 |
+
cudaDeviceSynchronize 59.84% 2.912ms 59.84% 2.912ms 2.912ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4006 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4007 |
+
Self CPU time total: 4.867ms
|
| 4008 |
+
Self CUDA time total: 3.054ms
|
| 4009 |
|
| 4010 |
|
| 4011 |
|
|
|
|
| 4015 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4016 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4017 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4018 |
+
hf_kernels_flash_attn 1.99% 101.304us 41.40% 2.105ms 2.105ms 0.000us 0.00% 4.182ms 4.182ms 1
|
| 4019 |
+
_flash_attn_9e27194::fwd 0.90% 45.720us 39.41% 2.004ms 667.947us 3.124ms 100.00% 4.182ms 1.394ms 3
|
| 4020 |
+
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.125ms 100.05% 3.125ms 3.125ms 1
|
| 4021 |
+
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.124ms 100.00% 3.124ms 1.041ms 3
|
| 4022 |
+
Activity Buffer Request 33.36% 1.696ms 33.36% 1.696ms 1.696ms 1.058ms 33.87% 1.058ms 1.058ms 1
|
| 4023 |
+
cudaDeviceGetAttribute 0.07% 3.650us 0.07% 3.650us 0.243us 0.000us 0.00% 0.000us 0.000us 15
|
| 4024 |
+
aten::empty_like 0.15% 7.421us 0.48% 24.201us 8.067us 0.000us 0.00% 0.000us 0.000us 3
|
| 4025 |
+
aten::empty_strided 0.33% 16.780us 0.33% 16.780us 5.593us 0.000us 0.00% 0.000us 0.000us 3
|
| 4026 |
+
aten::empty 0.42% 21.431us 0.42% 21.431us 2.381us 0.000us 0.00% 0.000us 0.000us 9
|
| 4027 |
+
cudaFuncSetAttribute 0.08% 4.070us 0.08% 4.070us 1.357us 0.000us 0.00% 0.000us 0.000us 3
|
| 4028 |
+
cudaLaunchKernel 4.10% 208.474us 4.10% 208.474us 69.491us 0.000us 0.00% 0.000us 0.000us 3
|
| 4029 |
+
cudaDeviceSynchronize 58.60% 2.980ms 58.60% 2.980ms 2.980ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4030 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4031 |
+
Self CPU time total: 5.085ms
|
| 4032 |
+
Self CUDA time total: 3.124ms
|
| 4033 |
|
| 4034 |
|
| 4035 |
|
|
|
|
| 4039 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4040 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4041 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4042 |
+
hf_kernels_flash_attn 1.92% 106.253us 37.17% 2.059ms 2.059ms 0.000us 0.00% 4.843ms 4.843ms 1
|
| 4043 |
+
_flash_attn_9e27194::fwd 0.86% 47.751us 35.25% 1.953ms 651.011us 3.628ms 100.00% 4.843ms 1.614ms 3
|
| 4044 |
+
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.629ms 100.04% 3.629ms 3.629ms 1
|
| 4045 |
+
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.628ms 100.00% 3.628ms 1.209ms 3
|
| 4046 |
+
Activity Buffer Request 30.14% 1.670ms 30.14% 1.670ms 1.670ms 1.215ms 33.50% 1.215ms 1.215ms 1
|
| 4047 |
+
cudaDeviceGetAttribute 0.07% 3.881us 0.07% 3.881us 0.259us 0.000us 0.00% 0.000us 0.000us 15
|
| 4048 |
+
aten::empty_like 0.14% 7.581us 0.43% 24.021us 8.007us 0.000us 0.00% 0.000us 0.000us 3
|
| 4049 |
+
aten::empty_strided 0.30% 16.440us 0.30% 16.440us 5.480us 0.000us 0.00% 0.000us 0.000us 3
|
| 4050 |
+
aten::empty 0.39% 21.710us 0.39% 21.710us 2.412us 0.000us 0.00% 0.000us 0.000us 9
|
| 4051 |
+
cudaFuncSetAttribute 0.07% 3.650us 0.07% 3.650us 1.217us 0.000us 0.00% 0.000us 0.000us 3
|
| 4052 |
+
cudaLaunchKernel 3.29% 182.154us 3.29% 182.154us 60.718us 0.000us 0.00% 0.000us 0.000us 3
|
| 4053 |
+
cudaDeviceSynchronize 62.83% 3.482ms 62.83% 3.482ms 3.482ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4054 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4055 |
+
Self CPU time total: 5.541ms
|
| 4056 |
+
Self CUDA time total: 3.628ms
|
| 4057 |
|
| 4058 |
|
| 4059 |
|
|
|
|
| 4063 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4064 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4065 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4066 |
+
hf_kernels_flash_attn 1.86% 105.712us 36.76% 2.092ms 2.092ms 0.000us 0.00% 4.990ms 4.990ms 1
|
| 4067 |
+
_flash_attn_9e27194::fwd 0.87% 49.631us 34.91% 1.986ms 661.968us 3.741ms 100.00% 4.990ms 1.663ms 3
|
| 4068 |
+
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.743ms 100.05% 3.743ms 3.743ms 1
|
| 4069 |
+
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.741ms 100.00% 3.741ms 1.247ms 3
|
| 4070 |
+
Activity Buffer Request 29.90% 1.701ms 29.90% 1.701ms 1.701ms 1.249ms 33.38% 1.249ms 1.249ms 1
|
| 4071 |
+
cudaDeviceGetAttribute 0.06% 3.600us 0.06% 3.600us 0.240us 0.000us 0.00% 0.000us 0.000us 15
|
| 4072 |
+
aten::empty_like 0.14% 7.780us 0.42% 24.150us 8.050us 0.000us 0.00% 0.000us 0.000us 3
|
| 4073 |
+
aten::empty_strided 0.29% 16.370us 0.29% 16.370us 5.457us 0.000us 0.00% 0.000us 0.000us 3
|
| 4074 |
+
aten::empty 0.38% 21.420us 0.38% 21.420us 2.380us 0.000us 0.00% 0.000us 0.000us 9
|
| 4075 |
+
cudaFuncSetAttribute 0.06% 3.580us 0.06% 3.580us 1.193us 0.000us 0.00% 0.000us 0.000us 3
|
| 4076 |
+
cudaLaunchKernel 3.20% 182.154us 3.20% 182.154us 60.718us 0.000us 0.00% 0.000us 0.000us 3
|
| 4077 |
+
cudaDeviceSynchronize 63.24% 3.598ms 63.24% 3.598ms 3.598ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4078 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4079 |
+
Self CPU time total: 5.689ms
|
| 4080 |
+
Self CUDA time total: 3.741ms
|
| 4081 |
|
| 4082 |
|
| 4083 |
impl wl p50(ms) ok
|
| 4084 |
+
hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.96 True
|
| 4085 |
+
hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.01 True
|
| 4086 |
+
hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.06 True
|
| 4087 |
+
hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.08 True
|
| 4088 |
+
hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.24 True
|
| 4089 |
+
hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.25 True
|
| 4090 |
</pre></div>
|
| 4091 |
<div class="cell-stderr">
|
| 4092 |
Fetching 20 files: 0%| | 0/20 [00:00<?, ?it/s]Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.
|
| 4093 |
|
| 4094 |
+
Fetching 20 files: 10%|█ | 2/20 [00:01<00:14, 1.28it/s]
|
| 4095 |
+
Fetching 20 files: 100%|██████████| 20/20 [00:01<00:00, 12.76it/s]
|
| 4096 |
</div>
|
| 4097 |
<div class="cell-artifacts">
|
| 4098 |
<h4>Artifacts:</h4>
|
flash_attn/impls/hf_kernels_flash_attn3.html
CHANGED
|
@@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 3888 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3889 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3890 |
</span> |
|
| 3891 |
-
Cell: benchmark |
|
| 3892 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3893 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3894 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3942,19 +3942,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L128_bfloat16
|
|
| 3942 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3943 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3944 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3945 |
-
hf_kernels_flash_attn3 3.
|
| 3946 |
-
FlashAttnFunc 2.
|
| 3947 |
-
_flash_attn3_1d39a44::fwd 1.
|
| 3948 |
-
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3949 |
-
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3950 |
-
Activity Buffer Request
|
| 3951 |
-
aten::empty 0.
|
| 3952 |
-
cudaFuncSetAttribute 0.32% 14.
|
| 3953 |
-
cudaLaunchKernel 1.
|
| 3954 |
-
cudaDeviceSynchronize
|
| 3955 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3956 |
-
Self CPU time total: 4.
|
| 3957 |
-
Self CUDA time total: 2.
|
| 3958 |
|
| 3959 |
|
| 3960 |
|
|
@@ -3964,19 +3964,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L256_bfloat16
|
|
| 3964 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3965 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3966 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3967 |
-
hf_kernels_flash_attn3 2.
|
| 3968 |
-
FlashAttnFunc 1.96%
|
| 3969 |
-
_flash_attn3_1d39a44::fwd 1.
|
| 3970 |
-
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3971 |
-
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3972 |
-
Activity Buffer Request 37.
|
| 3973 |
-
aten::empty 0.
|
| 3974 |
-
cudaFuncSetAttribute 0.
|
| 3975 |
-
cudaLaunchKernel 0.
|
| 3976 |
-
cudaDeviceSynchronize 55.
|
| 3977 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3978 |
-
Self CPU time total: 4.
|
| 3979 |
-
Self CUDA time total: 2.
|
| 3980 |
|
| 3981 |
|
| 3982 |
|
|
@@ -3986,19 +3986,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L320_bfloat16
|
|
| 3986 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3987 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3988 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3989 |
-
hf_kernels_flash_attn3 2.
|
| 3990 |
-
FlashAttnFunc 1.
|
| 3991 |
-
_flash_attn3_1d39a44::fwd 1.
|
| 3992 |
-
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3993 |
-
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3994 |
-
Activity Buffer Request
|
| 3995 |
-
aten::empty 0.
|
| 3996 |
-
cudaFuncSetAttribute 0.12% 5.
|
| 3997 |
-
cudaLaunchKernel 0.
|
| 3998 |
-
cudaDeviceSynchronize
|
| 3999 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4000 |
-
Self CPU time total: 4.
|
| 4001 |
-
Self CUDA time total: 2.
|
| 4002 |
|
| 4003 |
|
| 4004 |
|
|
@@ -4008,19 +4008,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L384_bfloat16
|
|
| 4008 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4009 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4010 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4011 |
-
hf_kernels_flash_attn3 2.
|
| 4012 |
-
FlashAttnFunc 1.
|
| 4013 |
-
_flash_attn3_1d39a44::fwd
|
| 4014 |
-
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 4015 |
-
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 4016 |
-
Activity Buffer Request
|
| 4017 |
-
aten::empty 0.
|
| 4018 |
-
cudaFuncSetAttribute 0.11% 5.
|
| 4019 |
-
cudaLaunchKernel
|
| 4020 |
-
cudaDeviceSynchronize
|
| 4021 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4022 |
-
Self CPU time total: 4.
|
| 4023 |
-
Self CUDA time total: 2.
|
| 4024 |
|
| 4025 |
|
| 4026 |
|
|
@@ -4030,19 +4030,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L448_bfloat16
|
|
| 4030 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4031 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4032 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4033 |
-
hf_kernels_flash_attn3
|
| 4034 |
-
FlashAttnFunc 1.
|
| 4035 |
-
_flash_attn3_1d39a44::fwd 0.
|
| 4036 |
-
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4037 |
-
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4038 |
-
Activity Buffer Request 31.
|
| 4039 |
-
aten::empty 0.
|
| 4040 |
-
cudaFuncSetAttribute 0.
|
| 4041 |
-
cudaLaunchKernel 3.
|
| 4042 |
-
cudaDeviceSynchronize
|
| 4043 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4044 |
-
Self CPU time total: 5.
|
| 4045 |
-
Self CUDA time total: 3.
|
| 4046 |
|
| 4047 |
|
| 4048 |
|
|
@@ -4052,40 +4052,39 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L512_bfloat16
|
|
| 4052 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4053 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4054 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4055 |
-
hf_kernels_flash_attn3
|
| 4056 |
-
FlashAttnFunc 1.
|
| 4057 |
-
_flash_attn3_1d39a44::fwd 0.
|
| 4058 |
-
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4059 |
-
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4060 |
-
Activity Buffer Request
|
| 4061 |
-
aten::empty 0.
|
| 4062 |
-
cudaFuncSetAttribute 0.
|
| 4063 |
-
cudaLaunchKernel 3.
|
| 4064 |
-
cudaDeviceSynchronize
|
| 4065 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4066 |
-
Self CPU time total: 5.
|
| 4067 |
-
Self CUDA time total: 3.
|
| 4068 |
|
| 4069 |
|
| 4070 |
impl wl p50(ms) ok
|
| 4071 |
-
hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.
|
| 4072 |
hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.98 True
|
| 4073 |
hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.01 True
|
| 4074 |
-
hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.
|
| 4075 |
-
hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.
|
| 4076 |
-
hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.
|
| 4077 |
</pre></div>
|
| 4078 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 4079 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4080 |
<div class="uv-logs-content" style="display: none;">
|
| 4081 |
-
Installed
|
| 4082 |
</div>
|
| 4083 |
</div>
|
| 4084 |
-
<div class="cell-stderr">Fetching 5 files: 0%| | 0/5 [00:00<?, ?it/s]
|
| 4085 |
-
|
| 4086 |
-
Fetching 5 files:
|
| 4087 |
-
Fetching 5 files:
|
| 4088 |
-
Fetching 5 files: 100%|██████████| 5/5 [00:01<00:00, 3.08it/s]</div>
|
| 4089 |
<div class="cell-artifacts">
|
| 4090 |
<h4>Artifacts:</h4>
|
| 4091 |
<a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
|
|
|
|
| 3888 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3889 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3890 |
</span> |
|
| 3891 |
+
Cell: benchmark | 10.25s
|
| 3892 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3893 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3894 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3942 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3943 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3944 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3945 |
+
hf_kernels_flash_attn3 3.82% 178.994us 47.00% 2.205ms 2.205ms 0.000us 0.00% 3.693ms 3.693ms 1
|
| 3946 |
+
FlashAttnFunc 2.66% 124.811us 43.19% 2.026ms 675.274us 0.000us 0.00% 3.693ms 1.231ms 3
|
| 3947 |
+
_flash_attn3_1d39a44::fwd 1.59% 74.650us 40.52% 1.901ms 633.671us 2.792ms 100.00% 3.693ms 1.231ms 3
|
| 3948 |
+
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.794ms 100.05% 2.794ms 2.794ms 1
|
| 3949 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.792ms 100.00% 2.792ms 930.698us 3
|
| 3950 |
+
Activity Buffer Request 36.63% 1.718ms 36.63% 1.718ms 1.718ms 900.576us 32.25% 900.576us 900.576us 1
|
| 3951 |
+
aten::empty 0.99% 46.443us 0.99% 46.443us 7.741us 0.000us 0.00% 0.000us 0.000us 6
|
| 3952 |
+
cudaFuncSetAttribute 0.32% 14.861us 0.32% 14.861us 4.954us 0.000us 0.00% 0.000us 0.000us 3
|
| 3953 |
+
cudaLaunchKernel 1.00% 46.891us 1.00% 46.891us 15.630us 0.000us 0.00% 0.000us 0.000us 3
|
| 3954 |
+
cudaDeviceSynchronize 53.00% 2.486ms 53.00% 2.486ms 2.486ms 0.000us 0.00% 0.000us 0.000us 1
|
| 3955 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3956 |
+
Self CPU time total: 4.691ms
|
| 3957 |
+
Self CUDA time total: 2.792ms
|
| 3958 |
|
| 3959 |
|
| 3960 |
|
|
|
|
| 3964 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3965 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3966 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3967 |
+
hf_kernels_flash_attn3 2.16% 100.183us 44.03% 2.042ms 2.042ms 0.000us 0.00% 3.752ms 3.752ms 1
|
| 3968 |
+
FlashAttnFunc 1.96% 91.001us 41.87% 1.942ms 647.204us 0.000us 0.00% 3.752ms 1.251ms 3
|
| 3969 |
+
_flash_attn3_1d39a44::fwd 1.03% 47.561us 39.91% 1.851ms 616.870us 2.814ms 100.00% 3.752ms 1.251ms 3
|
| 3970 |
+
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.816ms 100.05% 2.816ms 2.816ms 1
|
| 3971 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.814ms 100.00% 2.814ms 938.079us 3
|
| 3972 |
+
Activity Buffer Request 37.49% 1.739ms 37.49% 1.739ms 1.739ms 937.887us 33.33% 937.887us 937.887us 1
|
| 3973 |
+
aten::empty 0.58% 26.762us 0.58% 26.762us 4.460us 0.000us 0.00% 0.000us 0.000us 6
|
| 3974 |
+
cudaFuncSetAttribute 0.11% 5.220us 0.11% 5.220us 1.740us 0.000us 0.00% 0.000us 0.000us 3
|
| 3975 |
+
cudaLaunchKernel 0.70% 32.410us 0.70% 32.410us 10.803us 0.000us 0.00% 0.000us 0.000us 3
|
| 3976 |
+
cudaDeviceSynchronize 55.97% 2.595ms 55.97% 2.595ms 2.595ms 0.000us 0.00% 0.000us 0.000us 1
|
| 3977 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3978 |
+
Self CPU time total: 4.637ms
|
| 3979 |
+
Self CUDA time total: 2.814ms
|
| 3980 |
|
| 3981 |
|
| 3982 |
|
|
|
|
| 3986 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3987 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3988 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3989 |
+
hf_kernels_flash_attn3 2.13% 100.213us 42.34% 1.994ms 1.994ms 0.000us 0.00% 3.924ms 3.924ms 1
|
| 3990 |
+
FlashAttnFunc 1.82% 85.940us 40.21% 1.894ms 631.253us 0.000us 0.00% 3.924ms 1.308ms 3
|
| 3991 |
+
_flash_attn3_1d39a44::fwd 1.03% 48.325us 38.38% 1.808ms 602.607us 2.927ms 100.00% 3.924ms 1.308ms 3
|
| 3992 |
+
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.929ms 100.05% 2.929ms 2.929ms 1
|
| 3993 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.927ms 100.00% 2.927ms 975.684us 3
|
| 3994 |
+
Activity Buffer Request 36.02% 1.697ms 36.02% 1.697ms 1.697ms 997.252us 34.07% 997.252us 997.252us 1
|
| 3995 |
+
aten::empty 0.56% 26.419us 0.56% 26.419us 4.403us 0.000us 0.00% 0.000us 0.000us 6
|
| 3996 |
+
cudaFuncSetAttribute 0.12% 5.490us 0.12% 5.490us 1.830us 0.000us 0.00% 0.000us 0.000us 3
|
| 3997 |
+
cudaLaunchKernel 0.66% 31.020us 0.66% 31.020us 10.340us 0.000us 0.00% 0.000us 0.000us 3
|
| 3998 |
+
cudaDeviceSynchronize 57.66% 2.716ms 57.66% 2.716ms 2.716ms 0.000us 0.00% 0.000us 0.000us 1
|
| 3999 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4000 |
+
Self CPU time total: 4.710ms
|
| 4001 |
+
Self CUDA time total: 2.927ms
|
| 4002 |
|
| 4003 |
|
| 4004 |
|
|
|
|
| 4008 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4009 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4010 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4011 |
+
hf_kernels_flash_attn3 2.19% 98.471us 39.26% 1.764ms 1.764ms 0.000us 0.00% 3.945ms 3.945ms 1
|
| 4012 |
+
FlashAttnFunc 1.97% 88.443us 37.06% 1.666ms 555.216us 0.000us 0.00% 3.945ms 1.315ms 3
|
| 4013 |
+
_flash_attn3_1d39a44::fwd 1.11% 49.881us 35.10% 1.577ms 525.735us 2.942ms 100.00% 3.945ms 1.315ms 3
|
| 4014 |
+
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.943ms 100.05% 2.943ms 2.943ms 1
|
| 4015 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.942ms 100.00% 2.942ms 980.556us 3
|
| 4016 |
+
Activity Buffer Request 27.81% 1.250ms 27.81% 1.250ms 1.250ms 1.003ms 34.09% 1.003ms 1.003ms 1
|
| 4017 |
+
aten::empty 0.60% 26.780us 0.60% 26.780us 4.463us 0.000us 0.00% 0.000us 0.000us 6
|
| 4018 |
+
cudaFuncSetAttribute 0.11% 5.141us 0.11% 5.141us 1.714us 0.000us 0.00% 0.000us 0.000us 3
|
| 4019 |
+
cudaLaunchKernel 5.46% 245.555us 5.46% 245.555us 81.852us 0.000us 0.00% 0.000us 0.000us 3
|
| 4020 |
+
cudaDeviceSynchronize 60.74% 2.730ms 60.74% 2.730ms 2.730ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4021 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4022 |
+
Self CPU time total: 4.494ms
|
| 4023 |
+
Self CUDA time total: 2.942ms
|
| 4024 |
|
| 4025 |
|
| 4026 |
|
|
|
|
| 4030 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4031 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4032 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4033 |
+
hf_kernels_flash_attn3 1.83% 100.852us 39.93% 2.202ms 2.202ms 0.000us 0.00% 4.714ms 4.714ms 1
|
| 4034 |
+
FlashAttnFunc 1.62% 89.332us 38.10% 2.101ms 700.422us 0.000us 0.00% 4.714ms 1.571ms 3
|
| 4035 |
+
_flash_attn3_1d39a44::fwd 0.86% 47.622us 36.48% 2.012ms 670.645us 3.530ms 100.00% 4.714ms 1.571ms 3
|
| 4036 |
+
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.531ms 100.04% 3.531ms 3.531ms 1
|
| 4037 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.530ms 100.00% 3.530ms 1.177ms 3
|
| 4038 |
+
Activity Buffer Request 31.48% 1.736ms 31.48% 1.736ms 1.736ms 1.184ms 33.56% 1.184ms 1.184ms 1
|
| 4039 |
+
aten::empty 0.51% 27.890us 0.51% 27.890us 4.648us 0.000us 0.00% 0.000us 0.000us 6
|
| 4040 |
+
cudaFuncSetAttribute 0.09% 5.140us 0.09% 5.140us 1.713us 0.000us 0.00% 0.000us 0.000us 3
|
| 4041 |
+
cudaLaunchKernel 3.53% 194.875us 3.53% 194.875us 64.958us 0.000us 0.00% 0.000us 0.000us 3
|
| 4042 |
+
cudaDeviceSynchronize 60.07% 3.313ms 60.07% 3.313ms 3.313ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4043 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4044 |
+
Self CPU time total: 5.515ms
|
| 4045 |
+
Self CUDA time total: 3.530ms
|
| 4046 |
|
| 4047 |
|
| 4048 |
|
|
|
|
| 4052 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4053 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4054 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4055 |
+
hf_kernels_flash_attn3 1.85% 100.143us 39.23% 2.129ms 2.129ms 0.000us 0.00% 4.688ms 4.688ms 1
|
| 4056 |
+
FlashAttnFunc 1.59% 86.190us 37.39% 2.029ms 676.324us 0.000us 0.00% 4.688ms 1.563ms 3
|
| 4057 |
+
_flash_attn3_1d39a44::fwd 0.90% 48.962us 35.80% 1.943ms 647.594us 3.510ms 100.00% 4.688ms 1.563ms 3
|
| 4058 |
+
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.512ms 100.05% 3.512ms 3.512ms 1
|
| 4059 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.510ms 100.00% 3.510ms 1.170ms 3
|
| 4060 |
+
Activity Buffer Request 31.16% 1.691ms 31.16% 1.691ms 1.691ms 1.178ms 33.55% 1.178ms 1.178ms 1
|
| 4061 |
+
aten::empty 0.49% 26.491us 0.49% 26.491us 4.415us 0.000us 0.00% 0.000us 0.000us 6
|
| 4062 |
+
cudaFuncSetAttribute 0.09% 5.060us 0.09% 5.060us 1.687us 0.000us 0.00% 0.000us 0.000us 3
|
| 4063 |
+
cudaLaunchKernel 3.15% 171.134us 3.15% 171.134us 57.045us 0.000us 0.00% 0.000us 0.000us 3
|
| 4064 |
+
cudaDeviceSynchronize 60.77% 3.297ms 60.77% 3.297ms 3.297ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4065 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4066 |
+
Self CPU time total: 5.427ms
|
| 4067 |
+
Self CUDA time total: 3.510ms
|
| 4068 |
|
| 4069 |
|
| 4070 |
impl wl p50(ms) ok
|
| 4071 |
+
hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.93 True
|
| 4072 |
hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.98 True
|
| 4073 |
hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.01 True
|
| 4074 |
+
hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.02 True
|
| 4075 |
+
hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.19 True
|
| 4076 |
+
hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.22 True
|
| 4077 |
</pre></div>
|
| 4078 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 4079 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4080 |
<div class="uv-logs-content" style="display: none;">
|
| 4081 |
+
Installed 51 packages in 298ms
|
| 4082 |
</div>
|
| 4083 |
</div>
|
| 4084 |
+
<div class="cell-stderr">Fetching 5 files: 0%| | 0/5 [00:00<?, ?it/s]
|
| 4085 |
+
Fetching 5 files: 20%|██ | 1/5 [00:00<00:00, 9.30it/s]
|
| 4086 |
+
Fetching 5 files: 40%|████ | 2/5 [00:01<00:02, 1.12it/s]
|
| 4087 |
+
Fetching 5 files: 100%|██████████| 5/5 [00:01<00:00, 3.22it/s]</div>
|
|
|
|
| 4088 |
<div class="cell-artifacts">
|
| 4089 |
<h4>Artifacts:</h4>
|
| 4090 |
<a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
|
flash_attn/impls/mem_efficient_attention.html
CHANGED
|
@@ -3886,9 +3886,9 @@ body[data-tool="eraser"] .main-content {
|
|
| 3886 |
<span class="collapse-indicators">
|
| 3887 |
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
|
| 3888 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3889 |
-
<span id="uv-indicator-benchmark"
|
| 3890 |
</span> |
|
| 3891 |
-
Cell: benchmark |
|
| 3892 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3893 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3894 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3941,28 +3941,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L128_bfloat16
|
|
| 3941 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3942 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3943 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3944 |
-
torch_mem_eff
|
| 3945 |
-
torch_mem_eff
|
| 3946 |
-
aten::scaled_dot_product_attention 0.
|
| 3947 |
-
aten::_scaled_dot_product_efficient_attention 0.32% 23.
|
| 3948 |
-
aten::_efficient_attention_forward 0.
|
| 3949 |
-
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 4.
|
| 3950 |
-
aten::contiguous 0.14% 10.
|
| 3951 |
-
aten::clone 0.
|
| 3952 |
-
aten::copy_ 1.
|
| 3953 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 3954 |
-
Activity Buffer Request 23.
|
| 3955 |
-
aten::transpose 0.
|
| 3956 |
-
aten::as_strided 0.
|
| 3957 |
-
aten::empty_like 0.
|
| 3958 |
-
aten::empty 1.
|
| 3959 |
-
cudaLaunchKernel 1.
|
| 3960 |
-
cudaStreamIsCapturing 0.05% 3.
|
| 3961 |
-
cudaFuncSetAttribute 0.
|
| 3962 |
-
cudaDeviceSynchronize
|
| 3963 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3964 |
-
Self CPU time total: 7.
|
| 3965 |
-
Self CUDA time total: 5.
|
| 3966 |
|
| 3967 |
|
| 3968 |
|
|
@@ -3972,28 +3972,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L256_bfloat16
|
|
| 3972 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3973 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3974 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3975 |
-
torch_mem_eff
|
| 3976 |
-
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.
|
| 3977 |
-
aten::scaled_dot_product_attention 0.
|
| 3978 |
-
aten::_scaled_dot_product_efficient_attention 0.25% 18.
|
| 3979 |
-
aten::_efficient_attention_forward 0.
|
| 3980 |
-
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.
|
| 3981 |
-
aten::contiguous 0.09%
|
| 3982 |
-
aten::clone 0.
|
| 3983 |
-
aten::copy_ 0.
|
| 3984 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 3985 |
-
Activity Buffer Request
|
| 3986 |
-
aten::transpose 0.
|
| 3987 |
-
aten::as_strided 0.
|
| 3988 |
-
aten::empty_like 0.
|
| 3989 |
-
aten::empty
|
| 3990 |
-
cudaLaunchKernel 1.
|
| 3991 |
-
cudaStreamIsCapturing 0.
|
| 3992 |
-
cudaFuncSetAttribute 0.04%
|
| 3993 |
-
cudaDeviceSynchronize 68.
|
| 3994 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3995 |
-
Self CPU time total: 7.
|
| 3996 |
-
Self CUDA time total: 5.
|
| 3997 |
|
| 3998 |
|
| 3999 |
|
|
@@ -4003,28 +4003,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L320_bfloat16
|
|
| 4003 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4004 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4005 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4006 |
-
torch_mem_eff
|
| 4007 |
-
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4008 |
-
aten::scaled_dot_product_attention 0.
|
| 4009 |
-
aten::_scaled_dot_product_efficient_attention 0.
|
| 4010 |
-
aten::_efficient_attention_forward 0.
|
| 4011 |
-
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.
|
| 4012 |
-
aten::contiguous 0.
|
| 4013 |
-
aten::clone 0.
|
| 4014 |
-
aten::copy_ 0.
|
| 4015 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4016 |
-
Activity Buffer Request 22.
|
| 4017 |
-
aten::transpose 0.
|
| 4018 |
-
aten::as_strided 0.
|
| 4019 |
-
aten::empty_like 0.15% 11.
|
| 4020 |
-
aten::empty 0.
|
| 4021 |
-
cudaLaunchKernel 1.
|
| 4022 |
-
cudaStreamIsCapturing 0.03% 2.
|
| 4023 |
-
cudaFuncSetAttribute 0.
|
| 4024 |
-
cudaDeviceSynchronize 69.
|
| 4025 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4026 |
-
Self CPU time total: 7.
|
| 4027 |
-
Self CUDA time total:
|
| 4028 |
|
| 4029 |
|
| 4030 |
|
|
@@ -4034,28 +4034,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L384_bfloat16
|
|
| 4034 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4035 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4036 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4037 |
-
torch_mem_eff 3.
|
| 4038 |
-
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 4039 |
-
aten::scaled_dot_product_attention 0.
|
| 4040 |
-
aten::_scaled_dot_product_efficient_attention 0.23%
|
| 4041 |
-
aten::_efficient_attention_forward 0.34%
|
| 4042 |
-
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.450ms 89.
|
| 4043 |
-
aten::contiguous 0.09% 7.
|
| 4044 |
-
aten::clone 0.
|
| 4045 |
-
aten::copy_ 0.
|
| 4046 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4047 |
-
Activity Buffer Request 21.
|
| 4048 |
-
aten::transpose 0.59% 47.
|
| 4049 |
-
aten::as_strided 0.
|
| 4050 |
-
aten::empty_like 0.14% 11.
|
| 4051 |
-
aten::empty 0.
|
| 4052 |
-
cudaLaunchKernel 3.
|
| 4053 |
-
cudaStreamIsCapturing 0.03% 2.
|
| 4054 |
-
cudaFuncSetAttribute 0.
|
| 4055 |
-
cudaDeviceSynchronize
|
| 4056 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4057 |
-
Self CPU time total: 8.
|
| 4058 |
-
Self CUDA time total: 6.
|
| 4059 |
|
| 4060 |
|
| 4061 |
|
|
@@ -4065,28 +4065,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L448_bfloat16
|
|
| 4065 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4066 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4067 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4068 |
-
torch_mem_eff 2.
|
| 4069 |
-
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 4070 |
-
aten::scaled_dot_product_attention 0.
|
| 4071 |
-
aten::_scaled_dot_product_efficient_attention 0.23%
|
| 4072 |
-
aten::_efficient_attention_forward 0.
|
| 4073 |
-
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.
|
| 4074 |
-
aten::contiguous 0.
|
| 4075 |
-
aten::clone 0.
|
| 4076 |
-
aten::copy_ 0.
|
| 4077 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4078 |
-
Activity Buffer Request
|
| 4079 |
-
aten::transpose 0.
|
| 4080 |
-
aten::as_strided 0.
|
| 4081 |
-
aten::empty_like 0.14% 11.
|
| 4082 |
-
aten::empty 0.81%
|
| 4083 |
-
cudaLaunchKernel 3.
|
| 4084 |
-
cudaStreamIsCapturing 0.03% 2.
|
| 4085 |
-
cudaFuncSetAttribute 0.
|
| 4086 |
-
cudaDeviceSynchronize
|
| 4087 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4088 |
-
Self CPU time total: 8.
|
| 4089 |
-
Self CUDA time total: 6.
|
| 4090 |
|
| 4091 |
|
| 4092 |
|
|
@@ -4096,44 +4096,38 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L512_bfloat16
|
|
| 4096 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4097 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4098 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4099 |
-
torch_mem_eff 2.
|
| 4100 |
-
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 4101 |
-
aten::scaled_dot_product_attention 0.
|
| 4102 |
-
aten::_scaled_dot_product_efficient_attention 0.
|
| 4103 |
-
aten::_efficient_attention_forward 0.
|
| 4104 |
-
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4105 |
-
aten::contiguous 0.
|
| 4106 |
-
aten::clone 0.
|
| 4107 |
-
aten::copy_ 0.
|
| 4108 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4109 |
-
Activity Buffer Request 20.
|
| 4110 |
-
aten::transpose 0.
|
| 4111 |
-
aten::as_strided 0.
|
| 4112 |
-
aten::empty_like 0.
|
| 4113 |
-
aten::empty 0.
|
| 4114 |
-
cudaLaunchKernel
|
| 4115 |
-
cudaStreamIsCapturing 0.03% 2.
|
| 4116 |
-
cudaFuncSetAttribute 0.03%
|
| 4117 |
-
cudaDeviceSynchronize
|
| 4118 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4119 |
-
Self CPU time total: 8.
|
| 4120 |
-
Self CUDA time total: 6.
|
| 4121 |
|
| 4122 |
|
| 4123 |
impl wl p50(ms) ok
|
| 4124 |
-
torch_mem_eff cuda_attn_L128_bfloat16 1.
|
| 4125 |
-
torch_mem_eff cuda_attn_L256_bfloat16 1.
|
| 4126 |
-
torch_mem_eff cuda_attn_L320_bfloat16
|
| 4127 |
-
torch_mem_eff cuda_attn_L384_bfloat16
|
| 4128 |
-
torch_mem_eff cuda_attn_L448_bfloat16 2.
|
| 4129 |
-
torch_mem_eff cuda_attn_L512_bfloat16 2.
|
| 4130 |
</pre></div>
|
| 4131 |
-
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 4132 |
-
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4133 |
-
<div class="uv-logs-content" style="display: none;">
|
| 4134 |
-
Installed 37 packages in 340ms
|
| 4135 |
-
</div>
|
| 4136 |
-
</div>
|
| 4137 |
<div class="cell-artifacts">
|
| 4138 |
<h4>Artifacts:</h4>
|
| 4139 |
<a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
|
|
|
|
| 3886 |
<span class="collapse-indicators">
|
| 3887 |
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
|
| 3888 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3889 |
+
<span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3890 |
</span> |
|
| 3891 |
+
Cell: benchmark | 4.12s
|
| 3892 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3893 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3894 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3941 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3942 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3943 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3944 |
+
torch_mem_eff 4.25% 311.827us 34.94% 2.563ms 2.563ms 0.000us 0.00% 5.488ms 5.488ms 1
|
| 3945 |
+
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.465ms 100.56% 5.465ms 5.465ms 1
|
| 3946 |
+
aten::scaled_dot_product_attention 0.42% 30.830us 2.38% 174.593us 58.198us 0.000us 0.00% 4.817ms 1.606ms 3
|
| 3947 |
+
aten::_scaled_dot_product_efficient_attention 0.32% 23.429us 1.96% 143.763us 47.921us 0.000us 0.00% 4.817ms 1.606ms 3
|
| 3948 |
+
aten::_efficient_attention_forward 0.46% 33.832us 1.33% 97.922us 32.641us 4.817ms 88.64% 4.817ms 1.606ms 3
|
| 3949 |
+
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 4.817ms 88.64% 4.817ms 1.606ms 3
|
| 3950 |
+
aten::contiguous 0.14% 10.180us 27.43% 2.012ms 223.532us 0.000us 0.00% 670.850us 74.539us 9
|
| 3951 |
+
aten::clone 0.43% 31.262us 27.29% 2.002ms 222.401us 0.000us 0.00% 670.850us 74.539us 9
|
| 3952 |
+
aten::copy_ 1.01% 74.042us 25.85% 1.896ms 210.687us 617.346us 11.36% 670.850us 74.539us 9
|
| 3953 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 617.346us 11.36% 617.346us 68.594us 9
|
| 3954 |
+
Activity Buffer Request 23.70% 1.738ms 23.70% 1.738ms 1.738ms 53.504us 0.98% 53.504us 53.504us 1
|
| 3955 |
+
aten::transpose 0.89% 65.502us 1.19% 87.343us 3.639us 0.000us 0.00% 0.000us 0.000us 24
|
| 3956 |
+
aten::as_strided 0.30% 21.841us 0.30% 21.841us 0.910us 0.000us 0.00% 0.000us 0.000us 24
|
| 3957 |
+
aten::empty_like 0.21% 15.520us 1.01% 74.161us 8.240us 0.000us 0.00% 0.000us 0.000us 9
|
| 3958 |
+
aten::empty 1.17% 85.772us 1.17% 85.772us 4.084us 0.000us 0.00% 0.000us 0.000us 21
|
| 3959 |
+
cudaLaunchKernel 1.48% 108.273us 1.48% 108.273us 9.023us 0.000us 0.00% 0.000us 0.000us 12
|
| 3960 |
+
cudaStreamIsCapturing 0.05% 3.869us 0.05% 3.869us 1.290us 0.000us 0.00% 0.000us 0.000us 3
|
| 3961 |
+
cudaFuncSetAttribute 0.12% 8.830us 0.12% 8.830us 2.943us 0.000us 0.00% 0.000us 0.000us 3
|
| 3962 |
+
cudaDeviceSynchronize 65.06% 4.772ms 65.06% 4.772ms 4.772ms 0.000us 0.00% 0.000us 0.000us 1
|
| 3963 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3964 |
+
Self CPU time total: 7.335ms
|
| 3965 |
+
Self CUDA time total: 5.434ms
|
| 3966 |
|
| 3967 |
|
| 3968 |
|
|
|
|
| 3972 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3973 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3974 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3975 |
+
torch_mem_eff 3.26% 247.835us 31.36% 2.385ms 2.385ms 0.000us 0.00% 5.867ms 5.867ms 1
|
| 3976 |
+
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.821ms 100.14% 5.821ms 5.821ms 1
|
| 3977 |
+
aten::scaled_dot_product_attention 0.22% 16.881us 1.81% 137.424us 45.808us 0.000us 0.00% 5.175ms 1.725ms 3
|
| 3978 |
+
aten::_scaled_dot_product_efficient_attention 0.25% 18.660us 1.59% 120.543us 40.181us 0.000us 0.00% 5.175ms 1.725ms 3
|
| 3979 |
+
aten::_efficient_attention_forward 0.35% 26.843us 1.04% 78.951us 26.317us 5.175ms 89.03% 5.175ms 1.725ms 3
|
| 3980 |
+
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.175ms 89.03% 5.175ms 1.725ms 3
|
| 3981 |
+
aten::contiguous 0.09% 7.172us 25.72% 1.955ms 217.264us 0.000us 0.00% 691.584us 76.843us 9
|
| 3982 |
+
aten::clone 0.31% 23.260us 25.62% 1.948ms 216.467us 0.000us 0.00% 691.584us 76.843us 9
|
| 3983 |
+
aten::copy_ 0.84% 64.031us 24.18% 1.839ms 204.318us 637.408us 10.97% 691.584us 76.843us 9
|
| 3984 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 637.408us 10.97% 637.408us 70.823us 9
|
| 3985 |
+
Activity Buffer Request 22.42% 1.705ms 22.42% 1.705ms 1.705ms 54.176us 0.93% 54.176us 54.176us 1
|
| 3986 |
+
aten::transpose 0.64% 49.041us 0.88% 66.991us 2.791us 0.000us 0.00% 0.000us 0.000us 24
|
| 3987 |
+
aten::as_strided 0.24% 17.950us 0.24% 17.950us 0.748us 0.000us 0.00% 0.000us 0.000us 24
|
| 3988 |
+
aten::empty_like 0.17% 12.602us 1.13% 86.083us 9.565us 0.000us 0.00% 0.000us 0.000us 9
|
| 3989 |
+
aten::empty 1.29% 98.070us 1.29% 98.070us 4.670us 0.000us 0.00% 0.000us 0.000us 21
|
| 3990 |
+
cudaLaunchKernel 1.22% 92.470us 1.22% 92.470us 7.706us 0.000us 0.00% 0.000us 0.000us 12
|
| 3991 |
+
cudaStreamIsCapturing 0.04% 2.690us 0.04% 2.690us 0.897us 0.000us 0.00% 0.000us 0.000us 3
|
| 3992 |
+
cudaFuncSetAttribute 0.04% 2.679us 0.04% 2.679us 0.893us 0.000us 0.00% 0.000us 0.000us 3
|
| 3993 |
+
cudaDeviceSynchronize 68.64% 5.219ms 68.64% 5.219ms 5.219ms 0.000us 0.00% 0.000us 0.000us 1
|
| 3994 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3995 |
+
Self CPU time total: 7.603ms
|
| 3996 |
+
Self CUDA time total: 5.812ms
|
| 3997 |
|
| 3998 |
|
| 3999 |
|
|
|
|
| 4003 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4004 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4005 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4006 |
+
torch_mem_eff 3.07% 241.867us 30.18% 2.381ms 2.381ms 0.000us 0.00% 6.114ms 6.114ms 1
|
| 4007 |
+
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.067ms 100.14% 6.067ms 6.067ms 1
|
| 4008 |
+
aten::scaled_dot_product_attention 0.22% 17.069us 1.75% 137.963us 45.988us 0.000us 0.00% 5.411ms 1.804ms 3
|
| 4009 |
+
aten::_scaled_dot_product_efficient_attention 0.24% 18.570us 1.53% 120.894us 40.298us 0.000us 0.00% 5.411ms 1.804ms 3
|
| 4010 |
+
aten::_efficient_attention_forward 0.35% 27.663us 1.02% 80.252us 26.751us 5.411ms 89.32% 5.411ms 1.804ms 3
|
| 4011 |
+
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.411ms 89.32% 5.411ms 1.804ms 3
|
| 4012 |
+
aten::contiguous 0.11% 8.338us 24.80% 1.957ms 217.397us 0.000us 0.00% 703.296us 78.144us 9
|
| 4013 |
+
aten::clone 0.29% 22.493us 24.69% 1.948ms 216.470us 0.000us 0.00% 703.296us 78.144us 9
|
| 4014 |
+
aten::copy_ 0.83% 65.242us 23.73% 1.872ms 208.052us 647.296us 10.68% 703.296us 78.144us 9
|
| 4015 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 647.296us 10.68% 647.296us 71.922us 9
|
| 4016 |
+
Activity Buffer Request 22.06% 1.740ms 22.06% 1.740ms 1.740ms 56.000us 0.92% 56.000us 56.000us 1
|
| 4017 |
+
aten::transpose 0.64% 50.792us 0.85% 67.072us 2.795us 0.000us 0.00% 0.000us 0.000us 24
|
| 4018 |
+
aten::as_strided 0.21% 16.280us 0.21% 16.280us 0.678us 0.000us 0.00% 0.000us 0.000us 24
|
| 4019 |
+
aten::empty_like 0.15% 11.839us 0.68% 53.270us 5.919us 0.000us 0.00% 0.000us 0.000us 9
|
| 4020 |
+
aten::empty 0.84% 66.171us 0.84% 66.171us 3.151us 0.000us 0.00% 0.000us 0.000us 21
|
| 4021 |
+
cudaLaunchKernel 1.13% 89.500us 1.13% 89.500us 7.458us 0.000us 0.00% 0.000us 0.000us 12
|
| 4022 |
+
cudaStreamIsCapturing 0.03% 2.430us 0.03% 2.430us 0.810us 0.000us 0.00% 0.000us 0.000us 3
|
| 4023 |
+
cudaFuncSetAttribute 0.03% 2.650us 0.03% 2.650us 0.883us 0.000us 0.00% 0.000us 0.000us 3
|
| 4024 |
+
cudaDeviceSynchronize 69.82% 5.508ms 69.82% 5.508ms 5.508ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4025 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4026 |
+
Self CPU time total: 7.890ms
|
| 4027 |
+
Self CUDA time total: 6.058ms
|
| 4028 |
|
| 4029 |
|
| 4030 |
|
|
|
|
| 4034 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4035 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4036 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4037 |
+
torch_mem_eff 3.00% 245.113us 31.96% 2.610ms 2.610ms 0.000us 0.00% 6.162ms 6.162ms 1
|
| 4038 |
+
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.113ms 100.14% 6.113ms 6.113ms 1
|
| 4039 |
+
aten::scaled_dot_product_attention 0.20% 16.700us 1.71% 139.473us 46.491us 0.000us 0.00% 5.450ms 1.817ms 3
|
| 4040 |
+
aten::_scaled_dot_product_efficient_attention 0.23% 18.811us 1.50% 122.773us 40.924us 0.000us 0.00% 5.450ms 1.817ms 3
|
| 4041 |
+
aten::_efficient_attention_forward 0.34% 27.691us 0.98% 80.171us 26.724us 5.450ms 89.27% 5.450ms 1.817ms 3
|
| 4042 |
+
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.450ms 89.27% 5.450ms 1.817ms 3
|
| 4043 |
+
aten::contiguous 0.09% 7.732us 26.74% 2.184ms 242.673us 0.000us 0.00% 712.645us 79.183us 9
|
| 4044 |
+
aten::clone 0.28% 22.711us 26.65% 2.176ms 241.814us 0.000us 0.00% 712.645us 79.183us 9
|
| 4045 |
+
aten::copy_ 0.78% 63.988us 25.72% 2.101ms 233.430us 654.820us 10.73% 712.645us 79.183us 9
|
| 4046 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 654.820us 10.73% 654.820us 72.758us 9
|
| 4047 |
+
Activity Buffer Request 21.86% 1.785ms 21.86% 1.785ms 1.785ms 57.825us 0.95% 57.825us 57.825us 1
|
| 4048 |
+
aten::transpose 0.59% 47.982us 0.80% 65.243us 2.718us 0.000us 0.00% 0.000us 0.000us 24
|
| 4049 |
+
aten::as_strided 0.21% 17.261us 0.21% 17.261us 0.719us 0.000us 0.00% 0.000us 0.000us 24
|
| 4050 |
+
aten::empty_like 0.14% 11.742us 0.65% 52.742us 5.860us 0.000us 0.00% 0.000us 0.000us 9
|
| 4051 |
+
aten::empty 0.82% 66.990us 0.82% 66.990us 3.190us 0.000us 0.00% 0.000us 0.000us 21
|
| 4052 |
+
cudaLaunchKernel 3.34% 272.558us 3.34% 272.558us 22.713us 0.000us 0.00% 0.000us 0.000us 12
|
| 4053 |
+
cudaStreamIsCapturing 0.03% 2.519us 0.03% 2.519us 0.840us 0.000us 0.00% 0.000us 0.000us 3
|
| 4054 |
+
cudaFuncSetAttribute 0.03% 2.830us 0.03% 2.830us 0.943us 0.000us 0.00% 0.000us 0.000us 3
|
| 4055 |
+
cudaDeviceSynchronize 68.04% 5.557ms 68.04% 5.557ms 5.557ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4056 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4057 |
+
Self CPU time total: 8.167ms
|
| 4058 |
+
Self CUDA time total: 6.105ms
|
| 4059 |
|
| 4060 |
|
| 4061 |
|
|
|
|
| 4065 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4066 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4067 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4068 |
+
torch_mem_eff 2.93% 244.444us 30.49% 2.544ms 2.544ms 0.000us 0.00% 6.411ms 6.411ms 1
|
| 4069 |
+
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.361ms 100.14% 6.361ms 6.361ms 1
|
| 4070 |
+
aten::scaled_dot_product_attention 0.20% 16.791us 1.67% 139.273us 46.424us 0.000us 0.00% 5.684ms 1.895ms 3
|
| 4071 |
+
aten::_scaled_dot_product_efficient_attention 0.23% 19.350us 1.47% 122.482us 40.827us 0.000us 0.00% 5.684ms 1.895ms 3
|
| 4072 |
+
aten::_efficient_attention_forward 0.32% 26.939us 0.96% 79.712us 26.571us 5.684ms 89.48% 5.684ms 1.895ms 3
|
| 4073 |
+
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.684ms 89.48% 5.684ms 1.895ms 3
|
| 4074 |
+
aten::contiguous 0.10% 8.370us 25.37% 2.117ms 235.225us 0.000us 0.00% 726.946us 80.772us 9
|
| 4075 |
+
aten::clone 0.27% 22.301us 25.27% 2.109ms 234.295us 0.000us 0.00% 726.946us 80.772us 9
|
| 4076 |
+
aten::copy_ 0.79% 65.502us 24.38% 2.034ms 226.048us 668.514us 10.52% 726.946us 80.772us 9
|
| 4077 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 668.514us 10.52% 668.514us 74.279us 9
|
| 4078 |
+
Activity Buffer Request 20.48% 1.709ms 20.48% 1.709ms 1.709ms 58.432us 0.92% 58.432us 58.432us 1
|
| 4079 |
+
aten::transpose 0.59% 49.601us 0.80% 67.072us 2.795us 0.000us 0.00% 0.000us 0.000us 24
|
| 4080 |
+
aten::as_strided 0.21% 17.471us 0.21% 17.471us 0.728us 0.000us 0.00% 0.000us 0.000us 24
|
| 4081 |
+
aten::empty_like 0.14% 11.518us 0.62% 51.920us 5.769us 0.000us 0.00% 0.000us 0.000us 9
|
| 4082 |
+
aten::empty 0.81% 67.173us 0.81% 67.173us 3.199us 0.000us 0.00% 0.000us 0.000us 21
|
| 4083 |
+
cudaLaunchKernel 3.36% 280.595us 3.36% 280.595us 23.383us 0.000us 0.00% 0.000us 0.000us 12
|
| 4084 |
+
cudaStreamIsCapturing 0.03% 2.391us 0.03% 2.391us 0.797us 0.000us 0.00% 0.000us 0.000us 3
|
| 4085 |
+
cudaFuncSetAttribute 0.03% 2.751us 0.03% 2.751us 0.917us 0.000us 0.00% 0.000us 0.000us 3
|
| 4086 |
+
cudaDeviceSynchronize 69.51% 5.799ms 69.51% 5.799ms 5.799ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4087 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4088 |
+
Self CPU time total: 8.344ms
|
| 4089 |
+
Self CUDA time total: 6.353ms
|
| 4090 |
|
| 4091 |
|
| 4092 |
|
|
|
|
| 4096 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4097 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4098 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4099 |
+
torch_mem_eff 2.83% 247.966us 30.03% 2.630ms 2.630ms 0.000us 0.00% 6.745ms 6.745ms 1
|
| 4100 |
+
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.693ms 100.13% 6.693ms 6.693ms 1
|
| 4101 |
+
aten::scaled_dot_product_attention 0.19% 17.071us 1.57% 137.393us 45.798us 0.000us 0.00% 6.009ms 2.003ms 3
|
| 4102 |
+
aten::_scaled_dot_product_efficient_attention 0.21% 18.029us 1.37% 120.322us 40.107us 0.000us 0.00% 6.009ms 2.003ms 3
|
| 4103 |
+
aten::_efficient_attention_forward 0.30% 26.699us 0.92% 80.822us 26.941us 6.009ms 89.89% 6.009ms 2.003ms 3
|
| 4104 |
+
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 6.009ms 89.89% 6.009ms 2.003ms 3
|
| 4105 |
+
aten::contiguous 0.09% 8.060us 25.13% 2.201ms 244.542us 0.000us 0.00% 736.293us 81.810us 9
|
| 4106 |
+
aten::clone 0.25% 21.768us 25.04% 2.193ms 243.646us 0.000us 0.00% 736.293us 81.810us 9
|
| 4107 |
+
aten::copy_ 0.76% 66.873us 24.16% 2.115ms 235.039us 675.652us 10.11% 736.293us 81.810us 9
|
| 4108 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 675.652us 10.11% 675.652us 75.072us 9
|
| 4109 |
+
Activity Buffer Request 20.46% 1.792ms 20.46% 1.792ms 1.792ms 60.641us 0.91% 60.641us 60.641us 1
|
| 4110 |
+
aten::transpose 0.56% 48.641us 0.74% 65.181us 2.716us 0.000us 0.00% 0.000us 0.000us 24
|
| 4111 |
+
aten::as_strided 0.19% 16.540us 0.19% 16.540us 0.689us 0.000us 0.00% 0.000us 0.000us 24
|
| 4112 |
+
aten::empty_like 0.14% 12.261us 0.64% 55.702us 6.189us 0.000us 0.00% 0.000us 0.000us 9
|
| 4113 |
+
aten::empty 0.78% 68.633us 0.78% 68.633us 3.268us 0.000us 0.00% 0.000us 0.000us 21
|
| 4114 |
+
cudaLaunchKernel 3.20% 280.067us 3.20% 280.067us 23.339us 0.000us 0.00% 0.000us 0.000us 12
|
| 4115 |
+
cudaStreamIsCapturing 0.03% 2.620us 0.03% 2.620us 0.873us 0.000us 0.00% 0.000us 0.000us 3
|
| 4116 |
+
cudaFuncSetAttribute 0.03% 2.860us 0.03% 2.860us 0.953us 0.000us 0.00% 0.000us 0.000us 3
|
| 4117 |
+
cudaDeviceSynchronize 69.97% 6.127ms 69.97% 6.127ms 6.127ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4118 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4119 |
+
Self CPU time total: 8.757ms
|
| 4120 |
+
Self CUDA time total: 6.684ms
|
| 4121 |
|
| 4122 |
|
| 4123 |
impl wl p50(ms) ok
|
| 4124 |
+
torch_mem_eff cuda_attn_L128_bfloat16 1.86 True
|
| 4125 |
+
torch_mem_eff cuda_attn_L256_bfloat16 1.92 True
|
| 4126 |
+
torch_mem_eff cuda_attn_L320_bfloat16 2.02 True
|
| 4127 |
+
torch_mem_eff cuda_attn_L384_bfloat16 1.99 True
|
| 4128 |
+
torch_mem_eff cuda_attn_L448_bfloat16 2.10 True
|
| 4129 |
+
torch_mem_eff cuda_attn_L512_bfloat16 2.25 True
|
| 4130 |
</pre></div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4131 |
<div class="cell-artifacts">
|
| 4132 |
<h4>Artifacts:</h4>
|
| 4133 |
<a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
|
flash_attn/impls/sage_attention.html
CHANGED
|
@@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 3888 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3889 |
<span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3890 |
</span> |
|
| 3891 |
-
Cell: benchmark | 4.
|
| 3892 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3893 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3894 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3938,24 +3938,22 @@ Cell: benchmark | 4.72s
|
|
| 3938 |
<div class="cell-stdout"><pre class="stdout-text">Running attention benchmark on cuda with 6 workloads.
|
| 3939 |
impl wl p50(ms) ok
|
| 3940 |
sage_int8_fp16 cuda_attn_L128_bfloat16 FAIL False
|
| 3941 |
-
Error: module '
|
| 3942 |
sage_int8_fp16 cuda_attn_L256_bfloat16 FAIL False
|
| 3943 |
-
Error: module '
|
| 3944 |
sage_int8_fp16 cuda_attn_L320_bfloat16 FAIL False
|
| 3945 |
-
Error: module '
|
| 3946 |
sage_int8_fp16 cuda_attn_L384_bfloat16 FAIL False
|
| 3947 |
-
Error: module '
|
| 3948 |
sage_int8_fp16 cuda_attn_L448_bfloat16 FAIL False
|
| 3949 |
-
Error: module '
|
| 3950 |
sage_int8_fp16 cuda_attn_L512_bfloat16 FAIL False
|
| 3951 |
-
Error: module '
|
| 3952 |
</pre></div>
|
| 3953 |
<div class="cell-stderr">
|
| 3954 |
-
Fetching 8 files: 0%| | 0/8 [00:00<?, ?it/s]
|
| 3955 |
-
|
| 3956 |
-
Fetching 8 files:
|
| 3957 |
-
Fetching 8 files: 38%|███▊ | 3/8 [00:00<00:01, 3.86it/s]
|
| 3958 |
-
Fetching 8 files: 100%|██████████| 8/8 [00:00<00:00, 10.82it/s]
|
| 3959 |
</div>
|
| 3960 |
<div class="cell-artifacts">
|
| 3961 |
<h4>Artifacts:</h4>
|
|
|
|
| 3888 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3889 |
<span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3890 |
</span> |
|
| 3891 |
+
Cell: benchmark | 4.95s
|
| 3892 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3893 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3894 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3938 |
<div class="cell-stdout"><pre class="stdout-text">Running attention benchmark on cuda with 6 workloads.
|
| 3939 |
impl wl p50(ms) ok
|
| 3940 |
sage_int8_fp16 cuda_attn_L128_bfloat16 FAIL False
|
| 3941 |
+
Error: module 'sage_attention_b91c5fb7ee1dcfba' has no attribute 'fwd'
|
| 3942 |
sage_int8_fp16 cuda_attn_L256_bfloat16 FAIL False
|
| 3943 |
+
Error: module 'sage_attention_b91c5fb7ee1dcfba' has no attribute 'fwd'
|
| 3944 |
sage_int8_fp16 cuda_attn_L320_bfloat16 FAIL False
|
| 3945 |
+
Error: module 'sage_attention_b91c5fb7ee1dcfba' has no attribute 'fwd'
|
| 3946 |
sage_int8_fp16 cuda_attn_L384_bfloat16 FAIL False
|
| 3947 |
+
Error: module 'sage_attention_b91c5fb7ee1dcfba' has no attribute 'fwd'
|
| 3948 |
sage_int8_fp16 cuda_attn_L448_bfloat16 FAIL False
|
| 3949 |
+
Error: module 'sage_attention_b91c5fb7ee1dcfba' has no attribute 'fwd'
|
| 3950 |
sage_int8_fp16 cuda_attn_L512_bfloat16 FAIL False
|
| 3951 |
+
Error: module 'sage_attention_b91c5fb7ee1dcfba' has no attribute 'fwd'
|
| 3952 |
</pre></div>
|
| 3953 |
<div class="cell-stderr">
|
| 3954 |
+
Fetching 8 files: 0%| | 0/8 [00:00<?, ?it/s]
|
| 3955 |
+
Fetching 8 files: 38%|███▊ | 3/8 [00:00<00:01, 3.95it/s]
|
| 3956 |
+
Fetching 8 files: 100%|██████████| 8/8 [00:00<00:00, 10.53it/s]
|
|
|
|
|
|
|
| 3957 |
</div>
|
| 3958 |
<div class="cell-artifacts">
|
| 3959 |
<h4>Artifacts:</h4>
|
flash_attn/impls/xformers.html
CHANGED
|
@@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 3888 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3889 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3890 |
</span> |
|
| 3891 |
-
Cell: benchmark | 5.
|
| 3892 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3893 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3894 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3940,21 +3940,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L128_bfloat16
|
|
| 3940 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3941 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3942 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3943 |
-
xformers_meff 9.
|
| 3944 |
-
xformers_flash3::flash_fwd 4.
|
| 3945 |
-
flash_attn_3::fwd 1.
|
| 3946 |
-
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3947 |
-
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3948 |
-
Activity Buffer Request
|
| 3949 |
-
aten::empty 0.
|
| 3950 |
-
cudaFuncSetAttribute 0.25%
|
| 3951 |
-
cudaLaunchKernel 0.
|
| 3952 |
-
aten::reshape 0.
|
| 3953 |
-
aten::view 0.48%
|
| 3954 |
-
cudaDeviceSynchronize
|
| 3955 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3956 |
-
Self CPU time total: 4.
|
| 3957 |
-
Self CUDA time total: 2.
|
| 3958 |
|
| 3959 |
|
| 3960 |
|
|
@@ -3964,21 +3964,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L256_bfloat16
|
|
| 3964 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3965 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3966 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3967 |
-
xformers_meff 6.
|
| 3968 |
-
xformers_flash3::flash_fwd 2.
|
| 3969 |
-
flash_attn_3::fwd 1.
|
| 3970 |
-
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3971 |
-
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3972 |
-
Activity Buffer Request
|
| 3973 |
-
aten::empty 0.
|
| 3974 |
-
cudaFuncSetAttribute 0.
|
| 3975 |
-
cudaLaunchKernel 0.
|
| 3976 |
-
aten::reshape 0.
|
| 3977 |
-
aten::view 0.
|
| 3978 |
-
cudaDeviceSynchronize
|
| 3979 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3980 |
-
Self CPU time total: 4.
|
| 3981 |
-
Self CUDA time total: 2.
|
| 3982 |
|
| 3983 |
|
| 3984 |
|
|
@@ -3988,21 +3988,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L320_bfloat16
|
|
| 3988 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3989 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3990 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3991 |
-
xformers_meff 6.
|
| 3992 |
-
xformers_flash3::flash_fwd
|
| 3993 |
-
flash_attn_3::fwd 1.
|
| 3994 |
-
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3995 |
-
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3996 |
-
Activity Buffer Request 35.
|
| 3997 |
-
aten::empty 0.
|
| 3998 |
-
cudaFuncSetAttribute 0.11% 5.
|
| 3999 |
-
cudaLaunchKernel 0.
|
| 4000 |
-
aten::reshape 0.
|
| 4001 |
-
aten::view 0.
|
| 4002 |
-
cudaDeviceSynchronize
|
| 4003 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4004 |
-
Self CPU time total: 4.
|
| 4005 |
-
Self CUDA time total: 2.
|
| 4006 |
|
| 4007 |
|
| 4008 |
|
|
@@ -4012,21 +4012,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L384_bfloat16
|
|
| 4012 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4013 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4014 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4015 |
-
xformers_meff 6.
|
| 4016 |
-
xformers_flash3::flash_fwd 2.
|
| 4017 |
-
flash_attn_3::fwd 1.
|
| 4018 |
-
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 4019 |
-
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 4020 |
-
Activity Buffer Request 34.
|
| 4021 |
-
aten::empty 0.
|
| 4022 |
-
cudaFuncSetAttribute 0.11% 5.
|
| 4023 |
-
cudaLaunchKernel
|
| 4024 |
-
aten::reshape 0.
|
| 4025 |
-
aten::view 0.
|
| 4026 |
-
cudaDeviceSynchronize
|
| 4027 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4028 |
-
Self CPU time total:
|
| 4029 |
-
Self CUDA time total: 2.
|
| 4030 |
|
| 4031 |
|
| 4032 |
|
|
@@ -4036,21 +4036,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L448_bfloat16
|
|
| 4036 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4037 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4038 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4039 |
-
xformers_meff 5.
|
| 4040 |
-
xformers_flash3::flash_fwd 2.
|
| 4041 |
-
flash_attn_3::fwd 0.
|
| 4042 |
-
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4043 |
-
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4044 |
-
Activity Buffer Request
|
| 4045 |
-
aten::empty 0.
|
| 4046 |
-
cudaFuncSetAttribute 0.
|
| 4047 |
-
cudaLaunchKernel 3.
|
| 4048 |
-
aten::reshape 0.15% 8.
|
| 4049 |
-
aten::view 0.25% 13.
|
| 4050 |
-
cudaDeviceSynchronize
|
| 4051 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4052 |
-
Self CPU time total: 5.
|
| 4053 |
-
Self CUDA time total: 3.
|
| 4054 |
|
| 4055 |
|
| 4056 |
|
|
@@ -4060,37 +4060,37 @@ PROFILE TRACE: xformers_meff | cuda_attn_L512_bfloat16
|
|
| 4060 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4061 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4062 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4063 |
-
xformers_meff 5.
|
| 4064 |
-
xformers_flash3::flash_fwd
|
| 4065 |
-
flash_attn_3::fwd 0.
|
| 4066 |
-
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4067 |
-
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4068 |
-
Activity Buffer Request
|
| 4069 |
-
aten::empty 0.
|
| 4070 |
-
cudaFuncSetAttribute 0.
|
| 4071 |
-
cudaLaunchKernel 3.
|
| 4072 |
-
aten::reshape 0.15% 8.
|
| 4073 |
-
aten::view 0.25%
|
| 4074 |
-
cudaDeviceSynchronize
|
| 4075 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4076 |
-
Self CPU time total: 5.
|
| 4077 |
-
Self CUDA time total: 3.
|
| 4078 |
|
| 4079 |
|
| 4080 |
impl wl p50(ms) ok
|
| 4081 |
-
xformers_meff cuda_attn_L128_bfloat16 0.
|
| 4082 |
xformers_meff cuda_attn_L256_bfloat16 1.04 True
|
| 4083 |
-
xformers_meff cuda_attn_L320_bfloat16 1.
|
| 4084 |
-
xformers_meff cuda_attn_L384_bfloat16 1.
|
| 4085 |
xformers_meff cuda_attn_L448_bfloat16 1.26 True
|
| 4086 |
-
xformers_meff cuda_attn_L512_bfloat16 1.
|
| 4087 |
</pre></div>
|
| 4088 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 4089 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4090 |
<div class="uv-logs-content" style="display: none;">
|
| 4091 |
Downloading xformers (111.8MiB)
|
| 4092 |
Downloaded xformers
|
| 4093 |
-
Installed 1 package in
|
| 4094 |
</div>
|
| 4095 |
</div>
|
| 4096 |
<div class="cell-artifacts">
|
|
|
|
| 3888 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3889 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3890 |
</span> |
|
| 3891 |
+
Cell: benchmark | 5.67s
|
| 3892 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3893 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3894 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3940 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3941 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3942 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3943 |
+
xformers_meff 9.78% 468.612us 53.77% 2.576ms 2.576ms 0.000us 0.00% 3.664ms 3.664ms 1
|
| 3944 |
+
xformers_flash3::flash_fwd 4.05% 193.923us 43.19% 2.069ms 689.708us 0.000us 0.00% 3.664ms 1.221ms 3
|
| 3945 |
+
flash_attn_3::fwd 1.52% 72.582us 39.15% 1.875ms 625.067us 2.752ms 100.00% 3.664ms 1.221ms 3
|
| 3946 |
+
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.754ms 100.05% 2.754ms 2.754ms 1
|
| 3947 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.752ms 100.00% 2.752ms 917.464us 3
|
| 3948 |
+
Activity Buffer Request 35.57% 1.704ms 35.57% 1.704ms 1.704ms 911.394us 33.11% 911.394us 911.394us 1
|
| 3949 |
+
aten::empty 0.91% 43.821us 0.91% 43.821us 7.304us 0.000us 0.00% 0.000us 0.000us 6
|
| 3950 |
+
cudaFuncSetAttribute 0.25% 12.121us 0.25% 12.121us 4.040us 0.000us 0.00% 0.000us 0.000us 3
|
| 3951 |
+
cudaLaunchKernel 0.89% 42.701us 0.89% 42.701us 14.234us 0.000us 0.00% 0.000us 0.000us 3
|
| 3952 |
+
aten::reshape 0.31% 15.029us 0.79% 38.050us 6.342us 0.000us 0.00% 0.000us 0.000us 6
|
| 3953 |
+
aten::view 0.48% 23.021us 0.48% 23.021us 3.837us 0.000us 0.00% 0.000us 0.000us 6
|
| 3954 |
+
cudaDeviceSynchronize 46.23% 2.215ms 46.23% 2.215ms 2.215ms 0.000us 0.00% 0.000us 0.000us 1
|
| 3955 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3956 |
+
Self CPU time total: 4.790ms
|
| 3957 |
+
Self CUDA time total: 2.752ms
|
| 3958 |
|
| 3959 |
|
| 3960 |
|
|
|
|
| 3964 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3965 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3966 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3967 |
+
xformers_meff 6.55% 315.485us 49.52% 2.386ms 2.386ms 0.000us 0.00% 3.791ms 3.791ms 1
|
| 3968 |
+
xformers_flash3::flash_fwd 2.94% 141.873us 42.50% 2.048ms 682.535us 0.000us 0.00% 3.791ms 1.264ms 3
|
| 3969 |
+
flash_attn_3::fwd 1.10% 52.803us 39.56% 1.906ms 635.244us 2.857ms 100.00% 3.791ms 1.264ms 3
|
| 3970 |
+
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.858ms 100.05% 2.858ms 2.858ms 1
|
| 3971 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.857ms 100.00% 2.857ms 952.327us 3
|
| 3972 |
+
Activity Buffer Request 37.05% 1.785ms 37.05% 1.785ms 1.785ms 933.660us 32.68% 933.660us 933.660us 1
|
| 3973 |
+
aten::empty 0.60% 29.019us 0.60% 29.019us 4.837us 0.000us 0.00% 0.000us 0.000us 6
|
| 3974 |
+
cudaFuncSetAttribute 0.12% 5.710us 0.12% 5.710us 1.903us 0.000us 0.00% 0.000us 0.000us 3
|
| 3975 |
+
cudaLaunchKernel 0.69% 33.350us 0.69% 33.350us 11.117us 0.000us 0.00% 0.000us 0.000us 3
|
| 3976 |
+
aten::reshape 0.18% 8.801us 0.47% 22.752us 3.792us 0.000us 0.00% 0.000us 0.000us 6
|
| 3977 |
+
aten::view 0.29% 13.951us 0.29% 13.951us 2.325us 0.000us 0.00% 0.000us 0.000us 6
|
| 3978 |
+
cudaDeviceSynchronize 50.48% 2.432ms 50.48% 2.432ms 2.432ms 0.000us 0.00% 0.000us 0.000us 1
|
| 3979 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3980 |
+
Self CPU time total: 4.818ms
|
| 3981 |
+
Self CUDA time total: 2.857ms
|
| 3982 |
|
| 3983 |
|
| 3984 |
|
|
|
|
| 3988 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3989 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3990 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3991 |
+
xformers_meff 6.44% 303.576us 47.74% 2.252ms 2.252ms 0.000us 0.00% 3.845ms 3.845ms 1
|
| 3992 |
+
xformers_flash3::flash_fwd 3.02% 142.344us 40.83% 1.926ms 641.984us 0.000us 0.00% 3.845ms 1.282ms 3
|
| 3993 |
+
flash_attn_3::fwd 1.11% 52.511us 37.81% 1.784ms 594.536us 2.878ms 100.00% 3.845ms 1.282ms 3
|
| 3994 |
+
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.880ms 100.05% 2.880ms 2.880ms 1
|
| 3995 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.878ms 100.00% 2.878ms 959.487us 3
|
| 3996 |
+
Activity Buffer Request 35.25% 1.663ms 35.25% 1.663ms 1.663ms 967.007us 33.59% 967.007us 967.007us 1
|
| 3997 |
+
aten::empty 0.62% 29.170us 0.62% 29.170us 4.862us 0.000us 0.00% 0.000us 0.000us 6
|
| 3998 |
+
cudaFuncSetAttribute 0.11% 5.320us 0.11% 5.320us 1.773us 0.000us 0.00% 0.000us 0.000us 3
|
| 3999 |
+
cudaLaunchKernel 0.72% 33.781us 0.72% 33.781us 11.260us 0.000us 0.00% 0.000us 0.000us 3
|
| 4000 |
+
aten::reshape 0.18% 8.350us 0.47% 21.990us 3.665us 0.000us 0.00% 0.000us 0.000us 6
|
| 4001 |
+
aten::view 0.29% 13.640us 0.29% 13.640us 2.273us 0.000us 0.00% 0.000us 0.000us 6
|
| 4002 |
+
cudaDeviceSynchronize 52.26% 2.465ms 52.26% 2.465ms 2.465ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4003 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4004 |
+
Self CPU time total: 4.717ms
|
| 4005 |
+
Self CUDA time total: 2.878ms
|
| 4006 |
|
| 4007 |
|
| 4008 |
|
|
|
|
| 4012 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4013 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4014 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4015 |
+
xformers_meff 6.01% 303.306us 50.06% 2.525ms 2.525ms 0.000us 0.00% 3.923ms 3.923ms 1
|
| 4016 |
+
xformers_flash3::flash_fwd 2.90% 146.364us 43.59% 2.199ms 733.113us 0.000us 0.00% 3.923ms 1.308ms 3
|
| 4017 |
+
flash_attn_3::fwd 1.02% 51.431us 40.69% 2.053ms 684.325us 2.938ms 100.00% 3.923ms 1.308ms 3
|
| 4018 |
+
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.939ms 100.05% 2.939ms 2.939ms 1
|
| 4019 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.938ms 100.00% 2.938ms 979.195us 3
|
| 4020 |
+
Activity Buffer Request 34.86% 1.758ms 34.86% 1.758ms 1.758ms 985.691us 33.55% 985.691us 985.691us 1
|
| 4021 |
+
aten::empty 0.57% 28.860us 0.57% 28.860us 4.810us 0.000us 0.00% 0.000us 0.000us 6
|
| 4022 |
+
cudaFuncSetAttribute 0.11% 5.561us 0.11% 5.561us 1.854us 0.000us 0.00% 0.000us 0.000us 3
|
| 4023 |
+
cudaLaunchKernel 4.14% 208.674us 4.14% 208.674us 69.558us 0.000us 0.00% 0.000us 0.000us 3
|
| 4024 |
+
aten::reshape 0.18% 9.230us 0.45% 22.800us 3.800us 0.000us 0.00% 0.000us 0.000us 6
|
| 4025 |
+
aten::view 0.27% 13.570us 0.27% 13.570us 2.262us 0.000us 0.00% 0.000us 0.000us 6
|
| 4026 |
+
cudaDeviceSynchronize 49.94% 2.520ms 49.94% 2.520ms 2.520ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4027 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4028 |
+
Self CPU time total: 5.045ms
|
| 4029 |
+
Self CUDA time total: 2.938ms
|
| 4030 |
|
| 4031 |
|
| 4032 |
|
|
|
|
| 4036 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4037 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4038 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4039 |
+
xformers_meff 5.53% 307.446us 44.37% 2.468ms 2.468ms 0.000us 0.00% 4.694ms 4.694ms 1
|
| 4040 |
+
xformers_flash3::flash_fwd 2.65% 147.575us 38.45% 2.139ms 712.966us 0.000us 0.00% 4.694ms 1.565ms 3
|
| 4041 |
+
flash_attn_3::fwd 0.89% 49.519us 35.79% 1.991ms 663.774us 3.515ms 100.00% 4.694ms 1.565ms 3
|
| 4042 |
+
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.517ms 100.05% 3.517ms 3.517ms 1
|
| 4043 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.515ms 100.00% 3.515ms 1.172ms 3
|
| 4044 |
+
Activity Buffer Request 30.66% 1.706ms 30.66% 1.706ms 1.706ms 1.179ms 33.55% 1.179ms 1.179ms 1
|
| 4045 |
+
aten::empty 0.52% 28.861us 0.52% 28.861us 4.810us 0.000us 0.00% 0.000us 0.000us 6
|
| 4046 |
+
cudaFuncSetAttribute 0.11% 6.000us 0.11% 6.000us 2.000us 0.000us 0.00% 0.000us 0.000us 3
|
| 4047 |
+
cudaLaunchKernel 3.61% 201.015us 3.61% 201.015us 67.005us 0.000us 0.00% 0.000us 0.000us 3
|
| 4048 |
+
aten::reshape 0.15% 8.290us 0.39% 21.930us 3.655us 0.000us 0.00% 0.000us 0.000us 6
|
| 4049 |
+
aten::view 0.25% 13.640us 0.25% 13.640us 2.273us 0.000us 0.00% 0.000us 0.000us 6
|
| 4050 |
+
cudaDeviceSynchronize 55.63% 3.095ms 55.63% 3.095ms 3.095ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4051 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4052 |
+
Self CPU time total: 5.563ms
|
| 4053 |
+
Self CUDA time total: 3.515ms
|
| 4054 |
|
| 4055 |
|
| 4056 |
|
|
|
|
| 4060 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4061 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4062 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4063 |
+
xformers_meff 5.46% 305.147us 45.13% 2.521ms 2.521ms 0.000us 0.00% 4.658ms 4.658ms 1
|
| 4064 |
+
xformers_flash3::flash_fwd 2.65% 147.824us 39.28% 2.194ms 731.306us 0.000us 0.00% 4.658ms 1.553ms 3
|
| 4065 |
+
flash_attn_3::fwd 0.94% 52.350us 36.63% 2.046ms 682.031us 3.488ms 100.00% 4.658ms 1.553ms 3
|
| 4066 |
+
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.489ms 100.05% 3.489ms 3.489ms 1
|
| 4067 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.488ms 100.00% 3.488ms 1.163ms 3
|
| 4068 |
+
Activity Buffer Request 31.45% 1.757ms 31.45% 1.757ms 1.757ms 1.171ms 33.57% 1.171ms 1.171ms 1
|
| 4069 |
+
aten::empty 0.54% 29.960us 0.54% 29.960us 4.993us 0.000us 0.00% 0.000us 0.000us 6
|
| 4070 |
+
cudaFuncSetAttribute 0.10% 5.370us 0.10% 5.370us 1.790us 0.000us 0.00% 0.000us 0.000us 3
|
| 4071 |
+
cudaLaunchKernel 3.61% 201.885us 3.61% 201.885us 67.295us 0.000us 0.00% 0.000us 0.000us 3
|
| 4072 |
+
aten::reshape 0.15% 8.170us 0.39% 21.900us 3.650us 0.000us 0.00% 0.000us 0.000us 6
|
| 4073 |
+
aten::view 0.25% 13.730us 0.25% 13.730us 2.288us 0.000us 0.00% 0.000us 0.000us 6
|
| 4074 |
+
cudaDeviceSynchronize 54.87% 3.065ms 54.87% 3.065ms 3.065ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4075 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4076 |
+
Self CPU time total: 5.586ms
|
| 4077 |
+
Self CUDA time total: 3.488ms
|
| 4078 |
|
| 4079 |
|
| 4080 |
impl wl p50(ms) ok
|
| 4081 |
+
xformers_meff cuda_attn_L128_bfloat16 0.99 True
|
| 4082 |
xformers_meff cuda_attn_L256_bfloat16 1.04 True
|
| 4083 |
+
xformers_meff cuda_attn_L320_bfloat16 1.07 True
|
| 4084 |
+
xformers_meff cuda_attn_L384_bfloat16 1.08 True
|
| 4085 |
xformers_meff cuda_attn_L448_bfloat16 1.26 True
|
| 4086 |
+
xformers_meff cuda_attn_L512_bfloat16 1.25 True
|
| 4087 |
</pre></div>
|
| 4088 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 4089 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4090 |
<div class="uv-logs-content" style="display: none;">
|
| 4091 |
Downloading xformers (111.8MiB)
|
| 4092 |
Downloaded xformers
|
| 4093 |
+
Installed 1 package in 11ms
|
| 4094 |
</div>
|
| 4095 |
</div>
|
| 4096 |
<div class="cell-artifacts">
|
flash_attn/results/artifacts/combine/latency.svg
CHANGED
|
|
Git LFS Details
|
|
|
Git LFS Details
|
flash_attn/results/combined_results.html
CHANGED
|
@@ -3889,7 +3889,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 3889 |
<rdf:RDF>
|
| 3890 |
<ns2:Work>
|
| 3891 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 3892 |
-
<dc:date>2025-12-
|
| 3893 |
<dc:format>image/svg+xml</dc:format>
|
| 3894 |
<dc:creator>
|
| 3895 |
<ns2:Agent>
|
|
@@ -3999,96 +3999,96 @@ body[data-tool="eraser"] .main-content {
|
|
| 3999 |
<g id="matplotlib.axis_2">
|
| 4000 |
<g id="ytick_1">
|
| 4001 |
<g id="grid-y--2" class="grid grid-y">
|
| 4002 |
-
<path d="M 47.81
|
| 4003 |
</g>
|
| 4004 |
<g id="line2d_7">
|
| 4005 |
<defs>
|
| 4006 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4007 |
</defs>
|
| 4008 |
<g>
|
| 4009 |
-
<use ns4:href="#m0fca2865ba" x="47.81" y="
|
| 4010 |
</g>
|
| 4011 |
</g>
|
| 4012 |
<g id="text_7">
|
| 4013 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="
|
| 4014 |
</g>
|
| 4015 |
</g>
|
| 4016 |
<g id="ytick_2">
|
| 4017 |
<g id="grid-y--3" class="grid grid-y">
|
| 4018 |
-
<path d="M 47.81
|
| 4019 |
</g>
|
| 4020 |
<g id="line2d_8">
|
| 4021 |
<g>
|
| 4022 |
-
<use ns4:href="#m0fca2865ba" x="47.81" y="
|
| 4023 |
</g>
|
| 4024 |
</g>
|
| 4025 |
<g id="text_8">
|
| 4026 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="
|
| 4027 |
</g>
|
| 4028 |
</g>
|
| 4029 |
<g id="ytick_3">
|
| 4030 |
<g id="grid-y--4" class="grid grid-y">
|
| 4031 |
-
<path d="M 47.81
|
| 4032 |
</g>
|
| 4033 |
<g id="line2d_9">
|
| 4034 |
<g>
|
| 4035 |
-
<use ns4:href="#m0fca2865ba" x="47.81" y="
|
| 4036 |
</g>
|
| 4037 |
</g>
|
| 4038 |
<g id="text_9">
|
| 4039 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="
|
| 4040 |
</g>
|
| 4041 |
</g>
|
| 4042 |
<g id="ytick_4">
|
| 4043 |
<g id="grid-y--5" class="grid grid-y">
|
| 4044 |
-
<path d="M 47.81
|
| 4045 |
</g>
|
| 4046 |
<g id="line2d_10">
|
| 4047 |
<g>
|
| 4048 |
-
<use ns4:href="#m0fca2865ba" x="47.81" y="
|
| 4049 |
</g>
|
| 4050 |
</g>
|
| 4051 |
<g id="text_10">
|
| 4052 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="
|
| 4053 |
</g>
|
| 4054 |
</g>
|
| 4055 |
<g id="ytick_5">
|
| 4056 |
<g id="grid-y--6" class="grid grid-y">
|
| 4057 |
-
<path d="M 47.81
|
| 4058 |
</g>
|
| 4059 |
<g id="line2d_11">
|
| 4060 |
<g>
|
| 4061 |
-
<use ns4:href="#m0fca2865ba" x="47.81" y="
|
| 4062 |
</g>
|
| 4063 |
</g>
|
| 4064 |
<g id="text_11">
|
| 4065 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="
|
| 4066 |
</g>
|
| 4067 |
</g>
|
| 4068 |
<g id="ytick_6">
|
| 4069 |
<g id="grid-y--7" class="grid grid-y">
|
| 4070 |
-
<path d="M 47.81
|
| 4071 |
</g>
|
| 4072 |
<g id="line2d_12">
|
| 4073 |
<g>
|
| 4074 |
-
<use ns4:href="#m0fca2865ba" x="47.81" y="
|
| 4075 |
</g>
|
| 4076 |
</g>
|
| 4077 |
<g id="text_12">
|
| 4078 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="
|
| 4079 |
</g>
|
| 4080 |
</g>
|
| 4081 |
<g id="ytick_7">
|
| 4082 |
<g id="grid-y--8" class="grid grid-y">
|
| 4083 |
-
<path d="M 47.81
|
| 4084 |
</g>
|
| 4085 |
<g id="line2d_13">
|
| 4086 |
<g>
|
| 4087 |
-
<use ns4:href="#m0fca2865ba" x="47.81" y="
|
| 4088 |
</g>
|
| 4089 |
</g>
|
| 4090 |
<g id="text_13">
|
| 4091 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="
|
| 4092 |
</g>
|
| 4093 |
</g>
|
| 4094 |
<g id="label--y" class="ylabel">
|
|
@@ -4096,73 +4096,73 @@ body[data-tool="eraser"] .main-content {
|
|
| 4096 |
</g>
|
| 4097 |
</g>
|
| 4098 |
<g id="series--torch-flash-ma" class="series">
|
| 4099 |
-
<path d="M 83.607806
|
| 4100 |
<defs>
|
| 4101 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4102 |
</defs>
|
| 4103 |
<g clip-path="url(#p09feef2583)">
|
| 4104 |
-
<use ns4:href="#md7efaf3aec" x="83.607806" y="
|
| 4105 |
-
<use ns4:href="#md7efaf3aec" x="226.799032" y="
|
| 4106 |
-
<use ns4:href="#md7efaf3aec" x="369.990258" y="
|
| 4107 |
-
<use ns4:href="#md7efaf3aec" x="513.181484" y="
|
| 4108 |
-
<use ns4:href="#md7efaf3aec" x="656.37271" y="
|
| 4109 |
-
<use ns4:href="#md7efaf3aec" x="799.563935" y="
|
| 4110 |
</g>
|
| 4111 |
</g>
|
| 4112 |
<g id="series--torch-mem-eff" class="series">
|
| 4113 |
-
<path d="M 83.607806
|
| 4114 |
<defs>
|
| 4115 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4116 |
</defs>
|
| 4117 |
<g clip-path="url(#p09feef2583)">
|
| 4118 |
-
<use ns4:href="#m9b8c54d372" x="83.607806" y="
|
| 4119 |
-
<use ns4:href="#m9b8c54d372" x="226.799032" y="
|
| 4120 |
-
<use ns4:href="#m9b8c54d372" x="369.990258" y="111.
|
| 4121 |
-
<use ns4:href="#m9b8c54d372" x="513.181484" y="
|
| 4122 |
-
<use ns4:href="#m9b8c54d372" x="656.37271" y="
|
| 4123 |
<use ns4:href="#m9b8c54d372" x="799.563935" y="45.999414" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4124 |
</g>
|
| 4125 |
</g>
|
| 4126 |
<g id="series--xformers-meff" class="series">
|
| 4127 |
-
<path d="M 83.607806
|
| 4128 |
<defs>
|
| 4129 |
<path id="mc655281e0b" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #2ca02c" />
|
| 4130 |
</defs>
|
| 4131 |
<g clip-path="url(#p09feef2583)">
|
| 4132 |
-
<use ns4:href="#mc655281e0b" x="83.607806" y="
|
| 4133 |
-
<use ns4:href="#mc655281e0b" x="226.799032" y="
|
| 4134 |
-
<use ns4:href="#mc655281e0b" x="369.990258" y="
|
| 4135 |
-
<use ns4:href="#mc655281e0b" x="513.181484" y="
|
| 4136 |
-
<use ns4:href="#mc655281e0b" x="656.37271" y="
|
| 4137 |
-
<use ns4:href="#mc655281e0b" x="799.563935" y="
|
| 4138 |
</g>
|
| 4139 |
</g>
|
| 4140 |
<g id="series--hf-kernels-flash-attn" class="series">
|
| 4141 |
-
<path d="M 83.607806 417.
|
| 4142 |
<defs>
|
| 4143 |
<path id="m61c8040d7e" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #d62728" />
|
| 4144 |
</defs>
|
| 4145 |
<g clip-path="url(#p09feef2583)">
|
| 4146 |
-
<use ns4:href="#m61c8040d7e" x="83.607806" y="417.
|
| 4147 |
-
<use ns4:href="#m61c8040d7e" x="226.799032" y="
|
| 4148 |
-
<use ns4:href="#m61c8040d7e" x="369.990258" y="
|
| 4149 |
-
<use ns4:href="#m61c8040d7e" x="513.181484" y="383.
|
| 4150 |
-
<use ns4:href="#m61c8040d7e" x="656.37271" y="
|
| 4151 |
-
<use ns4:href="#m61c8040d7e" x="799.563935" y="
|
| 4152 |
</g>
|
| 4153 |
</g>
|
| 4154 |
<g id="series--hf-kernels-flash-attn3" class="series">
|
| 4155 |
-
<path d="M 83.607806 428.387702 L 226.799032
|
| 4156 |
<defs>
|
| 4157 |
<path id="m7cd35be9cc" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #9467bd" />
|
| 4158 |
</defs>
|
| 4159 |
<g clip-path="url(#p09feef2583)">
|
| 4160 |
<use ns4:href="#m7cd35be9cc" x="83.607806" y="428.387702" style="fill: #9467bd; stroke: #9467bd" />
|
| 4161 |
-
<use ns4:href="#m7cd35be9cc" x="226.799032" y="
|
| 4162 |
-
<use ns4:href="#m7cd35be9cc" x="369.990258" y="
|
| 4163 |
-
<use ns4:href="#m7cd35be9cc" x="513.181484" y="
|
| 4164 |
-
<use ns4:href="#m7cd35be9cc" x="656.37271" y="
|
| 4165 |
-
<use ns4:href="#m7cd35be9cc" x="799.563935" y="
|
| 4166 |
</g>
|
| 4167 |
</g>
|
| 4168 |
<g id="patch_3">
|
|
@@ -4247,7 +4247,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 4247 |
<span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
|
| 4248 |
<span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4249 |
</span> |
|
| 4250 |
-
Cell: combine | 4.
|
| 4251 |
| <button class="run-btn" onclick="runCell('combine')">▶ run</button>
|
| 4252 |
<button class="copy-btn" onclick="copyCell('combine')">Copy</button>
|
| 4253 |
<a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -4356,48 +4356,48 @@ Summary: 6 found, 0 skipped, 0 missing
|
|
| 4356 |
COMBINED BENCHMARK SUMMARY
|
| 4357 |
|
| 4358 |
impl wl p50(ms) ok
|
| 4359 |
-
hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.
|
| 4360 |
-
hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.
|
| 4361 |
-
hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.
|
| 4362 |
-
hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.
|
| 4363 |
-
hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.
|
| 4364 |
-
hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.
|
| 4365 |
-
hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.
|
| 4366 |
hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.98 True
|
| 4367 |
hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.01 True
|
| 4368 |
-
hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.
|
| 4369 |
-
hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.
|
| 4370 |
-
hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.
|
| 4371 |
sage_int8_fp16 cuda_attn_L128_bfloat16 FAIL False
|
| 4372 |
-
Error: module '
|
| 4373 |
sage_int8_fp16 cuda_attn_L256_bfloat16 FAIL False
|
| 4374 |
-
Error: module '
|
| 4375 |
sage_int8_fp16 cuda_attn_L320_bfloat16 FAIL False
|
| 4376 |
-
Error: module '
|
| 4377 |
sage_int8_fp16 cuda_attn_L384_bfloat16 FAIL False
|
| 4378 |
-
Error: module '
|
| 4379 |
sage_int8_fp16 cuda_attn_L448_bfloat16 FAIL False
|
| 4380 |
-
Error: module '
|
| 4381 |
sage_int8_fp16 cuda_attn_L512_bfloat16 FAIL False
|
| 4382 |
-
Error: module '
|
| 4383 |
torch_flash_ma cuda_attn_L128_bfloat16 1.21 True
|
| 4384 |
-
torch_flash_ma cuda_attn_L256_bfloat16 1.
|
| 4385 |
-
torch_flash_ma cuda_attn_L320_bfloat16 1.
|
| 4386 |
-
torch_flash_ma cuda_attn_L384_bfloat16 1.
|
| 4387 |
-
torch_flash_ma cuda_attn_L448_bfloat16 1.
|
| 4388 |
-
torch_flash_ma cuda_attn_L512_bfloat16 1.
|
| 4389 |
-
torch_mem_eff cuda_attn_L128_bfloat16 1.
|
| 4390 |
-
torch_mem_eff cuda_attn_L256_bfloat16 1.
|
| 4391 |
-
torch_mem_eff cuda_attn_L320_bfloat16
|
| 4392 |
-
torch_mem_eff cuda_attn_L384_bfloat16
|
| 4393 |
-
torch_mem_eff cuda_attn_L448_bfloat16 2.
|
| 4394 |
-
torch_mem_eff cuda_attn_L512_bfloat16 2.
|
| 4395 |
-
xformers_meff cuda_attn_L128_bfloat16 0.
|
| 4396 |
xformers_meff cuda_attn_L256_bfloat16 1.04 True
|
| 4397 |
-
xformers_meff cuda_attn_L320_bfloat16 1.
|
| 4398 |
-
xformers_meff cuda_attn_L384_bfloat16 1.
|
| 4399 |
xformers_meff cuda_attn_L448_bfloat16 1.26 True
|
| 4400 |
-
xformers_meff cuda_attn_L512_bfloat16 1.
|
| 4401 |
|
| 4402 |
GENERATING COMBINED VISUALIZATION
|
| 4403 |
|
|
@@ -4421,7 +4421,7 @@ Implementations included:
|
|
| 4421 |
<div class="uv-install-logs" id="uv-logs-combine">
|
| 4422 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4423 |
<div class="uv-logs-content" style="display: none;">
|
| 4424 |
-
Installed 37 packages in
|
| 4425 |
</div>
|
| 4426 |
</div>
|
| 4427 |
<div class="cell-artifacts">
|
|
@@ -4434,7 +4434,7 @@ Installed 37 packages in 206ms
|
|
| 4434 |
<rdf:RDF>
|
| 4435 |
<ns2:Work>
|
| 4436 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 4437 |
-
<dc:date>2025-12-
|
| 4438 |
<dc:format>image/svg+xml</dc:format>
|
| 4439 |
<dc:creator>
|
| 4440 |
<ns2:Agent>
|
|
@@ -4544,96 +4544,96 @@ Installed 37 packages in 206ms
|
|
| 4544 |
<g id="matplotlib.axis_2">
|
| 4545 |
<g id="ytick_1">
|
| 4546 |
<g id="grid-y--2" class="grid grid-y">
|
| 4547 |
-
<path d="M 47.81
|
| 4548 |
</g>
|
| 4549 |
<g id="line2d_7">
|
| 4550 |
<defs>
|
| 4551 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4552 |
</defs>
|
| 4553 |
<g>
|
| 4554 |
-
<use ns4:href="#m0fca2865ba" x="47.81" y="
|
| 4555 |
</g>
|
| 4556 |
</g>
|
| 4557 |
<g id="text_7">
|
| 4558 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="
|
| 4559 |
</g>
|
| 4560 |
</g>
|
| 4561 |
<g id="ytick_2">
|
| 4562 |
<g id="grid-y--3" class="grid grid-y">
|
| 4563 |
-
<path d="M 47.81
|
| 4564 |
</g>
|
| 4565 |
<g id="line2d_8">
|
| 4566 |
<g>
|
| 4567 |
-
<use ns4:href="#m0fca2865ba" x="47.81" y="
|
| 4568 |
</g>
|
| 4569 |
</g>
|
| 4570 |
<g id="text_8">
|
| 4571 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="
|
| 4572 |
</g>
|
| 4573 |
</g>
|
| 4574 |
<g id="ytick_3">
|
| 4575 |
<g id="grid-y--4" class="grid grid-y">
|
| 4576 |
-
<path d="M 47.81
|
| 4577 |
</g>
|
| 4578 |
<g id="line2d_9">
|
| 4579 |
<g>
|
| 4580 |
-
<use ns4:href="#m0fca2865ba" x="47.81" y="
|
| 4581 |
</g>
|
| 4582 |
</g>
|
| 4583 |
<g id="text_9">
|
| 4584 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="
|
| 4585 |
</g>
|
| 4586 |
</g>
|
| 4587 |
<g id="ytick_4">
|
| 4588 |
<g id="grid-y--5" class="grid grid-y">
|
| 4589 |
-
<path d="M 47.81
|
| 4590 |
</g>
|
| 4591 |
<g id="line2d_10">
|
| 4592 |
<g>
|
| 4593 |
-
<use ns4:href="#m0fca2865ba" x="47.81" y="
|
| 4594 |
</g>
|
| 4595 |
</g>
|
| 4596 |
<g id="text_10">
|
| 4597 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="
|
| 4598 |
</g>
|
| 4599 |
</g>
|
| 4600 |
<g id="ytick_5">
|
| 4601 |
<g id="grid-y--6" class="grid grid-y">
|
| 4602 |
-
<path d="M 47.81
|
| 4603 |
</g>
|
| 4604 |
<g id="line2d_11">
|
| 4605 |
<g>
|
| 4606 |
-
<use ns4:href="#m0fca2865ba" x="47.81" y="
|
| 4607 |
</g>
|
| 4608 |
</g>
|
| 4609 |
<g id="text_11">
|
| 4610 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="
|
| 4611 |
</g>
|
| 4612 |
</g>
|
| 4613 |
<g id="ytick_6">
|
| 4614 |
<g id="grid-y--7" class="grid grid-y">
|
| 4615 |
-
<path d="M 47.81
|
| 4616 |
</g>
|
| 4617 |
<g id="line2d_12">
|
| 4618 |
<g>
|
| 4619 |
-
<use ns4:href="#m0fca2865ba" x="47.81" y="
|
| 4620 |
</g>
|
| 4621 |
</g>
|
| 4622 |
<g id="text_12">
|
| 4623 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="
|
| 4624 |
</g>
|
| 4625 |
</g>
|
| 4626 |
<g id="ytick_7">
|
| 4627 |
<g id="grid-y--8" class="grid grid-y">
|
| 4628 |
-
<path d="M 47.81
|
| 4629 |
</g>
|
| 4630 |
<g id="line2d_13">
|
| 4631 |
<g>
|
| 4632 |
-
<use ns4:href="#m0fca2865ba" x="47.81" y="
|
| 4633 |
</g>
|
| 4634 |
</g>
|
| 4635 |
<g id="text_13">
|
| 4636 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="
|
| 4637 |
</g>
|
| 4638 |
</g>
|
| 4639 |
<g id="label--y" class="ylabel">
|
|
@@ -4641,73 +4641,73 @@ Installed 37 packages in 206ms
|
|
| 4641 |
</g>
|
| 4642 |
</g>
|
| 4643 |
<g id="series--torch-flash-ma" class="series">
|
| 4644 |
-
<path d="M 83.607806
|
| 4645 |
<defs>
|
| 4646 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4647 |
</defs>
|
| 4648 |
<g clip-path="url(#p09feef2583)">
|
| 4649 |
-
<use ns4:href="#md7efaf3aec" x="83.607806" y="
|
| 4650 |
-
<use ns4:href="#md7efaf3aec" x="226.799032" y="
|
| 4651 |
-
<use ns4:href="#md7efaf3aec" x="369.990258" y="
|
| 4652 |
-
<use ns4:href="#md7efaf3aec" x="513.181484" y="
|
| 4653 |
-
<use ns4:href="#md7efaf3aec" x="656.37271" y="
|
| 4654 |
-
<use ns4:href="#md7efaf3aec" x="799.563935" y="
|
| 4655 |
</g>
|
| 4656 |
</g>
|
| 4657 |
<g id="series--torch-mem-eff" class="series">
|
| 4658 |
-
<path d="M 83.607806
|
| 4659 |
<defs>
|
| 4660 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4661 |
</defs>
|
| 4662 |
<g clip-path="url(#p09feef2583)">
|
| 4663 |
-
<use ns4:href="#m9b8c54d372" x="83.607806" y="
|
| 4664 |
-
<use ns4:href="#m9b8c54d372" x="226.799032" y="
|
| 4665 |
-
<use ns4:href="#m9b8c54d372" x="369.990258" y="111.
|
| 4666 |
-
<use ns4:href="#m9b8c54d372" x="513.181484" y="
|
| 4667 |
-
<use ns4:href="#m9b8c54d372" x="656.37271" y="
|
| 4668 |
<use ns4:href="#m9b8c54d372" x="799.563935" y="45.999414" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4669 |
</g>
|
| 4670 |
</g>
|
| 4671 |
<g id="series--xformers-meff" class="series">
|
| 4672 |
-
<path d="M 83.607806
|
| 4673 |
<defs>
|
| 4674 |
<path id="mc655281e0b" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #2ca02c" />
|
| 4675 |
</defs>
|
| 4676 |
<g clip-path="url(#p09feef2583)">
|
| 4677 |
-
<use ns4:href="#mc655281e0b" x="83.607806" y="
|
| 4678 |
-
<use ns4:href="#mc655281e0b" x="226.799032" y="
|
| 4679 |
-
<use ns4:href="#mc655281e0b" x="369.990258" y="
|
| 4680 |
-
<use ns4:href="#mc655281e0b" x="513.181484" y="
|
| 4681 |
-
<use ns4:href="#mc655281e0b" x="656.37271" y="
|
| 4682 |
-
<use ns4:href="#mc655281e0b" x="799.563935" y="
|
| 4683 |
</g>
|
| 4684 |
</g>
|
| 4685 |
<g id="series--hf-kernels-flash-attn" class="series">
|
| 4686 |
-
<path d="M 83.607806 417.
|
| 4687 |
<defs>
|
| 4688 |
<path id="m61c8040d7e" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #d62728" />
|
| 4689 |
</defs>
|
| 4690 |
<g clip-path="url(#p09feef2583)">
|
| 4691 |
-
<use ns4:href="#m61c8040d7e" x="83.607806" y="417.
|
| 4692 |
-
<use ns4:href="#m61c8040d7e" x="226.799032" y="
|
| 4693 |
-
<use ns4:href="#m61c8040d7e" x="369.990258" y="
|
| 4694 |
-
<use ns4:href="#m61c8040d7e" x="513.181484" y="383.
|
| 4695 |
-
<use ns4:href="#m61c8040d7e" x="656.37271" y="
|
| 4696 |
-
<use ns4:href="#m61c8040d7e" x="799.563935" y="
|
| 4697 |
</g>
|
| 4698 |
</g>
|
| 4699 |
<g id="series--hf-kernels-flash-attn3" class="series">
|
| 4700 |
-
<path d="M 83.607806 428.387702 L 226.799032
|
| 4701 |
<defs>
|
| 4702 |
<path id="m7cd35be9cc" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #9467bd" />
|
| 4703 |
</defs>
|
| 4704 |
<g clip-path="url(#p09feef2583)">
|
| 4705 |
<use ns4:href="#m7cd35be9cc" x="83.607806" y="428.387702" style="fill: #9467bd; stroke: #9467bd" />
|
| 4706 |
-
<use ns4:href="#m7cd35be9cc" x="226.799032" y="
|
| 4707 |
-
<use ns4:href="#m7cd35be9cc" x="369.990258" y="
|
| 4708 |
-
<use ns4:href="#m7cd35be9cc" x="513.181484" y="
|
| 4709 |
-
<use ns4:href="#m7cd35be9cc" x="656.37271" y="
|
| 4710 |
-
<use ns4:href="#m7cd35be9cc" x="799.563935" y="
|
| 4711 |
</g>
|
| 4712 |
</g>
|
| 4713 |
<g id="patch_3">
|
|
|
|
| 3889 |
<rdf:RDF>
|
| 3890 |
<ns2:Work>
|
| 3891 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 3892 |
+
<dc:date>2025-12-19T23:02:45.375383</dc:date>
|
| 3893 |
<dc:format>image/svg+xml</dc:format>
|
| 3894 |
<dc:creator>
|
| 3895 |
<ns2:Agent>
|
|
|
|
| 3999 |
<g id="matplotlib.axis_2">
|
| 4000 |
<g id="ytick_1">
|
| 4001 |
<g id="grid-y--2" class="grid grid-y">
|
| 4002 |
+
<path d="M 47.81 407.59176 L 835.361742 407.59176 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4003 |
</g>
|
| 4004 |
<g id="line2d_7">
|
| 4005 |
<defs>
|
| 4006 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4007 |
</defs>
|
| 4008 |
<g>
|
| 4009 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="407.59176" style="stroke: #000000; stroke-width: 0.8" />
|
| 4010 |
</g>
|
| 4011 |
</g>
|
| 4012 |
<g id="text_7">
|
| 4013 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="411.390978" transform="rotate(-0 40.81 411.390978)">1.0</text>
|
| 4014 |
</g>
|
| 4015 |
</g>
|
| 4016 |
<g id="ytick_2">
|
| 4017 |
<g id="grid-y--3" class="grid grid-y">
|
| 4018 |
+
<path d="M 47.81 349.696597 L 835.361742 349.696597 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4019 |
</g>
|
| 4020 |
<g id="line2d_8">
|
| 4021 |
<g>
|
| 4022 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="349.696597" style="stroke: #000000; stroke-width: 0.8" />
|
| 4023 |
</g>
|
| 4024 |
</g>
|
| 4025 |
<g id="text_8">
|
| 4026 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="353.495815" transform="rotate(-0 40.81 353.495815)">1.2</text>
|
| 4027 |
</g>
|
| 4028 |
</g>
|
| 4029 |
<g id="ytick_3">
|
| 4030 |
<g id="grid-y--4" class="grid grid-y">
|
| 4031 |
+
<path d="M 47.81 291.801434 L 835.361742 291.801434 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4032 |
</g>
|
| 4033 |
<g id="line2d_9">
|
| 4034 |
<g>
|
| 4035 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="291.801434" style="stroke: #000000; stroke-width: 0.8" />
|
| 4036 |
</g>
|
| 4037 |
</g>
|
| 4038 |
<g id="text_9">
|
| 4039 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="295.600653" transform="rotate(-0 40.81 295.600653)">1.4</text>
|
| 4040 |
</g>
|
| 4041 |
</g>
|
| 4042 |
<g id="ytick_4">
|
| 4043 |
<g id="grid-y--5" class="grid grid-y">
|
| 4044 |
+
<path d="M 47.81 233.906271 L 835.361742 233.906271 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4045 |
</g>
|
| 4046 |
<g id="line2d_10">
|
| 4047 |
<g>
|
| 4048 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="233.906271" style="stroke: #000000; stroke-width: 0.8" />
|
| 4049 |
</g>
|
| 4050 |
</g>
|
| 4051 |
<g id="text_10">
|
| 4052 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="237.70549" transform="rotate(-0 40.81 237.70549)">1.6</text>
|
| 4053 |
</g>
|
| 4054 |
</g>
|
| 4055 |
<g id="ytick_5">
|
| 4056 |
<g id="grid-y--6" class="grid grid-y">
|
| 4057 |
+
<path d="M 47.81 176.011108 L 835.361742 176.011108 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4058 |
</g>
|
| 4059 |
<g id="line2d_11">
|
| 4060 |
<g>
|
| 4061 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="176.011108" style="stroke: #000000; stroke-width: 0.8" />
|
| 4062 |
</g>
|
| 4063 |
</g>
|
| 4064 |
<g id="text_11">
|
| 4065 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="179.810327" transform="rotate(-0 40.81 179.810327)">1.8</text>
|
| 4066 |
</g>
|
| 4067 |
</g>
|
| 4068 |
<g id="ytick_6">
|
| 4069 |
<g id="grid-y--7" class="grid grid-y">
|
| 4070 |
+
<path d="M 47.81 118.115945 L 835.361742 118.115945 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4071 |
</g>
|
| 4072 |
<g id="line2d_12">
|
| 4073 |
<g>
|
| 4074 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="118.115945" style="stroke: #000000; stroke-width: 0.8" />
|
| 4075 |
</g>
|
| 4076 |
</g>
|
| 4077 |
<g id="text_12">
|
| 4078 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="121.915164" transform="rotate(-0 40.81 121.915164)">2.0</text>
|
| 4079 |
</g>
|
| 4080 |
</g>
|
| 4081 |
<g id="ytick_7">
|
| 4082 |
<g id="grid-y--8" class="grid grid-y">
|
| 4083 |
+
<path d="M 47.81 60.220782 L 835.361742 60.220782 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4084 |
</g>
|
| 4085 |
<g id="line2d_13">
|
| 4086 |
<g>
|
| 4087 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="60.220782" style="stroke: #000000; stroke-width: 0.8" />
|
| 4088 |
</g>
|
| 4089 |
</g>
|
| 4090 |
<g id="text_13">
|
| 4091 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="64.020001" transform="rotate(-0 40.81 64.020001)">2.2</text>
|
| 4092 |
</g>
|
| 4093 |
</g>
|
| 4094 |
<g id="label--y" class="ylabel">
|
|
|
|
| 4096 |
</g>
|
| 4097 |
</g>
|
| 4098 |
<g id="series--torch-flash-ma" class="series">
|
| 4099 |
+
<path d="M 83.607806 345.769568 L 226.799032 327.905436 L 369.990258 321.175124 L 513.181484 310.733152 L 656.37271 266.384878 L 799.563935 257.816105 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 4100 |
<defs>
|
| 4101 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4102 |
</defs>
|
| 4103 |
<g clip-path="url(#p09feef2583)">
|
| 4104 |
+
<use ns4:href="#md7efaf3aec" x="83.607806" y="345.769568" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4105 |
+
<use ns4:href="#md7efaf3aec" x="226.799032" y="327.905436" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4106 |
+
<use ns4:href="#md7efaf3aec" x="369.990258" y="321.175124" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4107 |
+
<use ns4:href="#md7efaf3aec" x="513.181484" y="310.733152" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4108 |
+
<use ns4:href="#md7efaf3aec" x="656.37271" y="266.384878" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4109 |
+
<use ns4:href="#md7efaf3aec" x="799.563935" y="257.816105" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4110 |
</g>
|
| 4111 |
</g>
|
| 4112 |
<g id="series--torch-mem-eff" class="series">
|
| 4113 |
+
<path d="M 83.607806 158.269135 L 226.799032 141.655829 L 369.990258 111.190815 L 513.181484 119.895642 L 656.37271 90.298767 L 799.563935 45.999414 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 4114 |
<defs>
|
| 4115 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4116 |
</defs>
|
| 4117 |
<g clip-path="url(#p09feef2583)">
|
| 4118 |
+
<use ns4:href="#m9b8c54d372" x="83.607806" y="158.269135" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4119 |
+
<use ns4:href="#m9b8c54d372" x="226.799032" y="141.655829" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4120 |
+
<use ns4:href="#m9b8c54d372" x="369.990258" y="111.190815" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4121 |
+
<use ns4:href="#m9b8c54d372" x="513.181484" y="119.895642" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4122 |
+
<use ns4:href="#m9b8c54d372" x="656.37271" y="90.298767" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4123 |
<use ns4:href="#m9b8c54d372" x="799.563935" y="45.999414" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4124 |
</g>
|
| 4125 |
</g>
|
| 4126 |
<g id="series--xformers-meff" class="series">
|
| 4127 |
+
<path d="M 83.607806 410.251753 L 226.799032 397.439264 L 369.990258 386.870212 L 513.181484 384.759934 L 656.37271 333.145238 L 799.563935 334.337878 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #2ca02c; stroke-width: 1.5; stroke-linecap: square" />
|
| 4128 |
<defs>
|
| 4129 |
<path id="mc655281e0b" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #2ca02c" />
|
| 4130 |
</defs>
|
| 4131 |
<g clip-path="url(#p09feef2583)">
|
| 4132 |
+
<use ns4:href="#mc655281e0b" x="83.607806" y="410.251753" style="fill: #2ca02c; stroke: #2ca02c" />
|
| 4133 |
+
<use ns4:href="#mc655281e0b" x="226.799032" y="397.439264" style="fill: #2ca02c; stroke: #2ca02c" />
|
| 4134 |
+
<use ns4:href="#mc655281e0b" x="369.990258" y="386.870212" style="fill: #2ca02c; stroke: #2ca02c" />
|
| 4135 |
+
<use ns4:href="#mc655281e0b" x="513.181484" y="384.759934" style="fill: #2ca02c; stroke: #2ca02c" />
|
| 4136 |
+
<use ns4:href="#mc655281e0b" x="656.37271" y="333.145238" style="fill: #2ca02c; stroke: #2ca02c" />
|
| 4137 |
+
<use ns4:href="#mc655281e0b" x="799.563935" y="334.337878" style="fill: #2ca02c; stroke: #2ca02c" />
|
| 4138 |
</g>
|
| 4139 |
</g>
|
| 4140 |
<g id="series--hf-kernels-flash-attn" class="series">
|
| 4141 |
+
<path d="M 83.607806 417.931836 L 226.799032 403.327202 L 369.990258 389.605759 L 513.181484 383.179396 L 656.37271 337.13711 L 799.563935 336.430789 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #d62728; stroke-width: 1.5; stroke-linecap: square" />
|
| 4142 |
<defs>
|
| 4143 |
<path id="m61c8040d7e" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #d62728" />
|
| 4144 |
</defs>
|
| 4145 |
<g clip-path="url(#p09feef2583)">
|
| 4146 |
+
<use ns4:href="#m61c8040d7e" x="83.607806" y="417.931836" style="fill: #d62728; stroke: #d62728" />
|
| 4147 |
+
<use ns4:href="#m61c8040d7e" x="226.799032" y="403.327202" style="fill: #d62728; stroke: #d62728" />
|
| 4148 |
+
<use ns4:href="#m61c8040d7e" x="369.990258" y="389.605759" style="fill: #d62728; stroke: #d62728" />
|
| 4149 |
+
<use ns4:href="#m61c8040d7e" x="513.181484" y="383.179396" style="fill: #d62728; stroke: #d62728" />
|
| 4150 |
+
<use ns4:href="#m61c8040d7e" x="656.37271" y="337.13711" style="fill: #d62728; stroke: #d62728" />
|
| 4151 |
+
<use ns4:href="#m61c8040d7e" x="799.563935" y="336.430789" style="fill: #d62728; stroke: #d62728" />
|
| 4152 |
</g>
|
| 4153 |
</g>
|
| 4154 |
<g id="series--hf-kernels-flash-attn3" class="series">
|
| 4155 |
+
<path d="M 83.607806 428.387702 L 226.799032 413.453355 L 369.990258 405.098794 L 513.181484 402.988515 L 656.37271 353.264386 L 799.563935 343.383998 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #9467bd; stroke-width: 1.5; stroke-linecap: square" />
|
| 4156 |
<defs>
|
| 4157 |
<path id="m7cd35be9cc" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #9467bd" />
|
| 4158 |
</defs>
|
| 4159 |
<g clip-path="url(#p09feef2583)">
|
| 4160 |
<use ns4:href="#m7cd35be9cc" x="83.607806" y="428.387702" style="fill: #9467bd; stroke: #9467bd" />
|
| 4161 |
+
<use ns4:href="#m7cd35be9cc" x="226.799032" y="413.453355" style="fill: #9467bd; stroke: #9467bd" />
|
| 4162 |
+
<use ns4:href="#m7cd35be9cc" x="369.990258" y="405.098794" style="fill: #9467bd; stroke: #9467bd" />
|
| 4163 |
+
<use ns4:href="#m7cd35be9cc" x="513.181484" y="402.988515" style="fill: #9467bd; stroke: #9467bd" />
|
| 4164 |
+
<use ns4:href="#m7cd35be9cc" x="656.37271" y="353.264386" style="fill: #9467bd; stroke: #9467bd" />
|
| 4165 |
+
<use ns4:href="#m7cd35be9cc" x="799.563935" y="343.383998" style="fill: #9467bd; stroke: #9467bd" />
|
| 4166 |
</g>
|
| 4167 |
</g>
|
| 4168 |
<g id="patch_3">
|
|
|
|
| 4247 |
<span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
|
| 4248 |
<span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4249 |
</span> |
|
| 4250 |
+
Cell: combine | 4.45s
|
| 4251 |
| <button class="run-btn" onclick="runCell('combine')">▶ run</button>
|
| 4252 |
<button class="copy-btn" onclick="copyCell('combine')">Copy</button>
|
| 4253 |
<a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 4356 |
COMBINED BENCHMARK SUMMARY
|
| 4357 |
|
| 4358 |
impl wl p50(ms) ok
|
| 4359 |
+
hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.96 True
|
| 4360 |
+
hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.01 True
|
| 4361 |
+
hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.06 True
|
| 4362 |
+
hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.08 True
|
| 4363 |
+
hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.24 True
|
| 4364 |
+
hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.25 True
|
| 4365 |
+
hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.93 True
|
| 4366 |
hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.98 True
|
| 4367 |
hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.01 True
|
| 4368 |
+
hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.02 True
|
| 4369 |
+
hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.19 True
|
| 4370 |
+
hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.22 True
|
| 4371 |
sage_int8_fp16 cuda_attn_L128_bfloat16 FAIL False
|
| 4372 |
+
Error: module 'sage_attention_b91c5fb7ee1dcfba' has no attribute 'fwd'
|
| 4373 |
sage_int8_fp16 cuda_attn_L256_bfloat16 FAIL False
|
| 4374 |
+
Error: module 'sage_attention_b91c5fb7ee1dcfba' has no attribute 'fwd'
|
| 4375 |
sage_int8_fp16 cuda_attn_L320_bfloat16 FAIL False
|
| 4376 |
+
Error: module 'sage_attention_b91c5fb7ee1dcfba' has no attribute 'fwd'
|
| 4377 |
sage_int8_fp16 cuda_attn_L384_bfloat16 FAIL False
|
| 4378 |
+
Error: module 'sage_attention_b91c5fb7ee1dcfba' has no attribute 'fwd'
|
| 4379 |
sage_int8_fp16 cuda_attn_L448_bfloat16 FAIL False
|
| 4380 |
+
Error: module 'sage_attention_b91c5fb7ee1dcfba' has no attribute 'fwd'
|
| 4381 |
sage_int8_fp16 cuda_attn_L512_bfloat16 FAIL False
|
| 4382 |
+
Error: module 'sage_attention_b91c5fb7ee1dcfba' has no attribute 'fwd'
|
| 4383 |
torch_flash_ma cuda_attn_L128_bfloat16 1.21 True
|
| 4384 |
+
torch_flash_ma cuda_attn_L256_bfloat16 1.28 True
|
| 4385 |
+
torch_flash_ma cuda_attn_L320_bfloat16 1.30 True
|
| 4386 |
+
torch_flash_ma cuda_attn_L384_bfloat16 1.33 True
|
| 4387 |
+
torch_flash_ma cuda_attn_L448_bfloat16 1.49 True
|
| 4388 |
+
torch_flash_ma cuda_attn_L512_bfloat16 1.52 True
|
| 4389 |
+
torch_mem_eff cuda_attn_L128_bfloat16 1.86 True
|
| 4390 |
+
torch_mem_eff cuda_attn_L256_bfloat16 1.92 True
|
| 4391 |
+
torch_mem_eff cuda_attn_L320_bfloat16 2.02 True
|
| 4392 |
+
torch_mem_eff cuda_attn_L384_bfloat16 1.99 True
|
| 4393 |
+
torch_mem_eff cuda_attn_L448_bfloat16 2.10 True
|
| 4394 |
+
torch_mem_eff cuda_attn_L512_bfloat16 2.25 True
|
| 4395 |
+
xformers_meff cuda_attn_L128_bfloat16 0.99 True
|
| 4396 |
xformers_meff cuda_attn_L256_bfloat16 1.04 True
|
| 4397 |
+
xformers_meff cuda_attn_L320_bfloat16 1.07 True
|
| 4398 |
+
xformers_meff cuda_attn_L384_bfloat16 1.08 True
|
| 4399 |
xformers_meff cuda_attn_L448_bfloat16 1.26 True
|
| 4400 |
+
xformers_meff cuda_attn_L512_bfloat16 1.25 True
|
| 4401 |
|
| 4402 |
GENERATING COMBINED VISUALIZATION
|
| 4403 |
|
|
|
|
| 4421 |
<div class="uv-install-logs" id="uv-logs-combine">
|
| 4422 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4423 |
<div class="uv-logs-content" style="display: none;">
|
| 4424 |
+
Installed 37 packages in 247ms
|
| 4425 |
</div>
|
| 4426 |
</div>
|
| 4427 |
<div class="cell-artifacts">
|
|
|
|
| 4434 |
<rdf:RDF>
|
| 4435 |
<ns2:Work>
|
| 4436 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 4437 |
+
<dc:date>2025-12-19T23:02:45.375383</dc:date>
|
| 4438 |
<dc:format>image/svg+xml</dc:format>
|
| 4439 |
<dc:creator>
|
| 4440 |
<ns2:Agent>
|
|
|
|
| 4544 |
<g id="matplotlib.axis_2">
|
| 4545 |
<g id="ytick_1">
|
| 4546 |
<g id="grid-y--2" class="grid grid-y">
|
| 4547 |
+
<path d="M 47.81 407.59176 L 835.361742 407.59176 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4548 |
</g>
|
| 4549 |
<g id="line2d_7">
|
| 4550 |
<defs>
|
| 4551 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4552 |
</defs>
|
| 4553 |
<g>
|
| 4554 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="407.59176" style="stroke: #000000; stroke-width: 0.8" />
|
| 4555 |
</g>
|
| 4556 |
</g>
|
| 4557 |
<g id="text_7">
|
| 4558 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="411.390978" transform="rotate(-0 40.81 411.390978)">1.0</text>
|
| 4559 |
</g>
|
| 4560 |
</g>
|
| 4561 |
<g id="ytick_2">
|
| 4562 |
<g id="grid-y--3" class="grid grid-y">
|
| 4563 |
+
<path d="M 47.81 349.696597 L 835.361742 349.696597 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4564 |
</g>
|
| 4565 |
<g id="line2d_8">
|
| 4566 |
<g>
|
| 4567 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="349.696597" style="stroke: #000000; stroke-width: 0.8" />
|
| 4568 |
</g>
|
| 4569 |
</g>
|
| 4570 |
<g id="text_8">
|
| 4571 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="353.495815" transform="rotate(-0 40.81 353.495815)">1.2</text>
|
| 4572 |
</g>
|
| 4573 |
</g>
|
| 4574 |
<g id="ytick_3">
|
| 4575 |
<g id="grid-y--4" class="grid grid-y">
|
| 4576 |
+
<path d="M 47.81 291.801434 L 835.361742 291.801434 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4577 |
</g>
|
| 4578 |
<g id="line2d_9">
|
| 4579 |
<g>
|
| 4580 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="291.801434" style="stroke: #000000; stroke-width: 0.8" />
|
| 4581 |
</g>
|
| 4582 |
</g>
|
| 4583 |
<g id="text_9">
|
| 4584 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="295.600653" transform="rotate(-0 40.81 295.600653)">1.4</text>
|
| 4585 |
</g>
|
| 4586 |
</g>
|
| 4587 |
<g id="ytick_4">
|
| 4588 |
<g id="grid-y--5" class="grid grid-y">
|
| 4589 |
+
<path d="M 47.81 233.906271 L 835.361742 233.906271 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4590 |
</g>
|
| 4591 |
<g id="line2d_10">
|
| 4592 |
<g>
|
| 4593 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="233.906271" style="stroke: #000000; stroke-width: 0.8" />
|
| 4594 |
</g>
|
| 4595 |
</g>
|
| 4596 |
<g id="text_10">
|
| 4597 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="237.70549" transform="rotate(-0 40.81 237.70549)">1.6</text>
|
| 4598 |
</g>
|
| 4599 |
</g>
|
| 4600 |
<g id="ytick_5">
|
| 4601 |
<g id="grid-y--6" class="grid grid-y">
|
| 4602 |
+
<path d="M 47.81 176.011108 L 835.361742 176.011108 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4603 |
</g>
|
| 4604 |
<g id="line2d_11">
|
| 4605 |
<g>
|
| 4606 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="176.011108" style="stroke: #000000; stroke-width: 0.8" />
|
| 4607 |
</g>
|
| 4608 |
</g>
|
| 4609 |
<g id="text_11">
|
| 4610 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="179.810327" transform="rotate(-0 40.81 179.810327)">1.8</text>
|
| 4611 |
</g>
|
| 4612 |
</g>
|
| 4613 |
<g id="ytick_6">
|
| 4614 |
<g id="grid-y--7" class="grid grid-y">
|
| 4615 |
+
<path d="M 47.81 118.115945 L 835.361742 118.115945 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4616 |
</g>
|
| 4617 |
<g id="line2d_12">
|
| 4618 |
<g>
|
| 4619 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="118.115945" style="stroke: #000000; stroke-width: 0.8" />
|
| 4620 |
</g>
|
| 4621 |
</g>
|
| 4622 |
<g id="text_12">
|
| 4623 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="121.915164" transform="rotate(-0 40.81 121.915164)">2.0</text>
|
| 4624 |
</g>
|
| 4625 |
</g>
|
| 4626 |
<g id="ytick_7">
|
| 4627 |
<g id="grid-y--8" class="grid grid-y">
|
| 4628 |
+
<path d="M 47.81 60.220782 L 835.361742 60.220782 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4629 |
</g>
|
| 4630 |
<g id="line2d_13">
|
| 4631 |
<g>
|
| 4632 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="60.220782" style="stroke: #000000; stroke-width: 0.8" />
|
| 4633 |
</g>
|
| 4634 |
</g>
|
| 4635 |
<g id="text_13">
|
| 4636 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="64.020001" transform="rotate(-0 40.81 64.020001)">2.2</text>
|
| 4637 |
</g>
|
| 4638 |
</g>
|
| 4639 |
<g id="label--y" class="ylabel">
|
|
|
|
| 4641 |
</g>
|
| 4642 |
</g>
|
| 4643 |
<g id="series--torch-flash-ma" class="series">
|
| 4644 |
+
<path d="M 83.607806 345.769568 L 226.799032 327.905436 L 369.990258 321.175124 L 513.181484 310.733152 L 656.37271 266.384878 L 799.563935 257.816105 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 4645 |
<defs>
|
| 4646 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4647 |
</defs>
|
| 4648 |
<g clip-path="url(#p09feef2583)">
|
| 4649 |
+
<use ns4:href="#md7efaf3aec" x="83.607806" y="345.769568" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4650 |
+
<use ns4:href="#md7efaf3aec" x="226.799032" y="327.905436" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4651 |
+
<use ns4:href="#md7efaf3aec" x="369.990258" y="321.175124" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4652 |
+
<use ns4:href="#md7efaf3aec" x="513.181484" y="310.733152" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4653 |
+
<use ns4:href="#md7efaf3aec" x="656.37271" y="266.384878" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4654 |
+
<use ns4:href="#md7efaf3aec" x="799.563935" y="257.816105" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4655 |
</g>
|
| 4656 |
</g>
|
| 4657 |
<g id="series--torch-mem-eff" class="series">
|
| 4658 |
+
<path d="M 83.607806 158.269135 L 226.799032 141.655829 L 369.990258 111.190815 L 513.181484 119.895642 L 656.37271 90.298767 L 799.563935 45.999414 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 4659 |
<defs>
|
| 4660 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4661 |
</defs>
|
| 4662 |
<g clip-path="url(#p09feef2583)">
|
| 4663 |
+
<use ns4:href="#m9b8c54d372" x="83.607806" y="158.269135" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4664 |
+
<use ns4:href="#m9b8c54d372" x="226.799032" y="141.655829" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4665 |
+
<use ns4:href="#m9b8c54d372" x="369.990258" y="111.190815" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4666 |
+
<use ns4:href="#m9b8c54d372" x="513.181484" y="119.895642" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4667 |
+
<use ns4:href="#m9b8c54d372" x="656.37271" y="90.298767" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4668 |
<use ns4:href="#m9b8c54d372" x="799.563935" y="45.999414" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4669 |
</g>
|
| 4670 |
</g>
|
| 4671 |
<g id="series--xformers-meff" class="series">
|
| 4672 |
+
<path d="M 83.607806 410.251753 L 226.799032 397.439264 L 369.990258 386.870212 L 513.181484 384.759934 L 656.37271 333.145238 L 799.563935 334.337878 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #2ca02c; stroke-width: 1.5; stroke-linecap: square" />
|
| 4673 |
<defs>
|
| 4674 |
<path id="mc655281e0b" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #2ca02c" />
|
| 4675 |
</defs>
|
| 4676 |
<g clip-path="url(#p09feef2583)">
|
| 4677 |
+
<use ns4:href="#mc655281e0b" x="83.607806" y="410.251753" style="fill: #2ca02c; stroke: #2ca02c" />
|
| 4678 |
+
<use ns4:href="#mc655281e0b" x="226.799032" y="397.439264" style="fill: #2ca02c; stroke: #2ca02c" />
|
| 4679 |
+
<use ns4:href="#mc655281e0b" x="369.990258" y="386.870212" style="fill: #2ca02c; stroke: #2ca02c" />
|
| 4680 |
+
<use ns4:href="#mc655281e0b" x="513.181484" y="384.759934" style="fill: #2ca02c; stroke: #2ca02c" />
|
| 4681 |
+
<use ns4:href="#mc655281e0b" x="656.37271" y="333.145238" style="fill: #2ca02c; stroke: #2ca02c" />
|
| 4682 |
+
<use ns4:href="#mc655281e0b" x="799.563935" y="334.337878" style="fill: #2ca02c; stroke: #2ca02c" />
|
| 4683 |
</g>
|
| 4684 |
</g>
|
| 4685 |
<g id="series--hf-kernels-flash-attn" class="series">
|
| 4686 |
+
<path d="M 83.607806 417.931836 L 226.799032 403.327202 L 369.990258 389.605759 L 513.181484 383.179396 L 656.37271 337.13711 L 799.563935 336.430789 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #d62728; stroke-width: 1.5; stroke-linecap: square" />
|
| 4687 |
<defs>
|
| 4688 |
<path id="m61c8040d7e" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #d62728" />
|
| 4689 |
</defs>
|
| 4690 |
<g clip-path="url(#p09feef2583)">
|
| 4691 |
+
<use ns4:href="#m61c8040d7e" x="83.607806" y="417.931836" style="fill: #d62728; stroke: #d62728" />
|
| 4692 |
+
<use ns4:href="#m61c8040d7e" x="226.799032" y="403.327202" style="fill: #d62728; stroke: #d62728" />
|
| 4693 |
+
<use ns4:href="#m61c8040d7e" x="369.990258" y="389.605759" style="fill: #d62728; stroke: #d62728" />
|
| 4694 |
+
<use ns4:href="#m61c8040d7e" x="513.181484" y="383.179396" style="fill: #d62728; stroke: #d62728" />
|
| 4695 |
+
<use ns4:href="#m61c8040d7e" x="656.37271" y="337.13711" style="fill: #d62728; stroke: #d62728" />
|
| 4696 |
+
<use ns4:href="#m61c8040d7e" x="799.563935" y="336.430789" style="fill: #d62728; stroke: #d62728" />
|
| 4697 |
</g>
|
| 4698 |
</g>
|
| 4699 |
<g id="series--hf-kernels-flash-attn3" class="series">
|
| 4700 |
+
<path d="M 83.607806 428.387702 L 226.799032 413.453355 L 369.990258 405.098794 L 513.181484 402.988515 L 656.37271 353.264386 L 799.563935 343.383998 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #9467bd; stroke-width: 1.5; stroke-linecap: square" />
|
| 4701 |
<defs>
|
| 4702 |
<path id="m7cd35be9cc" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #9467bd" />
|
| 4703 |
</defs>
|
| 4704 |
<g clip-path="url(#p09feef2583)">
|
| 4705 |
<use ns4:href="#m7cd35be9cc" x="83.607806" y="428.387702" style="fill: #9467bd; stroke: #9467bd" />
|
| 4706 |
+
<use ns4:href="#m7cd35be9cc" x="226.799032" y="413.453355" style="fill: #9467bd; stroke: #9467bd" />
|
| 4707 |
+
<use ns4:href="#m7cd35be9cc" x="369.990258" y="405.098794" style="fill: #9467bd; stroke: #9467bd" />
|
| 4708 |
+
<use ns4:href="#m7cd35be9cc" x="513.181484" y="402.988515" style="fill: #9467bd; stroke: #9467bd" />
|
| 4709 |
+
<use ns4:href="#m7cd35be9cc" x="656.37271" y="353.264386" style="fill: #9467bd; stroke: #9467bd" />
|
| 4710 |
+
<use ns4:href="#m7cd35be9cc" x="799.563935" y="343.383998" style="fill: #9467bd; stroke: #9467bd" />
|
| 4711 |
</g>
|
| 4712 |
</g>
|
| 4713 |
<g id="patch_3">
|
index.html
CHANGED
|
@@ -3874,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 3874 |
<div class="system-info">
|
| 3875 |
<div class="system-info-header">Generated on:</div>
|
| 3876 |
<div class="system-info-content">
|
| 3877 |
-
|
| 3878 |
</div>
|
| 3879 |
</div>
|
| 3880 |
|
|
|
|
| 3874 |
<div class="system-info">
|
| 3875 |
<div class="system-info-header">Generated on:</div>
|
| 3876 |
<div class="system-info-content">
|
| 3877 |
+
Darwin arm64 | macOS-15.7.2-arm64-arm-64bit
|
| 3878 |
</div>
|
| 3879 |
</div>
|
| 3880 |
|
layer_norm/impls/artifacts/benchmark/layer_norm.jsonl
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
{"ts": "2025-12-
|
| 2 |
-
{"ts": "2025-12-
|
| 3 |
-
{"ts": "2025-12-
|
| 4 |
-
{"ts": "2025-12-
|
|
|
|
| 1 |
+
{"ts": "2025-12-19T23:02:16Z", "run": "32d018bc53624a45997f9dda67216816", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D4096", "batch": 16, "seq_len": 2048, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.8274980000351206, "p50": 0.8322979999775271, "p90": 0.8378580000680813, "mean": 0.8332618000167713, "iqr": 0.0071710001066094264, "raw_times": [0.8322979999775271, 0.8378580000680813, 0.8379680000416556, 0.8306869999614719, 0.8274980000351206], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.8445380001376179, "peak_bytes": 2415935488, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null}
|
| 2 |
+
{"ts": "2025-12-19T23:02:16Z", "run": "32d018bc53624a45997f9dda67216816", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D8192", "batch": 16, "seq_len": 2048, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.6395549998833303, "p50": 1.6463560000374855, "p90": 1.6514159999587719, "mean": 1.6487175999827741, "iqr": 0.00707099979990744, "raw_times": [1.6395549998833303, 1.6514159999587719, 1.6463560000374855, 1.6443450001588644, 1.6619159998754185], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.6726759999983187, "peak_bytes": 4831870976, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1086463928222656e-05, "ref": "layer_norm_ref"}, "err": null}
|
| 3 |
+
{"ts": "2025-12-19T23:02:16Z", "run": "32d018bc53624a45997f9dda67216816", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D4096", "batch": 16, "seq_len": 4096, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.6412159998253628, "p50": 1.644736000116609, "p90": 1.6461760001220682, "mean": 1.6448379999474128, "iqr": 0.0036900003124173963, "raw_times": [1.644736000116609, 1.6412159998253628, 1.649575999863373, 1.6461760001220682, 1.6424859998096508], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.646575999984634, "peak_bytes": 4831854592, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null}
|
| 4 |
+
{"ts": "2025-12-19T23:02:17Z", "run": "32d018bc53624a45997f9dda67216816", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D8192", "batch": 16, "seq_len": 4096, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 3.2493999999587686, "p50": 3.2569499999226537, "p90": 3.2582300000285613, "mean": 3.2570102000136103, "iqr": 0.006920000032550888, "raw_times": [3.2493999999587686, 3.2691610001620575, 3.2513099999960104, 3.2569499999226537, 3.2582300000285613], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.2572910001817945, "peak_bytes": 9663709184, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1026859283447266e-05, "ref": "layer_norm_ref"}, "err": null}
|
layer_norm/impls/hf_kernels_layer_norm.html
CHANGED
|
@@ -3889,7 +3889,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 3889 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3890 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3891 |
</span> |
|
| 3892 |
-
Cell: benchmark | 6.
|
| 3893 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3894 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3895 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3961,19 +3961,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S2048_D4096
|
|
| 3961 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3962 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3963 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3964 |
-
hf_kernels_layer_norm 4.
|
| 3965 |
-
_layer_norm_f8ec252::dropout_add_ln_fwd 1.
|
| 3966 |
-
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3967 |
-
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3968 |
-
Activity Buffer Request 40.
|
| 3969 |
-
aten::view 0.
|
| 3970 |
-
aten::empty 1.
|
| 3971 |
-
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.
|
| 3972 |
-
cudaLaunchKernel 1.
|
| 3973 |
-
cudaDeviceSynchronize
|
| 3974 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3975 |
-
Self CPU time total: 4.
|
| 3976 |
-
Self CUDA time total: 2.
|
| 3977 |
|
| 3978 |
|
| 3979 |
|
|
@@ -3983,19 +3983,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S2048_D8192
|
|
| 3983 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3984 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3985 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3986 |
-
hf_kernels_layer_norm 2.
|
| 3987 |
-
_layer_norm_f8ec252::dropout_add_ln_fwd 0.
|
| 3988 |
-
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.
|
| 3989 |
-
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.
|
| 3990 |
-
Activity Buffer Request
|
| 3991 |
-
aten::view 0.
|
| 3992 |
-
aten::empty 0.
|
| 3993 |
-
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.
|
| 3994 |
-
cudaLaunchKernel 0.
|
| 3995 |
-
cudaDeviceSynchronize 71.
|
| 3996 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3997 |
-
Self CPU time total: 6.
|
| 3998 |
-
Self CUDA time total: 4.
|
| 3999 |
|
| 4000 |
|
| 4001 |
|
|
@@ -4005,19 +4005,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S4096_D4096
|
|
| 4005 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4006 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4007 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4008 |
-
hf_kernels_layer_norm
|
| 4009 |
-
_layer_norm_f8ec252::dropout_add_ln_fwd 0.
|
| 4010 |
-
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.
|
| 4011 |
-
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.
|
| 4012 |
-
Activity Buffer Request 26.
|
| 4013 |
-
aten::view 0.
|
| 4014 |
-
aten::empty 0.
|
| 4015 |
-
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.
|
| 4016 |
-
cudaLaunchKernel 0.
|
| 4017 |
-
cudaDeviceSynchronize 69.
|
| 4018 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4019 |
-
Self CPU time total: 6.
|
| 4020 |
-
Self CUDA time total: 4.
|
| 4021 |
|
| 4022 |
|
| 4023 |
|
|
@@ -4027,40 +4027,37 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S4096_D8192
|
|
| 4027 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4028 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4029 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4030 |
-
hf_kernels_layer_norm 1.
|
| 4031 |
-
_layer_norm_f8ec252::dropout_add_ln_fwd 0.
|
| 4032 |
-
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 9.
|
| 4033 |
-
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 9.
|
| 4034 |
-
Activity Buffer Request 14.
|
| 4035 |
-
aten::view 0.
|
| 4036 |
-
aten::empty 0.
|
| 4037 |
-
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.
|
| 4038 |
-
cudaLaunchKernel 2.
|
| 4039 |
-
cudaDeviceSynchronize
|
| 4040 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4041 |
-
Self CPU time total: 11.
|
| 4042 |
-
Self CUDA time total: 9.
|
| 4043 |
|
| 4044 |
|
| 4045 |
impl wl p50(ms) ok
|
| 4046 |
hf_kernels_layer_norm LN_B16_S2048_D4096 0.83 True
|
| 4047 |
hf_kernels_layer_norm LN_B16_S2048_D8192 1.65 True
|
| 4048 |
-
hf_kernels_layer_norm LN_B16_S4096_D4096 1.
|
| 4049 |
hf_kernels_layer_norm LN_B16_S4096_D8192 3.26 True
|
| 4050 |
</pre></div>
|
| 4051 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 4052 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4053 |
<div class="uv-logs-content" style="display: none;">
|
| 4054 |
-
Downloading hf-xet (3.2MiB)
|
| 4055 |
-
Downloaded hf-xet
|
| 4056 |
Installed 14 packages in 12ms
|
| 4057 |
</div>
|
| 4058 |
</div>
|
| 4059 |
-
<div class="cell-stderr">Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s]
|
| 4060 |
-
|
| 4061 |
-
Fetching 4 files:
|
| 4062 |
-
Fetching 4 files:
|
| 4063 |
-
Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.30it/s]</div>
|
| 4064 |
<div class="cell-artifacts">
|
| 4065 |
<h4>Artifacts:</h4>
|
| 4066 |
<a href="artifacts/benchmark/layer_norm.jsonl" class="artifact" target="_blank">layer_norm.jsonl</a>
|
|
|
|
| 3889 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3890 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3891 |
</span> |
|
| 3892 |
+
Cell: benchmark | 6.38s
|
| 3893 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3894 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3895 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3961 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3962 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3963 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3964 |
+
hf_kernels_layer_norm 4.50% 190.523us 50.02% 2.118ms 2.118ms 0.000us 0.00% 3.104ms 3.104ms 1
|
| 3965 |
+
_layer_norm_f8ec252::dropout_add_ln_fwd 1.66% 70.302us 44.96% 1.904ms 634.711us 2.362ms 100.00% 3.104ms 1.035ms 3
|
| 3966 |
+
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.364ms 100.07% 2.364ms 2.364ms 1
|
| 3967 |
+
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 2.362ms 100.00% 2.362ms 787.316us 3
|
| 3968 |
+
Activity Buffer Request 40.99% 1.736ms 40.99% 1.736ms 1.736ms 741.567us 31.40% 741.567us 741.567us 1
|
| 3969 |
+
aten::view 0.56% 23.541us 0.56% 23.541us 3.923us 0.000us 0.00% 0.000us 0.000us 6
|
| 3970 |
+
aten::empty 1.07% 45.480us 1.07% 45.480us 5.053us 0.000us 0.00% 0.000us 0.000us 9
|
| 3971 |
+
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.24% 10.011us 0.24% 10.011us 3.337us 0.000us 0.00% 0.000us 0.000us 3
|
| 3972 |
+
cudaLaunchKernel 1.01% 42.571us 1.01% 42.571us 14.190us 0.000us 0.00% 0.000us 0.000us 3
|
| 3973 |
+
cudaDeviceSynchronize 49.98% 2.117ms 49.98% 2.117ms 2.117ms 0.000us 0.00% 0.000us 0.000us 1
|
| 3974 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3975 |
+
Self CPU time total: 4.235ms
|
| 3976 |
+
Self CUDA time total: 2.362ms
|
| 3977 |
|
| 3978 |
|
| 3979 |
|
|
|
|
| 3983 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3984 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3985 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3986 |
+
hf_kernels_layer_norm 2.21% 144.492us 28.97% 1.894ms 1.894ms 0.000us 0.00% 6.395ms 6.395ms 1
|
| 3987 |
+
_layer_norm_f8ec252::dropout_add_ln_fwd 0.69% 45.222us 26.58% 1.738ms 579.353us 4.814ms 100.00% 6.395ms 2.132ms 3
|
| 3988 |
+
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.816ms 100.03% 4.816ms 4.816ms 1
|
| 3989 |
+
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.814ms 100.00% 4.814ms 1.605ms 3
|
| 3990 |
+
Activity Buffer Request 24.92% 1.629ms 24.92% 1.629ms 1.629ms 1.581ms 32.84% 1.581ms 1.581ms 1
|
| 3991 |
+
aten::view 0.18% 11.541us 0.18% 11.541us 1.923us 0.000us 0.00% 0.000us 0.000us 6
|
| 3992 |
+
aten::empty 0.45% 29.440us 0.45% 29.440us 3.271us 0.000us 0.00% 0.000us 0.000us 9
|
| 3993 |
+
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.08% 5.060us 0.08% 5.060us 1.687us 0.000us 0.00% 0.000us 0.000us 3
|
| 3994 |
+
cudaLaunchKernel 0.45% 29.150us 0.45% 29.150us 9.717us 0.000us 0.00% 0.000us 0.000us 3
|
| 3995 |
+
cudaDeviceSynchronize 71.03% 4.644ms 71.03% 4.644ms 4.644ms 0.000us 0.00% 0.000us 0.000us 1
|
| 3996 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3997 |
+
Self CPU time total: 6.538ms
|
| 3998 |
+
Self CUDA time total: 4.814ms
|
| 3999 |
|
| 4000 |
|
| 4001 |
|
|
|
|
| 4005 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4006 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4007 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4008 |
+
hf_kernels_layer_norm 2.04% 135.241us 30.10% 1.992ms 1.992ms 0.000us 0.00% 6.361ms 6.361ms 1
|
| 4009 |
+
_layer_norm_f8ec252::dropout_add_ln_fwd 0.68% 45.331us 27.89% 1.846ms 615.254us 4.793ms 100.00% 6.361ms 2.120ms 3
|
| 4010 |
+
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.794ms 100.03% 4.794ms 4.794ms 1
|
| 4011 |
+
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.793ms 100.00% 4.793ms 1.598ms 3
|
| 4012 |
+
Activity Buffer Request 26.25% 1.737ms 26.25% 1.737ms 1.737ms 1.569ms 32.73% 1.569ms 1.569ms 1
|
| 4013 |
+
aten::view 0.17% 11.061us 0.17% 11.061us 1.844us 0.000us 0.00% 0.000us 0.000us 6
|
| 4014 |
+
aten::empty 0.44% 29.151us 0.44% 29.151us 3.239us 0.000us 0.00% 0.000us 0.000us 9
|
| 4015 |
+
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.09% 5.831us 0.09% 5.831us 1.944us 0.000us 0.00% 0.000us 0.000us 3
|
| 4016 |
+
cudaLaunchKernel 0.43% 28.320us 0.43% 28.320us 9.440us 0.000us 0.00% 0.000us 0.000us 3
|
| 4017 |
+
cudaDeviceSynchronize 69.90% 4.626ms 69.90% 4.626ms 4.626ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4018 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4019 |
+
Self CPU time total: 6.618ms
|
| 4020 |
+
Self CUDA time total: 4.793ms
|
| 4021 |
|
| 4022 |
|
| 4023 |
|
|
|
|
| 4027 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4028 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4029 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4030 |
+
hf_kernels_layer_norm 1.16% 134.713us 18.89% 2.202ms 2.202ms 0.000us 0.00% 12.808ms 12.808ms 1
|
| 4031 |
+
_layer_norm_f8ec252::dropout_add_ln_fwd 0.38% 44.369us 17.64% 2.056ms 685.371us 9.627ms 100.00% 12.808ms 4.269ms 3
|
| 4032 |
+
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 9.628ms 100.02% 9.628ms 9.628ms 1
|
| 4033 |
+
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 9.627ms 100.00% 9.627ms 3.209ms 3
|
| 4034 |
+
Activity Buffer Request 14.91% 1.739ms 14.91% 1.739ms 1.739ms 3.182ms 33.05% 3.182ms 3.182ms 1
|
| 4035 |
+
aten::view 0.10% 11.381us 0.10% 11.381us 1.897us 0.000us 0.00% 0.000us 0.000us 6
|
| 4036 |
+
aten::empty 0.26% 29.940us 0.26% 29.940us 3.327us 0.000us 0.00% 0.000us 0.000us 9
|
| 4037 |
+
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.04% 4.960us 0.04% 4.960us 1.653us 0.000us 0.00% 0.000us 0.000us 3
|
| 4038 |
+
cudaLaunchKernel 2.04% 237.996us 2.04% 237.996us 79.332us 0.000us 0.00% 0.000us 0.000us 3
|
| 4039 |
+
cudaDeviceSynchronize 81.11% 9.457ms 81.11% 9.457ms 9.457ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4040 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4041 |
+
Self CPU time total: 11.659ms
|
| 4042 |
+
Self CUDA time total: 9.627ms
|
| 4043 |
|
| 4044 |
|
| 4045 |
impl wl p50(ms) ok
|
| 4046 |
hf_kernels_layer_norm LN_B16_S2048_D4096 0.83 True
|
| 4047 |
hf_kernels_layer_norm LN_B16_S2048_D8192 1.65 True
|
| 4048 |
+
hf_kernels_layer_norm LN_B16_S4096_D4096 1.64 True
|
| 4049 |
hf_kernels_layer_norm LN_B16_S4096_D8192 3.26 True
|
| 4050 |
</pre></div>
|
| 4051 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 4052 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4053 |
<div class="uv-logs-content" style="display: none;">
|
|
|
|
|
|
|
| 4054 |
Installed 14 packages in 12ms
|
| 4055 |
</div>
|
| 4056 |
</div>
|
| 4057 |
+
<div class="cell-stderr">Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s]
|
| 4058 |
+
Fetching 4 files: 25%|██▌ | 1/4 [00:00<00:00, 9.25it/s]
|
| 4059 |
+
Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.14it/s]
|
| 4060 |
+
Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.62it/s]</div>
|
|
|
|
| 4061 |
<div class="cell-artifacts">
|
| 4062 |
<h4>Artifacts:</h4>
|
| 4063 |
<a href="artifacts/benchmark/layer_norm.jsonl" class="artifact" target="_blank">layer_norm.jsonl</a>
|
layer_norm/impls/torch_layer_norm.html
CHANGED
|
@@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 3888 |
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3889 |
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3890 |
</span> |
|
| 3891 |
-
Cell: nv | 0.
|
| 3892 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3893 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3894 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3904,7 +3904,7 @@ Cell: nv | 0.30s
|
|
| 3904 |
</div>
|
| 3905 |
</div>
|
| 3906 |
<div id="output-nv" class="cell-output">
|
| 3907 |
-
<div class="cell-stdout"><pre class="stdout-text">Fri Dec 19
|
| 3908 |
+-----------------------------------------------------------------------------------------+
|
| 3909 |
| NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 |
|
| 3910 |
+-----------------------------------------+------------------------+----------------------+
|
|
@@ -3913,7 +3913,7 @@ Cell: nv | 0.30s
|
|
| 3913 |
| | | MIG M. |
|
| 3914 |
|=========================================+========================+======================|
|
| 3915 |
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
|
| 3916 |
-
| N/A
|
| 3917 |
| | | N/A |
|
| 3918 |
+-----------------------------------------+------------------------+----------------------+
|
| 3919 |
|
|
@@ -3937,7 +3937,7 @@ Cell: nv | 0.30s
|
|
| 3937 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3938 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3939 |
</span> |
|
| 3940 |
-
Cell: benchmark |
|
| 3941 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3942 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3943 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3985,19 +3985,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S2048_D4096
|
|
| 3985 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3986 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3987 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3988 |
-
torch_layer_norm
|
| 3989 |
-
aten::layer_norm 0.
|
| 3990 |
-
aten::native_layer_norm
|
| 3991 |
-
torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3992 |
-
void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3993 |
-
Activity Buffer Request
|
| 3994 |
-
aten::empty 1.
|
| 3995 |
-
cudaLaunchKernel
|
| 3996 |
-
aten::view 0.
|
| 3997 |
-
cudaDeviceSynchronize
|
| 3998 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3999 |
-
Self CPU time total:
|
| 4000 |
-
Self CUDA time total: 2.
|
| 4001 |
|
| 4002 |
|
| 4003 |
|
|
@@ -4007,19 +4007,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S2048_D8192
|
|
| 4007 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4008 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4009 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4010 |
-
torch_layer_norm 1.
|
| 4011 |
-
aten::layer_norm 0.
|
| 4012 |
-
aten::native_layer_norm 0.
|
| 4013 |
-
torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.
|
| 4014 |
-
void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 4.
|
| 4015 |
-
Activity Buffer Request 25.
|
| 4016 |
-
aten::empty 0.
|
| 4017 |
-
cudaLaunchKernel 0.
|
| 4018 |
-
aten::view 0.
|
| 4019 |
-
cudaDeviceSynchronize 71.
|
| 4020 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4021 |
-
Self CPU time total: 6.
|
| 4022 |
-
Self CUDA time total: 4.
|
| 4023 |
|
| 4024 |
|
| 4025 |
|
|
@@ -4029,19 +4029,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S4096_D4096
|
|
| 4029 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4030 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4031 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4032 |
-
torch_layer_norm 1.
|
| 4033 |
-
aten::layer_norm 0.
|
| 4034 |
-
aten::native_layer_norm 0.
|
| 4035 |
-
torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.
|
| 4036 |
-
void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 4.
|
| 4037 |
-
Activity Buffer Request 26.
|
| 4038 |
-
aten::empty 0.
|
| 4039 |
-
cudaLaunchKernel 0.43%
|
| 4040 |
-
aten::view 0.06%
|
| 4041 |
-
cudaDeviceSynchronize 70.
|
| 4042 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4043 |
-
Self CPU time total: 6.
|
| 4044 |
-
Self CUDA time total: 4.
|
| 4045 |
|
| 4046 |
|
| 4047 |
|
|
@@ -4051,23 +4051,23 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S4096_D8192
|
|
| 4051 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4052 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4053 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4054 |
-
torch_layer_norm 0.
|
| 4055 |
-
aten::layer_norm 0.08%
|
| 4056 |
-
aten::native_layer_norm 0.
|
| 4057 |
-
torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 9.
|
| 4058 |
-
void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 9.
|
| 4059 |
-
Activity Buffer Request 11.
|
| 4060 |
-
aten::empty 0.
|
| 4061 |
-
cudaLaunchKernel
|
| 4062 |
-
aten::view 0.04% 4.
|
| 4063 |
-
cudaDeviceSynchronize
|
| 4064 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4065 |
-
Self CPU time total: 11.
|
| 4066 |
-
Self CUDA time total: 9.
|
| 4067 |
|
| 4068 |
|
| 4069 |
impl wl p50(ms) ok
|
| 4070 |
-
torch_layer_norm LN_B16_S2048_D4096 0.
|
| 4071 |
torch_layer_norm LN_B16_S2048_D8192 1.68 True
|
| 4072 |
torch_layer_norm LN_B16_S4096_D4096 1.61 True
|
| 4073 |
torch_layer_norm LN_B16_S4096_D8192 3.32 True
|
|
@@ -4075,53 +4075,7 @@ torch_layer_norm LN_B16_S4096_D8192 3.32 True
|
|
| 4075 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 4076 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4077 |
<div class="uv-logs-content" style="display: none;">
|
| 4078 |
-
|
| 4079 |
-
Downloading sympy (6.0MiB)
|
| 4080 |
-
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 4081 |
-
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 4082 |
-
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 4083 |
-
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 4084 |
-
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 4085 |
-
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 4086 |
-
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 4087 |
-
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 4088 |
-
Downloading numpy (16.1MiB)
|
| 4089 |
-
Downloading setuptools (1.1MiB)
|
| 4090 |
-
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4091 |
-
Downloading kiwisolver (1.4MiB)
|
| 4092 |
-
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 4093 |
-
Downloading matplotlib (8.3MiB)
|
| 4094 |
-
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 4095 |
-
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 4096 |
-
Downloading fonttools (4.8MiB)
|
| 4097 |
-
Downloading pillow (6.7MiB)
|
| 4098 |
-
Downloading networkx (2.0MiB)
|
| 4099 |
-
Downloading torch (846.9MiB)
|
| 4100 |
-
Downloading triton (148.3MiB)
|
| 4101 |
-
Downloaded nvidia-cufile-cu12
|
| 4102 |
-
Downloaded kiwisolver
|
| 4103 |
-
Downloaded setuptools
|
| 4104 |
-
Downloaded networkx
|
| 4105 |
-
Downloaded fonttools
|
| 4106 |
-
Downloaded pillow
|
| 4107 |
-
Built kernels-benchmark-tools @ file:///__w/kernels-benchmarks/kernels-benchmarks/tools
|
| 4108 |
-
Downloaded nvidia-cuda-cupti-cu12
|
| 4109 |
-
Downloaded matplotlib
|
| 4110 |
-
Downloaded numpy
|
| 4111 |
-
Downloaded sympy
|
| 4112 |
-
Downloaded nvidia-nvjitlink-cu12
|
| 4113 |
-
Downloaded nvidia-curand-cu12
|
| 4114 |
-
Downloaded nvidia-cuda-nvrtc-cu12
|
| 4115 |
-
Downloaded triton
|
| 4116 |
-
Downloaded nvidia-cufft-cu12
|
| 4117 |
-
Downloaded nvidia-cusolver-cu12
|
| 4118 |
-
Downloaded nvidia-cusparselt-cu12
|
| 4119 |
-
Downloaded nvidia-cusparse-cu12
|
| 4120 |
-
Downloaded nvidia-nccl-cu12
|
| 4121 |
-
Downloaded nvidia-cublas-cu12
|
| 4122 |
-
Downloaded nvidia-cudnn-cu12
|
| 4123 |
-
Downloaded torch
|
| 4124 |
-
Installed 37 packages in 284ms
|
| 4125 |
</div>
|
| 4126 |
</div>
|
| 4127 |
<div class="cell-artifacts">
|
|
|
|
| 3888 |
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3889 |
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3890 |
</span> |
|
| 3891 |
+
Cell: nv | 0.25s
|
| 3892 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3893 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3894 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3904 |
</div>
|
| 3905 |
</div>
|
| 3906 |
<div id="output-nv" class="cell-output">
|
| 3907 |
+
<div class="cell-stdout"><pre class="stdout-text">Fri Dec 19 22:48:33 2025
|
| 3908 |
+-----------------------------------------------------------------------------------------+
|
| 3909 |
| NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 |
|
| 3910 |
+-----------------------------------------+------------------------+----------------------+
|
|
|
|
| 3913 |
| | | MIG M. |
|
| 3914 |
|=========================================+========================+======================|
|
| 3915 |
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
|
| 3916 |
+
| N/A 30C P0 107W / 350W | 0MiB / 46068MiB | 68% Default |
|
| 3917 |
| | | N/A |
|
| 3918 |
+-----------------------------------------+------------------------+----------------------+
|
| 3919 |
|
|
|
|
| 3937 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3938 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3939 |
</span> |
|
| 3940 |
+
Cell: benchmark | 7.61s
|
| 3941 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3942 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3943 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3985 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3986 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3987 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3988 |
+
torch_layer_norm 3.61% 151.022us 49.72% 2.081ms 2.081ms 0.000us 0.00% 3.037ms 3.037ms 1
|
| 3989 |
+
aten::layer_norm 0.35% 14.701us 46.11% 1.930ms 643.468us 0.000us 0.00% 3.037ms 1.012ms 3
|
| 3990 |
+
aten::native_layer_norm 1.79% 75.131us 45.76% 1.916ms 638.567us 2.326ms 100.00% 3.037ms 1.012ms 3
|
| 3991 |
+
torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.327ms 100.06% 2.327ms 2.327ms 1
|
| 3992 |
+
void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 2.326ms 100.00% 2.326ms 775.187us 3
|
| 3993 |
+
Activity Buffer Request 41.50% 1.738ms 41.50% 1.738ms 1.738ms 711.774us 30.61% 711.774us 711.774us 1
|
| 3994 |
+
aten::empty 1.17% 48.860us 1.17% 48.860us 5.429us 0.000us 0.00% 0.000us 0.000us 9
|
| 3995 |
+
cudaLaunchKernel 1.12% 46.753us 1.12% 46.753us 15.584us 0.000us 0.00% 0.000us 0.000us 3
|
| 3996 |
+
aten::view 0.18% 7.441us 0.18% 7.441us 1.240us 0.000us 0.00% 0.000us 0.000us 6
|
| 3997 |
+
cudaDeviceSynchronize 50.28% 2.105ms 50.28% 2.105ms 2.105ms 0.000us 0.00% 0.000us 0.000us 1
|
| 3998 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3999 |
+
Self CPU time total: 4.186ms
|
| 4000 |
+
Self CUDA time total: 2.326ms
|
| 4001 |
|
| 4002 |
|
| 4003 |
|
|
|
|
| 4007 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4008 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4009 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4010 |
+
torch_layer_norm 1.05% 69.561us 28.39% 1.886ms 1.886ms 0.000us 0.00% 6.477ms 6.477ms 1
|
| 4011 |
+
aten::layer_norm 0.13% 8.670us 27.34% 1.816ms 605.463us 0.000us 0.00% 6.477ms 2.159ms 3
|
| 4012 |
+
aten::native_layer_norm 0.77% 50.957us 27.21% 1.808ms 602.573us 4.891ms 100.00% 6.477ms 2.159ms 3
|
| 4013 |
+
torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.893ms 100.03% 4.893ms 4.893ms 1
|
| 4014 |
+
void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 4.891ms 100.00% 4.891ms 1.630ms 3
|
| 4015 |
+
Activity Buffer Request 25.53% 1.696ms 25.53% 1.696ms 1.696ms 1.586ms 32.42% 1.586ms 1.586ms 1
|
| 4016 |
+
aten::empty 0.45% 29.753us 0.45% 29.753us 3.306us 0.000us 0.00% 0.000us 0.000us 9
|
| 4017 |
+
cudaLaunchKernel 0.41% 27.542us 0.41% 27.542us 9.181us 0.000us 0.00% 0.000us 0.000us 3
|
| 4018 |
+
aten::view 0.05% 3.522us 0.05% 3.522us 0.587us 0.000us 0.00% 0.000us 0.000us 6
|
| 4019 |
+
cudaDeviceSynchronize 71.61% 4.758ms 71.61% 4.758ms 4.758ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4020 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4021 |
+
Self CPU time total: 6.643ms
|
| 4022 |
+
Self CUDA time total: 4.891ms
|
| 4023 |
|
| 4024 |
|
| 4025 |
|
|
|
|
| 4029 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4030 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4031 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4032 |
+
torch_layer_norm 1.06% 68.562us 29.18% 1.889ms 1.889ms 0.000us 0.00% 6.234ms 6.234ms 1
|
| 4033 |
+
aten::layer_norm 0.14% 9.330us 28.12% 1.821ms 606.966us 0.000us 0.00% 6.234ms 2.078ms 3
|
| 4034 |
+
aten::native_layer_norm 0.78% 50.590us 27.97% 1.812ms 603.856us 4.719ms 100.00% 6.234ms 2.078ms 3
|
| 4035 |
+
torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.721ms 100.03% 4.721ms 4.721ms 1
|
| 4036 |
+
void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 4.719ms 100.00% 4.719ms 1.573ms 3
|
| 4037 |
+
Activity Buffer Request 26.26% 1.700ms 26.26% 1.700ms 1.700ms 1.515ms 32.11% 1.515ms 1.515ms 1
|
| 4038 |
+
aten::empty 0.44% 28.660us 0.44% 28.660us 3.184us 0.000us 0.00% 0.000us 0.000us 9
|
| 4039 |
+
cudaLaunchKernel 0.43% 28.042us 0.43% 28.042us 9.347us 0.000us 0.00% 0.000us 0.000us 3
|
| 4040 |
+
aten::view 0.06% 3.840us 0.06% 3.840us 0.640us 0.000us 0.00% 0.000us 0.000us 6
|
| 4041 |
+
cudaDeviceSynchronize 70.82% 4.586ms 70.82% 4.586ms 4.586ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4042 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4043 |
+
Self CPU time total: 6.476ms
|
| 4044 |
+
Self CUDA time total: 4.719ms
|
| 4045 |
|
| 4046 |
|
| 4047 |
|
|
|
|
| 4051 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4052 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4053 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4054 |
+
torch_layer_norm 0.64% 72.823us 14.96% 1.710ms 1.710ms 0.000us 0.00% 13.144ms 13.144ms 1
|
| 4055 |
+
aten::layer_norm 0.08% 8.940us 14.32% 1.637ms 545.678us 0.000us 0.00% 13.144ms 4.381ms 3
|
| 4056 |
+
aten::native_layer_norm 0.49% 56.431us 14.24% 1.628ms 542.698us 9.871ms 100.00% 13.144ms 4.381ms 3
|
| 4057 |
+
torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 9.872ms 100.02% 9.872ms 9.872ms 1
|
| 4058 |
+
void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 9.871ms 100.00% 9.871ms 3.290ms 3
|
| 4059 |
+
Activity Buffer Request 11.76% 1.344ms 11.76% 1.344ms 1.344ms 3.273ms 33.16% 3.273ms 3.273ms 1
|
| 4060 |
+
aten::empty 0.26% 29.920us 0.26% 29.920us 3.324us 0.000us 0.00% 0.000us 0.000us 9
|
| 4061 |
+
cudaLaunchKernel 1.69% 193.294us 1.69% 193.294us 64.431us 0.000us 0.00% 0.000us 0.000us 3
|
| 4062 |
+
aten::view 0.04% 4.390us 0.04% 4.390us 0.732us 0.000us 0.00% 0.000us 0.000us 6
|
| 4063 |
+
cudaDeviceSynchronize 85.04% 9.722ms 85.04% 9.722ms 9.722ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4064 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4065 |
+
Self CPU time total: 11.432ms
|
| 4066 |
+
Self CUDA time total: 9.871ms
|
| 4067 |
|
| 4068 |
|
| 4069 |
impl wl p50(ms) ok
|
| 4070 |
+
torch_layer_norm LN_B16_S2048_D4096 0.82 True
|
| 4071 |
torch_layer_norm LN_B16_S2048_D8192 1.68 True
|
| 4072 |
torch_layer_norm LN_B16_S4096_D4096 1.61 True
|
| 4073 |
torch_layer_norm LN_B16_S4096_D8192 3.32 True
|
|
|
|
| 4075 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 4076 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4077 |
<div class="uv-logs-content" style="display: none;">
|
| 4078 |
+
Installed 37 packages in 201ms
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4079 |
</div>
|
| 4080 |
</div>
|
| 4081 |
<div class="cell-artifacts">
|
layer_norm/results/artifacts/combine/latency.svg
CHANGED
|
|
Git LFS Details
|
|
|
Git LFS Details
|
layer_norm/results/combined_results.html
CHANGED
|
@@ -3889,7 +3889,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 3889 |
<rdf:RDF>
|
| 3890 |
<ns2:Work>
|
| 3891 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 3892 |
-
<dc:date>2025-12-
|
| 3893 |
<dc:format>image/svg+xml</dc:format>
|
| 3894 |
<dc:creator>
|
| 3895 |
<ns2:Agent>
|
|
@@ -3973,70 +3973,70 @@ body[data-tool="eraser"] .main-content {
|
|
| 3973 |
<g id="matplotlib.axis_2">
|
| 3974 |
<g id="ytick_1">
|
| 3975 |
<g id="grid-y--2" class="grid grid-y">
|
| 3976 |
-
<path d="M 47.72
|
| 3977 |
</g>
|
| 3978 |
<g id="line2d_5">
|
| 3979 |
<defs>
|
| 3980 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 3981 |
</defs>
|
| 3982 |
<g>
|
| 3983 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="
|
| 3984 |
</g>
|
| 3985 |
</g>
|
| 3986 |
<g id="text_5">
|
| 3987 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="
|
| 3988 |
</g>
|
| 3989 |
</g>
|
| 3990 |
<g id="ytick_2">
|
| 3991 |
<g id="grid-y--3" class="grid grid-y">
|
| 3992 |
-
<path d="M 47.72
|
| 3993 |
</g>
|
| 3994 |
<g id="line2d_6">
|
| 3995 |
<g>
|
| 3996 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="
|
| 3997 |
</g>
|
| 3998 |
</g>
|
| 3999 |
<g id="text_6">
|
| 4000 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="334.
|
| 4001 |
</g>
|
| 4002 |
</g>
|
| 4003 |
<g id="ytick_3">
|
| 4004 |
<g id="grid-y--4" class="grid grid-y">
|
| 4005 |
-
<path d="M 47.72 252.
|
| 4006 |
</g>
|
| 4007 |
<g id="line2d_7">
|
| 4008 |
<g>
|
| 4009 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="252.
|
| 4010 |
</g>
|
| 4011 |
</g>
|
| 4012 |
<g id="text_7">
|
| 4013 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="256.
|
| 4014 |
</g>
|
| 4015 |
</g>
|
| 4016 |
<g id="ytick_4">
|
| 4017 |
<g id="grid-y--5" class="grid grid-y">
|
| 4018 |
-
<path d="M 47.72 174.
|
| 4019 |
</g>
|
| 4020 |
<g id="line2d_8">
|
| 4021 |
<g>
|
| 4022 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="174.
|
| 4023 |
</g>
|
| 4024 |
</g>
|
| 4025 |
<g id="text_8">
|
| 4026 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="178.
|
| 4027 |
</g>
|
| 4028 |
</g>
|
| 4029 |
<g id="ytick_5">
|
| 4030 |
<g id="grid-y--6" class="grid grid-y">
|
| 4031 |
-
<path d="M 47.72 96.
|
| 4032 |
</g>
|
| 4033 |
<g id="line2d_9">
|
| 4034 |
<g>
|
| 4035 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="96.
|
| 4036 |
</g>
|
| 4037 |
</g>
|
| 4038 |
<g id="text_9">
|
| 4039 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="100.
|
| 4040 |
</g>
|
| 4041 |
</g>
|
| 4042 |
<g id="label--y" class="ylabel">
|
|
@@ -4044,27 +4044,27 @@ body[data-tool="eraser"] .main-content {
|
|
| 4044 |
</g>
|
| 4045 |
</g>
|
| 4046 |
<g id="series--torch-layer-norm" class="series">
|
| 4047 |
-
<path d="M 83.741924 437.689571 L 323.888085
|
| 4048 |
<defs>
|
| 4049 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4050 |
</defs>
|
| 4051 |
<g clip-path="url(#p2214f54723)">
|
| 4052 |
<use ns4:href="#md7efaf3aec" x="83.741924" y="437.689571" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4053 |
-
<use ns4:href="#md7efaf3aec" x="323.888085" y="
|
| 4054 |
-
<use ns4:href="#md7efaf3aec" x="564.034245" y="
|
| 4055 |
<use ns4:href="#md7efaf3aec" x="804.180406" y="46.442361" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4056 |
</g>
|
| 4057 |
</g>
|
| 4058 |
<g id="series--hf-kernels-layer-norm" class="series">
|
| 4059 |
-
<path d="M 83.741924
|
| 4060 |
<defs>
|
| 4061 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4062 |
</defs>
|
| 4063 |
<g clip-path="url(#p2214f54723)">
|
| 4064 |
-
<use ns4:href="#m9b8c54d372" x="83.741924" y="
|
| 4065 |
-
<use ns4:href="#m9b8c54d372" x="323.888085" y="
|
| 4066 |
-
<use ns4:href="#m9b8c54d372" x="564.034245" y="
|
| 4067 |
-
<use ns4:href="#m9b8c54d372" x="804.180406" y="56.
|
| 4068 |
</g>
|
| 4069 |
</g>
|
| 4070 |
<g id="patch_3">
|
|
@@ -4122,7 +4122,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 4122 |
<span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
|
| 4123 |
<span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4124 |
</span> |
|
| 4125 |
-
Cell: combine | 4.
|
| 4126 |
| <button class="run-btn" onclick="runCell('combine')">▶ run</button>
|
| 4127 |
<button class="copy-btn" onclick="copyCell('combine')">Copy</button>
|
| 4128 |
<a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -4211,9 +4211,9 @@ COMBINED BENCHMARK SUMMARY
|
|
| 4211 |
impl wl p50(ms) ok
|
| 4212 |
hf_kernels_layer_norm LN_B16_S2048_D4096 0.83 True
|
| 4213 |
hf_kernels_layer_norm LN_B16_S2048_D8192 1.65 True
|
| 4214 |
-
hf_kernels_layer_norm LN_B16_S4096_D4096 1.
|
| 4215 |
hf_kernels_layer_norm LN_B16_S4096_D8192 3.26 True
|
| 4216 |
-
torch_layer_norm LN_B16_S2048_D4096 0.
|
| 4217 |
torch_layer_norm LN_B16_S2048_D8192 1.68 True
|
| 4218 |
torch_layer_norm LN_B16_S4096_D4096 1.61 True
|
| 4219 |
torch_layer_norm LN_B16_S4096_D8192 3.32 True
|
|
@@ -4236,7 +4236,7 @@ Implementations included:
|
|
| 4236 |
<div class="uv-install-logs" id="uv-logs-combine">
|
| 4237 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4238 |
<div class="uv-logs-content" style="display: none;">
|
| 4239 |
-
Installed 37 packages in
|
| 4240 |
</div>
|
| 4241 |
</div>
|
| 4242 |
<div class="cell-artifacts">
|
|
@@ -4249,7 +4249,7 @@ Installed 37 packages in 299ms
|
|
| 4249 |
<rdf:RDF>
|
| 4250 |
<ns2:Work>
|
| 4251 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 4252 |
-
<dc:date>2025-12-
|
| 4253 |
<dc:format>image/svg+xml</dc:format>
|
| 4254 |
<dc:creator>
|
| 4255 |
<ns2:Agent>
|
|
@@ -4333,70 +4333,70 @@ Installed 37 packages in 299ms
|
|
| 4333 |
<g id="matplotlib.axis_2">
|
| 4334 |
<g id="ytick_1">
|
| 4335 |
<g id="grid-y--2" class="grid grid-y">
|
| 4336 |
-
<path d="M 47.72
|
| 4337 |
</g>
|
| 4338 |
<g id="line2d_5">
|
| 4339 |
<defs>
|
| 4340 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4341 |
</defs>
|
| 4342 |
<g>
|
| 4343 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="
|
| 4344 |
</g>
|
| 4345 |
</g>
|
| 4346 |
<g id="text_5">
|
| 4347 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="
|
| 4348 |
</g>
|
| 4349 |
</g>
|
| 4350 |
<g id="ytick_2">
|
| 4351 |
<g id="grid-y--3" class="grid grid-y">
|
| 4352 |
-
<path d="M 47.72
|
| 4353 |
</g>
|
| 4354 |
<g id="line2d_6">
|
| 4355 |
<g>
|
| 4356 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="
|
| 4357 |
</g>
|
| 4358 |
</g>
|
| 4359 |
<g id="text_6">
|
| 4360 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="334.
|
| 4361 |
</g>
|
| 4362 |
</g>
|
| 4363 |
<g id="ytick_3">
|
| 4364 |
<g id="grid-y--4" class="grid grid-y">
|
| 4365 |
-
<path d="M 47.72 252.
|
| 4366 |
</g>
|
| 4367 |
<g id="line2d_7">
|
| 4368 |
<g>
|
| 4369 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="252.
|
| 4370 |
</g>
|
| 4371 |
</g>
|
| 4372 |
<g id="text_7">
|
| 4373 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="256.
|
| 4374 |
</g>
|
| 4375 |
</g>
|
| 4376 |
<g id="ytick_4">
|
| 4377 |
<g id="grid-y--5" class="grid grid-y">
|
| 4378 |
-
<path d="M 47.72 174.
|
| 4379 |
</g>
|
| 4380 |
<g id="line2d_8">
|
| 4381 |
<g>
|
| 4382 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="174.
|
| 4383 |
</g>
|
| 4384 |
</g>
|
| 4385 |
<g id="text_8">
|
| 4386 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="178.
|
| 4387 |
</g>
|
| 4388 |
</g>
|
| 4389 |
<g id="ytick_5">
|
| 4390 |
<g id="grid-y--6" class="grid grid-y">
|
| 4391 |
-
<path d="M 47.72 96.
|
| 4392 |
</g>
|
| 4393 |
<g id="line2d_9">
|
| 4394 |
<g>
|
| 4395 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="96.
|
| 4396 |
</g>
|
| 4397 |
</g>
|
| 4398 |
<g id="text_9">
|
| 4399 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="100.
|
| 4400 |
</g>
|
| 4401 |
</g>
|
| 4402 |
<g id="label--y" class="ylabel">
|
|
@@ -4404,27 +4404,27 @@ Installed 37 packages in 299ms
|
|
| 4404 |
</g>
|
| 4405 |
</g>
|
| 4406 |
<g id="series--torch-layer-norm" class="series">
|
| 4407 |
-
<path d="M 83.741924 437.689571 L 323.888085
|
| 4408 |
<defs>
|
| 4409 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4410 |
</defs>
|
| 4411 |
<g clip-path="url(#p2214f54723)">
|
| 4412 |
<use ns4:href="#md7efaf3aec" x="83.741924" y="437.689571" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4413 |
-
<use ns4:href="#md7efaf3aec" x="323.888085" y="
|
| 4414 |
-
<use ns4:href="#md7efaf3aec" x="564.034245" y="
|
| 4415 |
<use ns4:href="#md7efaf3aec" x="804.180406" y="46.442361" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4416 |
</g>
|
| 4417 |
</g>
|
| 4418 |
<g id="series--hf-kernels-layer-norm" class="series">
|
| 4419 |
-
<path d="M 83.741924
|
| 4420 |
<defs>
|
| 4421 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4422 |
</defs>
|
| 4423 |
<g clip-path="url(#p2214f54723)">
|
| 4424 |
-
<use ns4:href="#m9b8c54d372" x="83.741924" y="
|
| 4425 |
-
<use ns4:href="#m9b8c54d372" x="323.888085" y="
|
| 4426 |
-
<use ns4:href="#m9b8c54d372" x="564.034245" y="
|
| 4427 |
-
<use ns4:href="#m9b8c54d372" x="804.180406" y="56.
|
| 4428 |
</g>
|
| 4429 |
</g>
|
| 4430 |
<g id="patch_3">
|
|
|
|
| 3889 |
<rdf:RDF>
|
| 3890 |
<ns2:Work>
|
| 3891 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 3892 |
+
<dc:date>2025-12-19T23:02:49.888978</dc:date>
|
| 3893 |
<dc:format>image/svg+xml</dc:format>
|
| 3894 |
<dc:creator>
|
| 3895 |
<ns2:Agent>
|
|
|
|
| 3973 |
<g id="matplotlib.axis_2">
|
| 3974 |
<g id="ytick_1">
|
| 3975 |
<g id="grid-y--2" class="grid grid-y">
|
| 3976 |
+
<path d="M 47.72 409.375905 L 840.20233 409.375905 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 3977 |
</g>
|
| 3978 |
<g id="line2d_5">
|
| 3979 |
<defs>
|
| 3980 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 3981 |
</defs>
|
| 3982 |
<g>
|
| 3983 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="409.375905" style="stroke: #000000; stroke-width: 0.8" />
|
| 3984 |
</g>
|
| 3985 |
</g>
|
| 3986 |
<g id="text_5">
|
| 3987 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="413.175123" transform="rotate(-0 40.72 413.175123)">1.0</text>
|
| 3988 |
</g>
|
| 3989 |
</g>
|
| 3990 |
<g id="ytick_2">
|
| 3991 |
<g id="grid-y--3" class="grid grid-y">
|
| 3992 |
+
<path d="M 47.72 331.097781 L 840.20233 331.097781 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 3993 |
</g>
|
| 3994 |
<g id="line2d_6">
|
| 3995 |
<g>
|
| 3996 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="331.097781" style="stroke: #000000; stroke-width: 0.8" />
|
| 3997 |
</g>
|
| 3998 |
</g>
|
| 3999 |
<g id="text_6">
|
| 4000 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="334.897" transform="rotate(-0 40.72 334.897)">1.5</text>
|
| 4001 |
</g>
|
| 4002 |
</g>
|
| 4003 |
<g id="ytick_3">
|
| 4004 |
<g id="grid-y--4" class="grid grid-y">
|
| 4005 |
+
<path d="M 47.72 252.819658 L 840.20233 252.819658 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4006 |
</g>
|
| 4007 |
<g id="line2d_7">
|
| 4008 |
<g>
|
| 4009 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="252.819658" style="stroke: #000000; stroke-width: 0.8" />
|
| 4010 |
</g>
|
| 4011 |
</g>
|
| 4012 |
<g id="text_7">
|
| 4013 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="256.618877" transform="rotate(-0 40.72 256.618877)">2.0</text>
|
| 4014 |
</g>
|
| 4015 |
</g>
|
| 4016 |
<g id="ytick_4">
|
| 4017 |
<g id="grid-y--5" class="grid grid-y">
|
| 4018 |
+
<path d="M 47.72 174.541535 L 840.20233 174.541535 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4019 |
</g>
|
| 4020 |
<g id="line2d_8">
|
| 4021 |
<g>
|
| 4022 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="174.541535" style="stroke: #000000; stroke-width: 0.8" />
|
| 4023 |
</g>
|
| 4024 |
</g>
|
| 4025 |
<g id="text_8">
|
| 4026 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="178.340753" transform="rotate(-0 40.72 178.340753)">2.5</text>
|
| 4027 |
</g>
|
| 4028 |
</g>
|
| 4029 |
<g id="ytick_5">
|
| 4030 |
<g id="grid-y--6" class="grid grid-y">
|
| 4031 |
+
<path d="M 47.72 96.263411 L 840.20233 96.263411 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4032 |
</g>
|
| 4033 |
<g id="line2d_9">
|
| 4034 |
<g>
|
| 4035 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="96.263411" style="stroke: #000000; stroke-width: 0.8" />
|
| 4036 |
</g>
|
| 4037 |
</g>
|
| 4038 |
<g id="text_9">
|
| 4039 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="100.06263" transform="rotate(-0 40.72 100.06263)">3.0</text>
|
| 4040 |
</g>
|
| 4041 |
</g>
|
| 4042 |
<g id="label--y" class="ylabel">
|
|
|
|
| 4044 |
</g>
|
| 4045 |
</g>
|
| 4046 |
<g id="series--torch-layer-norm" class="series">
|
| 4047 |
+
<path d="M 83.741924 437.689571 L 323.888085 303.198519 L 564.034245 314.331547 L 804.180406 46.442361 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 4048 |
<defs>
|
| 4049 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4050 |
</defs>
|
| 4051 |
<g clip-path="url(#p2214f54723)">
|
| 4052 |
<use ns4:href="#md7efaf3aec" x="83.741924" y="437.689571" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4053 |
+
<use ns4:href="#md7efaf3aec" x="323.888085" y="303.198519" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4054 |
+
<use ns4:href="#md7efaf3aec" x="564.034245" y="314.331547" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4055 |
<use ns4:href="#md7efaf3aec" x="804.180406" y="46.442361" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4056 |
</g>
|
| 4057 |
</g>
|
| 4058 |
<g id="series--hf-kernels-layer-norm" class="series">
|
| 4059 |
+
<path d="M 83.741924 435.6307 L 323.888085 308.184835 L 564.034245 308.438456 L 804.180406 56.036284 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 4060 |
<defs>
|
| 4061 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4062 |
</defs>
|
| 4063 |
<g clip-path="url(#p2214f54723)">
|
| 4064 |
+
<use ns4:href="#m9b8c54d372" x="83.741924" y="435.6307" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4065 |
+
<use ns4:href="#m9b8c54d372" x="323.888085" y="308.184835" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4066 |
+
<use ns4:href="#m9b8c54d372" x="564.034245" y="308.438456" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4067 |
+
<use ns4:href="#m9b8c54d372" x="804.180406" y="56.036284" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4068 |
</g>
|
| 4069 |
</g>
|
| 4070 |
<g id="patch_3">
|
|
|
|
| 4122 |
<span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
|
| 4123 |
<span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4124 |
</span> |
|
| 4125 |
+
Cell: combine | 4.43s
|
| 4126 |
| <button class="run-btn" onclick="runCell('combine')">▶ run</button>
|
| 4127 |
<button class="copy-btn" onclick="copyCell('combine')">Copy</button>
|
| 4128 |
<a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 4211 |
impl wl p50(ms) ok
|
| 4212 |
hf_kernels_layer_norm LN_B16_S2048_D4096 0.83 True
|
| 4213 |
hf_kernels_layer_norm LN_B16_S2048_D8192 1.65 True
|
| 4214 |
+
hf_kernels_layer_norm LN_B16_S4096_D4096 1.64 True
|
| 4215 |
hf_kernels_layer_norm LN_B16_S4096_D8192 3.26 True
|
| 4216 |
+
torch_layer_norm LN_B16_S2048_D4096 0.82 True
|
| 4217 |
torch_layer_norm LN_B16_S2048_D8192 1.68 True
|
| 4218 |
torch_layer_norm LN_B16_S4096_D4096 1.61 True
|
| 4219 |
torch_layer_norm LN_B16_S4096_D8192 3.32 True
|
|
|
|
| 4236 |
<div class="uv-install-logs" id="uv-logs-combine">
|
| 4237 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4238 |
<div class="uv-logs-content" style="display: none;">
|
| 4239 |
+
Installed 37 packages in 283ms
|
| 4240 |
</div>
|
| 4241 |
</div>
|
| 4242 |
<div class="cell-artifacts">
|
|
|
|
| 4249 |
<rdf:RDF>
|
| 4250 |
<ns2:Work>
|
| 4251 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 4252 |
+
<dc:date>2025-12-19T23:02:49.888978</dc:date>
|
| 4253 |
<dc:format>image/svg+xml</dc:format>
|
| 4254 |
<dc:creator>
|
| 4255 |
<ns2:Agent>
|
|
|
|
| 4333 |
<g id="matplotlib.axis_2">
|
| 4334 |
<g id="ytick_1">
|
| 4335 |
<g id="grid-y--2" class="grid grid-y">
|
| 4336 |
+
<path d="M 47.72 409.375905 L 840.20233 409.375905 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4337 |
</g>
|
| 4338 |
<g id="line2d_5">
|
| 4339 |
<defs>
|
| 4340 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4341 |
</defs>
|
| 4342 |
<g>
|
| 4343 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="409.375905" style="stroke: #000000; stroke-width: 0.8" />
|
| 4344 |
</g>
|
| 4345 |
</g>
|
| 4346 |
<g id="text_5">
|
| 4347 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="413.175123" transform="rotate(-0 40.72 413.175123)">1.0</text>
|
| 4348 |
</g>
|
| 4349 |
</g>
|
| 4350 |
<g id="ytick_2">
|
| 4351 |
<g id="grid-y--3" class="grid grid-y">
|
| 4352 |
+
<path d="M 47.72 331.097781 L 840.20233 331.097781 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4353 |
</g>
|
| 4354 |
<g id="line2d_6">
|
| 4355 |
<g>
|
| 4356 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="331.097781" style="stroke: #000000; stroke-width: 0.8" />
|
| 4357 |
</g>
|
| 4358 |
</g>
|
| 4359 |
<g id="text_6">
|
| 4360 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="334.897" transform="rotate(-0 40.72 334.897)">1.5</text>
|
| 4361 |
</g>
|
| 4362 |
</g>
|
| 4363 |
<g id="ytick_3">
|
| 4364 |
<g id="grid-y--4" class="grid grid-y">
|
| 4365 |
+
<path d="M 47.72 252.819658 L 840.20233 252.819658 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4366 |
</g>
|
| 4367 |
<g id="line2d_7">
|
| 4368 |
<g>
|
| 4369 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="252.819658" style="stroke: #000000; stroke-width: 0.8" />
|
| 4370 |
</g>
|
| 4371 |
</g>
|
| 4372 |
<g id="text_7">
|
| 4373 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="256.618877" transform="rotate(-0 40.72 256.618877)">2.0</text>
|
| 4374 |
</g>
|
| 4375 |
</g>
|
| 4376 |
<g id="ytick_4">
|
| 4377 |
<g id="grid-y--5" class="grid grid-y">
|
| 4378 |
+
<path d="M 47.72 174.541535 L 840.20233 174.541535 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4379 |
</g>
|
| 4380 |
<g id="line2d_8">
|
| 4381 |
<g>
|
| 4382 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="174.541535" style="stroke: #000000; stroke-width: 0.8" />
|
| 4383 |
</g>
|
| 4384 |
</g>
|
| 4385 |
<g id="text_8">
|
| 4386 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="178.340753" transform="rotate(-0 40.72 178.340753)">2.5</text>
|
| 4387 |
</g>
|
| 4388 |
</g>
|
| 4389 |
<g id="ytick_5">
|
| 4390 |
<g id="grid-y--6" class="grid grid-y">
|
| 4391 |
+
<path d="M 47.72 96.263411 L 840.20233 96.263411 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4392 |
</g>
|
| 4393 |
<g id="line2d_9">
|
| 4394 |
<g>
|
| 4395 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="96.263411" style="stroke: #000000; stroke-width: 0.8" />
|
| 4396 |
</g>
|
| 4397 |
</g>
|
| 4398 |
<g id="text_9">
|
| 4399 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="100.06263" transform="rotate(-0 40.72 100.06263)">3.0</text>
|
| 4400 |
</g>
|
| 4401 |
</g>
|
| 4402 |
<g id="label--y" class="ylabel">
|
|
|
|
| 4404 |
</g>
|
| 4405 |
</g>
|
| 4406 |
<g id="series--torch-layer-norm" class="series">
|
| 4407 |
+
<path d="M 83.741924 437.689571 L 323.888085 303.198519 L 564.034245 314.331547 L 804.180406 46.442361 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 4408 |
<defs>
|
| 4409 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4410 |
</defs>
|
| 4411 |
<g clip-path="url(#p2214f54723)">
|
| 4412 |
<use ns4:href="#md7efaf3aec" x="83.741924" y="437.689571" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4413 |
+
<use ns4:href="#md7efaf3aec" x="323.888085" y="303.198519" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4414 |
+
<use ns4:href="#md7efaf3aec" x="564.034245" y="314.331547" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4415 |
<use ns4:href="#md7efaf3aec" x="804.180406" y="46.442361" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4416 |
</g>
|
| 4417 |
</g>
|
| 4418 |
<g id="series--hf-kernels-layer-norm" class="series">
|
| 4419 |
+
<path d="M 83.741924 435.6307 L 323.888085 308.184835 L 564.034245 308.438456 L 804.180406 56.036284 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 4420 |
<defs>
|
| 4421 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4422 |
</defs>
|
| 4423 |
<g clip-path="url(#p2214f54723)">
|
| 4424 |
+
<use ns4:href="#m9b8c54d372" x="83.741924" y="435.6307" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4425 |
+
<use ns4:href="#m9b8c54d372" x="323.888085" y="308.184835" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4426 |
+
<use ns4:href="#m9b8c54d372" x="564.034245" y="308.438456" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4427 |
+
<use ns4:href="#m9b8c54d372" x="804.180406" y="56.036284" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4428 |
</g>
|
| 4429 |
</g>
|
| 4430 |
<g id="patch_3">
|
openai_moe/impls/artifacts/benchmark/openai_moe.jsonl
CHANGED
|
@@ -1,8 +1,8 @@
|
|
| 1 |
-
{"ts": "2025-12-
|
| 2 |
-
{"ts": "2025-12-
|
| 3 |
-
{"ts": "2025-12-
|
| 4 |
-
{"ts": "2025-12-
|
| 5 |
-
{"ts": "2025-12-
|
| 6 |
-
{"ts": "2025-12-
|
| 7 |
-
{"ts": "2025-12-
|
| 8 |
-
{"ts": "2025-12-
|
|
|
|
| 1 |
+
{"ts": "2025-12-19T23:01:31Z", "run": "7f201a1c2ed74237ad40459314918ba0", "impl": "gpt_oss_experts", "tags": {"family": "reference", "backend": "pytorch"}, "wl": {"name": "cuda_B1_S512_E2", "batch": 1, "seq_len": 512, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 2, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 2.642566999838891, "p50": 2.6590969998778746, "p90": 2.673486999810848, "mean": 2.659981199894901, "iqr": 0.023999999939405825, "raw_times": [2.6590969998778746, 2.675268000075448, 2.649486999871442, 2.642566999838891, 2.673486999810848], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 2.7064890000474406, "peak_bytes": 311252992, "ok": true, "absmax": 1.0818243026733398e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 1.0818243026733398e-05, "mae": 1.0733322142186807e-06, "mse": 1.9560496885423495e-12, "ref": "naive_moe"}, "err": null}
|
| 2 |
+
{"ts": "2025-12-19T23:01:32Z", "run": "7f201a1c2ed74237ad40459314918ba0", "impl": "gpt_oss_experts", "tags": {"family": "reference", "backend": "pytorch"}, "wl": {"name": "cuda_B1_S512_E4", "batch": 1, "seq_len": 512, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 4, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 3.936204999945403, "p50": 3.9538260000426817, "p90": 3.9835660002154327, "mean": 3.9606518000255164, "iqr": 0.039130000232034945, "raw_times": [3.936204999945403, 3.9538260000426817, 3.9835660002154327, 3.985225999940667, 3.944435999983398], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.9596259998688765, "peak_bytes": 632822272, "ok": true, "absmax": 7.82310962677002e-06, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 7.82310962677002e-06, "mae": 5.576844728238939e-07, "mse": 5.436189692842319e-13, "ref": "naive_moe"}, "err": null}
|
| 3 |
+
{"ts": "2025-12-19T23:01:33Z", "run": "7f201a1c2ed74237ad40459314918ba0", "impl": "gpt_oss_experts", "tags": {"family": "reference", "backend": "pytorch"}, "wl": {"name": "cuda_B1_S1024_E2", "batch": 1, "seq_len": 1024, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 2, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 3.817872999889005, "p50": 3.868872999873929, "p90": 3.9019339999413205, "mean": 3.8749997999275365, "iqr": 0.044331000026431866, "raw_times": [3.817872999889005, 3.8576029999148886, 3.9287160000185395, 3.9019339999413205, 3.868872999873929], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.836012999954619, "peak_bytes": 645417472, "ok": true, "absmax": 1.5497207641601562e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 1.5497207641601562e-05, "mae": 1.1454358173068613e-06, "mse": 2.2412421311207575e-12, "ref": "naive_moe"}, "err": null}
|
| 4 |
+
{"ts": "2025-12-19T23:01:34Z", "run": "7f201a1c2ed74237ad40459314918ba0", "impl": "gpt_oss_experts", "tags": {"family": "reference", "backend": "pytorch"}, "wl": {"name": "cuda_B1_S1024_E4", "batch": 1, "seq_len": 1024, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 4, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 5.3247949999786215, "p50": 5.3401449999910255, "p90": 5.39184700005535, "mean": 5.356893600037438, "iqr": 0.06286200004979037, "raw_times": [5.39184700005535, 5.3247949999786215, 5.3401449999910255, 5.328985000005559, 5.398696000156633], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 5.315443999961644, "peak_bytes": 657099264, "ok": true, "absmax": 6.556510925292969e-06, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 6.556510925292969e-06, "mae": 4.852234951613354e-07, "mse": 4.015021550906467e-13, "ref": "naive_moe"}, "err": null}
|
| 5 |
+
{"ts": "2025-12-19T23:01:36Z", "run": "7f201a1c2ed74237ad40459314918ba0", "impl": "gpt_oss_experts", "tags": {"family": "reference", "backend": "pytorch"}, "wl": {"name": "cuda_B4_S512_E2", "batch": 4, "seq_len": 512, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 2, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 6.797146999815595, "p50": 6.804686999885234, "p90": 6.806136000022889, "mean": 6.814822799969988, "iqr": 0.0027099999897473026, "raw_times": [6.862718000093082, 6.806136000022889, 6.797146999815595, 6.8034260000331415, 6.804686999885234], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 6.6412220000984235, "peak_bytes": 678357504, "ok": true, "absmax": 1.3589859008789062e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 1.3589859008789062e-05, "mae": 1.1745952406272409e-06, "mse": 2.316181968442521e-12, "ref": "naive_moe"}, "err": null}
|
| 6 |
+
{"ts": "2025-12-19T23:01:38Z", "run": "7f201a1c2ed74237ad40459314918ba0", "impl": "gpt_oss_experts", "tags": {"family": "reference", "backend": "pytorch"}, "wl": {"name": "cuda_B4_S512_E4", "batch": 4, "seq_len": 512, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 4, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 7.520542000065689, "p50": 7.530022999844732, "p90": 7.53409300000385, "mean": 7.531816999926377, "iqr": 0.0065400001858506585, "raw_times": [7.520542000065689, 7.527552999817999, 7.546873999899617, 7.530022999844732, 7.53409300000385], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 7.356247999950938, "peak_bytes": 701983232, "ok": true, "absmax": 8.58306884765625e-06, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 8.58306884765625e-06, "mae": 5.268635732136318e-07, "mse": 4.753664909623589e-13, "ref": "naive_moe"}, "err": null}
|
| 7 |
+
{"ts": "2025-12-19T23:01:42Z", "run": "7f201a1c2ed74237ad40459314918ba0", "impl": "gpt_oss_experts", "tags": {"family": "reference", "backend": "pytorch"}, "wl": {"name": "cuda_B4_S1024_E2", "batch": 4, "seq_len": 1024, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 2, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 13.378247999980886, "p50": 13.385679999828426, "p90": 13.397299999951429, "mean": 13.394303199947899, "iqr": 0.012501999890446314, "raw_times": [13.378247999980886, 13.384798000060982, 13.425489999917772, 13.385679999828426, 13.397299999951429], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 13.166785000066739, "peak_bytes": 1012207616, "ok": true, "absmax": 1.71661376953125e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 1.71661376953125e-05, "mae": 1.797086838450923e-06, "mse": 5.3811247992252564e-12, "ref": "naive_moe"}, "err": null}
|
| 8 |
+
{"ts": "2025-12-19T23:01:46Z", "run": "7f201a1c2ed74237ad40459314918ba0", "impl": "gpt_oss_experts", "tags": {"family": "reference", "backend": "pytorch"}, "wl": {"name": "cuda_B4_S1024_E4", "batch": 4, "seq_len": 1024, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 4, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 13.328448000038406, "p50": 13.40927800015379, "p90": 13.441681000131211, "mean": 13.402939000070546, "iqr": 0.0636730001133401, "raw_times": [13.457280000011451, 13.441681000131211, 13.40927800015379, 13.378008000017871, 13.328448000038406], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 13.036729999839736, "peak_bytes": 910968320, "ok": true, "absmax": 8.344650268554688e-06, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 8.344650268554688e-06, "mae": 5.471991357808292e-07, "mse": 5.06310813587485e-13, "ref": "naive_moe"}, "err": null}
|
openai_moe/impls/binned_torch.html
CHANGED
|
@@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 3888 |
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3889 |
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3890 |
</span> |
|
| 3891 |
-
Cell: nv | 0.
|
| 3892 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3893 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3894 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3904,7 +3904,7 @@ Cell: nv | 0.28s
|
|
| 3904 |
</div>
|
| 3905 |
</div>
|
| 3906 |
<div id="output-nv" class="cell-output">
|
| 3907 |
-
<div class="cell-stdout"><pre class="stdout-text">Fri Dec 19
|
| 3908 |
+-----------------------------------------------------------------------------------------+
|
| 3909 |
| NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 |
|
| 3910 |
+-----------------------------------------+------------------------+----------------------+
|
|
@@ -3913,7 +3913,7 @@ Cell: nv | 0.28s
|
|
| 3913 |
| | | MIG M. |
|
| 3914 |
|=========================================+========================+======================|
|
| 3915 |
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
|
| 3916 |
-
| N/A
|
| 3917 |
| | | N/A |
|
| 3918 |
+-----------------------------------------+------------------------+----------------------+
|
| 3919 |
|
|
@@ -3937,7 +3937,7 @@ Cell: nv | 0.28s
|
|
| 3937 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3938 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3939 |
</span> |
|
| 3940 |
-
Cell: benchmark |
|
| 3941 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3942 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3943 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -4095,29 +4095,29 @@ PROFILE TRACE: binned_torch | cuda_B1_S512_E2
|
|
| 4095 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4096 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4097 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4098 |
-
binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4099 |
-
binned_torch 24.
|
| 4100 |
-
aten::item 1.
|
| 4101 |
-
aten::_local_scalar_dense
|
| 4102 |
-
Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 15.
|
| 4103 |
-
|
| 4104 |
-
|
| 4105 |
-
|
| 4106 |
-
aten::copy_ 3.
|
| 4107 |
-
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.585ms
|
| 4108 |
-
aten::mul 3.
|
| 4109 |
-
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 4.
|
| 4110 |
-
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 4.
|
| 4111 |
-
aten::remainder 3.
|
| 4112 |
-
aten::add 2.
|
| 4113 |
-
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4114 |
-
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4115 |
-
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4116 |
-
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 1.
|
| 4117 |
-
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4118 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4119 |
-
Self CPU time total:
|
| 4120 |
-
Self CUDA time total: 50.
|
| 4121 |
|
| 4122 |
|
| 4123 |
|
|
@@ -4127,29 +4127,29 @@ PROFILE TRACE: binned_torch | cuda_B1_S512_E4
|
|
| 4127 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4128 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4129 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4130 |
-
binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4131 |
-
binned_torch 24.
|
| 4132 |
-
aten::item 1.
|
| 4133 |
-
aten::_local_scalar_dense 6.
|
| 4134 |
-
Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 17.
|
| 4135 |
-
aten::bmm 0.02%
|
| 4136 |
-
ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 7.
|
| 4137 |
-
aten::floor_divide
|
| 4138 |
-
aten::copy_ 3.
|
| 4139 |
-
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 4140 |
-
aten::add 3.
|
| 4141 |
-
aten::mul
|
| 4142 |
-
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 4.
|
| 4143 |
-
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 4.
|
| 4144 |
-
aten::remainder 2.
|
| 4145 |
-
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4146 |
-
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4147 |
-
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4148 |
-
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 1.
|
| 4149 |
-
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 1.
|
| 4150 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4151 |
-
Self CPU time total:
|
| 4152 |
-
Self CUDA time total:
|
| 4153 |
|
| 4154 |
|
| 4155 |
|
|
@@ -4159,29 +4159,29 @@ PROFILE TRACE: binned_torch | cuda_B1_S1024_E2
|
|
| 4159 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4160 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4161 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4162 |
-
binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 1.
|
| 4163 |
-
binned_torch 24.
|
| 4164 |
-
aten::item 1.
|
| 4165 |
-
aten::_local_scalar_dense
|
| 4166 |
-
Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 31.
|
| 4167 |
-
|
| 4168 |
-
|
| 4169 |
-
|
| 4170 |
-
aten::copy_ 3.
|
| 4171 |
-
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 13.
|
| 4172 |
-
aten::mul 3.
|
| 4173 |
-
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 9.
|
| 4174 |
-
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 8.
|
| 4175 |
-
aten::remainder 3.09%
|
| 4176 |
-
aten::add 2.
|
| 4177 |
-
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 7.
|
| 4178 |
-
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 4179 |
-
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4180 |
-
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4181 |
-
aten::clamp 0.00%
|
| 4182 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4183 |
-
Self CPU time total: 1.
|
| 4184 |
-
Self CUDA time total: 103.
|
| 4185 |
|
| 4186 |
|
| 4187 |
|
|
@@ -4191,29 +4191,29 @@ PROFILE TRACE: binned_torch | cuda_B1_S1024_E4
|
|
| 4191 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4192 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4193 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4194 |
-
binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 1.
|
| 4195 |
-
binned_torch 24.
|
| 4196 |
-
aten::item 1.
|
| 4197 |
-
aten::_local_scalar_dense 6.
|
| 4198 |
-
Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4199 |
-
|
| 4200 |
-
|
| 4201 |
-
|
| 4202 |
-
aten::copy_ 3.
|
| 4203 |
-
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 13.
|
| 4204 |
-
aten::mul
|
| 4205 |
-
aten::add
|
| 4206 |
-
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 9.572ms 8.
|
| 4207 |
-
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 8.
|
| 4208 |
-
aten::remainder 2.
|
| 4209 |
-
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 7.
|
| 4210 |
-
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 4211 |
-
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4212 |
-
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4213 |
-
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4214 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4215 |
-
Self CPU time total: 1.
|
| 4216 |
-
Self CUDA time total:
|
| 4217 |
|
| 4218 |
|
| 4219 |
|
|
@@ -4223,29 +4223,29 @@ PROFILE TRACE: binned_torch | cuda_B4_S512_E2
|
|
| 4223 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4224 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4225 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4226 |
-
binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4227 |
-
binned_torch
|
| 4228 |
-
Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 63.
|
| 4229 |
-
aten::item 1.
|
| 4230 |
-
aten::_local_scalar_dense 6.
|
| 4231 |
-
aten::floor_divide 5.
|
| 4232 |
-
aten::bmm 0.01% 232.
|
| 4233 |
-
ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 29.
|
| 4234 |
-
aten::copy_ 3.
|
| 4235 |
-
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 26.
|
| 4236 |
-
aten::mul 3.
|
| 4237 |
-
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.
|
| 4238 |
-
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.
|
| 4239 |
-
aten::add 2.81%
|
| 4240 |
-
aten::remainder 3.
|
| 4241 |
-
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 14.
|
| 4242 |
-
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4243 |
-
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4244 |
-
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 7.
|
| 4245 |
-
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 4246 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4247 |
-
Self CPU time total: 3.
|
| 4248 |
-
Self CUDA time total:
|
| 4249 |
|
| 4250 |
|
| 4251 |
|
|
@@ -4255,29 +4255,29 @@ PROFILE TRACE: binned_torch | cuda_B4_S512_E4
|
|
| 4255 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4256 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4257 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4258 |
-
binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4259 |
-
binned_torch
|
| 4260 |
-
aten::item 1.
|
| 4261 |
-
aten::_local_scalar_dense 6.
|
| 4262 |
-
Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 69.
|
| 4263 |
-
aten::floor_divide
|
| 4264 |
-
aten::bmm 0.01%
|
| 4265 |
-
ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 29.
|
| 4266 |
-
aten::copy_ 3.
|
| 4267 |
-
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 26.
|
| 4268 |
-
aten::mul
|
| 4269 |
-
aten::add
|
| 4270 |
-
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.
|
| 4271 |
-
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4272 |
-
aten::remainder 2.
|
| 4273 |
-
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4274 |
-
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.
|
| 4275 |
-
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4276 |
-
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4277 |
-
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 4278 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4279 |
-
Self CPU time total: 3.
|
| 4280 |
-
Self CUDA time total:
|
| 4281 |
|
| 4282 |
|
| 4283 |
|
|
@@ -4287,29 +4287,29 @@ PROFILE TRACE: binned_torch | cuda_B4_S1024_E2
|
|
| 4287 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4288 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4289 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4290 |
-
binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4291 |
-
binned_torch
|
| 4292 |
-
aten::item 1.
|
| 4293 |
-
aten::_local_scalar_dense
|
| 4294 |
-
Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 127.
|
| 4295 |
-
aten::floor_divide 5.
|
| 4296 |
-
|
| 4297 |
-
|
| 4298 |
-
aten::copy_ 3.
|
| 4299 |
-
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 53.
|
| 4300 |
-
aten::mul 3.
|
| 4301 |
-
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 44.
|
| 4302 |
-
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4303 |
-
aten::add 2.
|
| 4304 |
-
aten::remainder 3.
|
| 4305 |
-
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4306 |
-
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4307 |
-
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4308 |
-
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 14.
|
| 4309 |
-
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 5.
|
| 4310 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4311 |
-
Self CPU time total:
|
| 4312 |
-
Self CUDA time total:
|
| 4313 |
|
| 4314 |
|
| 4315 |
|
|
@@ -4319,45 +4319,45 @@ PROFILE TRACE: binned_torch | cuda_B4_S1024_E4
|
|
| 4319 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4320 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4321 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4322 |
-
binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 7.
|
| 4323 |
-
binned_torch
|
| 4324 |
-
aten::item 1.
|
| 4325 |
-
aten::_local_scalar_dense 6.
|
| 4326 |
-
Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4327 |
-
aten::floor_divide
|
| 4328 |
-
aten::bmm 0.00%
|
| 4329 |
-
ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4330 |
-
aten::copy_ 3.
|
| 4331 |
-
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 53.
|
| 4332 |
-
aten::mul 3.08%
|
| 4333 |
-
|
| 4334 |
-
|
| 4335 |
-
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4336 |
-
aten::remainder 2.
|
| 4337 |
-
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4338 |
-
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4339 |
-
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4340 |
-
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 14.
|
| 4341 |
-
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4342 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4343 |
-
Self CPU time total: 7.
|
| 4344 |
-
Self CUDA time total:
|
| 4345 |
|
| 4346 |
|
| 4347 |
impl wl p50(ms) ok
|
| 4348 |
-
binned_torch cuda_B1_S1024_E2
|
| 4349 |
-
binned_torch cuda_B1_S1024_E4
|
| 4350 |
-
binned_torch cuda_B1_S512_E2
|
| 4351 |
-
binned_torch cuda_B1_S512_E4
|
| 4352 |
-
binned_torch cuda_B4_S1024_E2
|
| 4353 |
-
binned_torch cuda_B4_S1024_E4
|
| 4354 |
-
binned_torch cuda_B4_S512_E2
|
| 4355 |
-
binned_torch cuda_B4_S512_E4
|
| 4356 |
</pre></div>
|
| 4357 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 4358 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4359 |
<div class="uv-logs-content" style="display: none;">
|
| 4360 |
-
Installed 37 packages in
|
| 4361 |
</div>
|
| 4362 |
</div>
|
| 4363 |
<div class="cell-artifacts">
|
|
|
|
| 3888 |
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3889 |
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3890 |
</span> |
|
| 3891 |
+
Cell: nv | 0.25s
|
| 3892 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3893 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3894 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3904 |
</div>
|
| 3905 |
</div>
|
| 3906 |
<div id="output-nv" class="cell-output">
|
| 3907 |
+
<div class="cell-stdout"><pre class="stdout-text">Fri Dec 19 23:00:37 2025
|
| 3908 |
+-----------------------------------------------------------------------------------------+
|
| 3909 |
| NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 |
|
| 3910 |
+-----------------------------------------+------------------------+----------------------+
|
|
|
|
| 3913 |
| | | MIG M. |
|
| 3914 |
|=========================================+========================+======================|
|
| 3915 |
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
|
| 3916 |
+
| N/A 40C P0 84W / 350W | 0MiB / 46068MiB | 60% Default |
|
| 3917 |
| | | N/A |
|
| 3918 |
+-----------------------------------------+------------------------+----------------------+
|
| 3919 |
|
|
|
|
| 3937 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3938 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3939 |
</span> |
|
| 3940 |
+
Cell: benchmark | 723.84s
|
| 3941 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3942 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3943 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 4095 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4096 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4097 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4098 |
+
binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 916.334ms 1818.27% 916.334ms 916.334ms 1
|
| 4099 |
+
binned_torch 24.63% 226.221ms 100.00% 918.346ms 918.346ms 0.000us 0.00% 50.398ms 50.398ms 1
|
| 4100 |
+
aten::item 1.84% 16.915ms 25.73% 236.247ms 15.396us 0.000us 0.00% 15.727ms 1.025us 15345
|
| 4101 |
+
aten::_local_scalar_dense 5.92% 54.373ms 23.88% 219.332ms 14.293us 15.726ms 31.20% 15.727ms 1.025us 15345
|
| 4102 |
+
Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 15.726ms 31.20% 15.726ms 1.025us 15345
|
| 4103 |
+
aten::bmm 0.02% 194.226us 0.03% 236.195us 39.366us 8.013ms 15.90% 8.013ms 1.336ms 6
|
| 4104 |
+
ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 8.013ms 15.90% 8.013ms 1.336ms 6
|
| 4105 |
+
aten::floor_divide 5.35% 49.157ms 13.15% 120.743ms 19.652us 7.547ms 14.98% 7.547ms 1.228us 6144
|
| 4106 |
+
aten::copy_ 3.75% 34.457ms 9.21% 84.535ms 13.732us 6.589ms 13.08% 6.592ms 1.071us 6156
|
| 4107 |
+
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.585ms 13.07% 6.585ms 1.070us 6153
|
| 4108 |
+
aten::mul 3.14% 28.847ms 5.63% 51.742ms 16.794us 4.707ms 9.34% 4.707ms 1.528us 3081
|
| 4109 |
+
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 4.479ms 8.89% 4.479ms 1.458us 3072
|
| 4110 |
+
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 4.026ms 7.99% 4.026ms 1.311us 3072
|
| 4111 |
+
aten::remainder 3.09% 28.363ms 4.76% 43.750ms 14.241us 3.702ms 7.35% 3.702ms 1.205us 3072
|
| 4112 |
+
aten::add 2.79% 25.584ms 4.81% 44.150ms 14.557us 3.631ms 7.20% 3.631ms 1.197us 3033
|
| 4113 |
+
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 3.522ms 6.99% 3.522ms 1.147us 3072
|
| 4114 |
+
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 3.235ms 6.42% 3.235ms 1.068us 3030
|
| 4115 |
+
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 1.954ms 3.88% 1.954ms 1.272us 1536
|
| 4116 |
+
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 1.749ms 3.47% 1.749ms 1.138us 1536
|
| 4117 |
+
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 287.138us 0.57% 287.138us 47.856us 6
|
| 4118 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4119 |
+
Self CPU time total: 918.353ms
|
| 4120 |
+
Self CUDA time total: 50.396ms
|
| 4121 |
|
| 4122 |
|
| 4123 |
|
|
|
|
| 4127 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4128 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4129 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4130 |
+
binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 930.604ms 1724.65% 930.604ms 930.604ms 1
|
| 4131 |
+
binned_torch 24.29% 226.115ms 100.00% 930.865ms 930.865ms 0.000us 0.00% 53.966ms 53.966ms 1
|
| 4132 |
+
aten::item 1.81% 16.815ms 27.55% 256.425ms 15.142us 0.000us 0.00% 17.838ms 1.053us 16935
|
| 4133 |
+
aten::_local_scalar_dense 6.14% 57.141ms 25.74% 239.611ms 14.149us 17.835ms 33.05% 17.838ms 1.053us 16935
|
| 4134 |
+
Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 17.835ms 33.05% 17.835ms 1.053us 16935
|
| 4135 |
+
aten::bmm 0.02% 175.424us 0.02% 217.325us 36.221us 7.967ms 14.77% 7.967ms 1.328ms 6
|
| 4136 |
+
ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 7.967ms 14.77% 7.967ms 1.328ms 6
|
| 4137 |
+
aten::floor_divide 5.05% 47.005ms 12.57% 117.000ms 19.043us 7.550ms 13.99% 7.551ms 1.229us 6144
|
| 4138 |
+
aten::copy_ 3.51% 32.640ms 8.36% 77.831ms 12.643us 6.635ms 12.30% 6.635ms 1.078us 6156
|
| 4139 |
+
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.632ms 12.29% 6.632ms 1.078us 6152
|
| 4140 |
+
aten::add 3.89% 36.256ms 6.95% 64.697ms 14.086us 5.059ms 9.38% 5.059ms 1.102us 4593
|
| 4141 |
+
aten::mul 2.92% 27.144ms 5.32% 49.502ms 16.067us 4.707ms 8.72% 4.707ms 1.528us 3081
|
| 4142 |
+
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 4.479ms 8.30% 4.479ms 1.458us 3072
|
| 4143 |
+
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 4.026ms 7.46% 4.026ms 1.310us 3072
|
| 4144 |
+
aten::remainder 2.81% 26.197ms 4.49% 41.800ms 13.607us 3.721ms 6.90% 3.721ms 1.211us 3072
|
| 4145 |
+
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 3.524ms 6.53% 3.524ms 1.147us 3072
|
| 4146 |
+
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 3.140ms 5.82% 3.140ms 1.036us 3030
|
| 4147 |
+
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 1.965ms 3.64% 1.965ms 1.279us 1536
|
| 4148 |
+
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 1.756ms 3.25% 1.756ms 1.143us 1536
|
| 4149 |
+
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 1.517ms 2.81% 1.517ms 0.972us 1560
|
| 4150 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4151 |
+
Self CPU time total: 930.874ms
|
| 4152 |
+
Self CUDA time total: 53.959ms
|
| 4153 |
|
| 4154 |
|
| 4155 |
|
|
|
|
| 4159 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4160 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4161 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4162 |
+
binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 1.706s 1653.15% 1.706s 1.706s 1
|
| 4163 |
+
binned_torch 24.03% 409.734ms 100.00% 1.705s 1.705s 0.000us 0.00% 103.183ms 103.183ms 1
|
| 4164 |
+
aten::item 1.59% 27.070ms 26.54% 452.490ms 14.829us 0.000us 0.00% 31.572ms 1.035us 30513
|
| 4165 |
+
aten::_local_scalar_dense 5.90% 100.602ms 24.95% 425.421ms 13.942us 31.568ms 30.60% 31.572ms 1.035us 30513
|
| 4166 |
+
Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 31.568ms 30.60% 31.568ms 1.035us 30513
|
| 4167 |
+
aten::bmm 0.01% 213.024us 0.02% 261.877us 43.646us 15.473ms 15.00% 15.473ms 2.579ms 6
|
| 4168 |
+
ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 15.473ms 15.00% 15.473ms 2.579ms 6
|
| 4169 |
+
aten::floor_divide 5.42% 92.355ms 13.36% 227.861ms 18.543us 15.078ms 14.61% 15.078ms 1.227us 12288
|
| 4170 |
+
aten::copy_ 3.96% 67.445ms 9.41% 160.444ms 13.044us 13.330ms 12.92% 13.330ms 1.084us 12300
|
| 4171 |
+
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 13.326ms 12.92% 13.326ms 1.084us 12294
|
| 4172 |
+
aten::mul 3.18% 54.204ms 5.76% 98.288ms 15.974us 11.263ms 10.92% 11.265ms 1.831us 6153
|
| 4173 |
+
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 9.919ms 9.61% 9.919ms 1.614us 6144
|
| 4174 |
+
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 8.044ms 7.80% 8.044ms 1.309us 6144
|
| 4175 |
+
aten::remainder 3.09% 52.622ms 4.84% 82.495ms 13.427us 7.409ms 7.18% 7.409ms 1.206us 6144
|
| 4176 |
+
aten::add 2.82% 48.063ms 4.95% 84.371ms 14.269us 7.380ms 7.15% 7.380ms 1.248us 5913
|
| 4177 |
+
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 7.034ms 6.82% 7.034ms 1.145us 6144
|
| 4178 |
+
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 6.098ms 5.91% 6.098ms 1.032us 5910
|
| 4179 |
+
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 3.912ms 3.79% 3.912ms 1.273us 3072
|
| 4180 |
+
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 3.498ms 3.39% 3.498ms 1.139us 3072
|
| 4181 |
+
aten::clamp 0.00% 70.381us 0.01% 115.343us 19.224us 1.182ms 1.15% 1.182ms 197.026us 6
|
| 4182 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4183 |
+
Self CPU time total: 1.705s
|
| 4184 |
+
Self CUDA time total: 103.179ms
|
| 4185 |
|
| 4186 |
|
| 4187 |
|
|
|
|
| 4191 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4192 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4193 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4194 |
+
binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 1.835s 1676.06% 1.835s 1.835s 1
|
| 4195 |
+
binned_torch 24.11% 442.690ms 100.00% 1.836s 1.836s 0.000us 0.00% 109.503ms 109.503ms 1
|
| 4196 |
+
aten::item 1.62% 29.702ms 27.50% 504.982ms 14.972us 0.000us 0.00% 35.015ms 1.038us 33729
|
| 4197 |
+
aten::_local_scalar_dense 6.21% 114.112ms 25.88% 475.279ms 14.091us 35.012ms 31.97% 35.015ms 1.038us 33729
|
| 4198 |
+
Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 35.012ms 31.97% 35.012ms 1.038us 33728
|
| 4199 |
+
aten::bmm 0.01% 232.655us 0.02% 282.685us 47.114us 15.567ms 14.22% 15.567ms 2.595ms 6
|
| 4200 |
+
ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 15.567ms 14.22% 15.567ms 2.595ms 6
|
| 4201 |
+
aten::floor_divide 5.11% 93.914ms 12.52% 229.926ms 18.711us 15.067ms 13.76% 15.067ms 1.226us 12288
|
| 4202 |
+
aten::copy_ 3.50% 64.191ms 8.58% 157.627ms 12.815us 13.353ms 12.19% 13.355ms 1.086us 12300
|
| 4203 |
+
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 13.350ms 12.19% 13.350ms 1.086us 12294
|
| 4204 |
+
aten::mul 2.97% 54.553ms 5.34% 97.962ms 15.921us 10.925ms 9.98% 10.925ms 1.776us 6153
|
| 4205 |
+
aten::add 3.96% 72.764ms 6.93% 127.157ms 13.975us 10.457ms 9.55% 10.457ms 1.149us 9099
|
| 4206 |
+
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 9.572ms 8.74% 9.572ms 1.558us 6144
|
| 4207 |
+
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 8.046ms 7.35% 8.046ms 1.310us 6144
|
| 4208 |
+
aten::remainder 2.95% 54.099ms 4.66% 85.633ms 13.938us 7.422ms 6.78% 7.422ms 1.208us 6144
|
| 4209 |
+
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 7.021ms 6.41% 7.021ms 1.143us 6144
|
| 4210 |
+
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 6.106ms 5.58% 6.106ms 1.033us 5910
|
| 4211 |
+
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 3.920ms 3.58% 3.920ms 1.276us 3072
|
| 4212 |
+
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 3.502ms 3.20% 3.502ms 1.140us 3072
|
| 4213 |
+
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 3.094ms 2.83% 3.094ms 0.971us 3186
|
| 4214 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4215 |
+
Self CPU time total: 1.836s
|
| 4216 |
+
Self CUDA time total: 109.497ms
|
| 4217 |
|
| 4218 |
|
| 4219 |
|
|
|
|
| 4223 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4224 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4225 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4226 |
+
binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 3.483s 1652.23% 3.483s 3.483s 1
|
| 4227 |
+
binned_torch 24.18% 842.026ms 100.00% 3.482s 3.482s 0.000us 0.00% 210.838ms 210.838ms 1
|
| 4228 |
+
Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 63.561ms 30.15% 63.561ms 1.032us 61586
|
| 4229 |
+
aten::item 1.74% 60.466ms 26.96% 938.865ms 15.245us 0.000us 0.00% 63.559ms 1.032us 61587
|
| 4230 |
+
aten::_local_scalar_dense 6.04% 210.488ms 25.22% 878.295ms 14.261us 63.559ms 30.15% 63.559ms 1.032us 61587
|
| 4231 |
+
aten::floor_divide 5.38% 187.378ms 13.29% 462.870ms 18.834us 30.531ms 14.48% 30.538ms 1.243us 24576
|
| 4232 |
+
aten::bmm 0.01% 232.923us 0.01% 283.154us 47.192us 29.267ms 13.88% 29.267ms 4.878ms 6
|
| 4233 |
+
ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 29.267ms 13.88% 29.267ms 4.878ms 6
|
| 4234 |
+
aten::copy_ 3.71% 129.087ms 8.89% 309.556ms 12.590us 26.727ms 12.68% 26.728ms 1.087us 24588
|
| 4235 |
+
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 26.725ms 12.68% 26.725ms 1.087us 24582
|
| 4236 |
+
aten::mul 3.12% 108.737ms 5.69% 198.327ms 16.128us 25.576ms 12.13% 25.578ms 2.080us 12297
|
| 4237 |
+
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.134ms 10.50% 22.134ms 1.801us 12288
|
| 4238 |
+
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.473ms 7.81% 16.473ms 1.341us 12288
|
| 4239 |
+
aten::add 2.81% 97.833ms 4.96% 172.866ms 13.928us 16.092ms 7.63% 16.093ms 1.297us 12411
|
| 4240 |
+
aten::remainder 3.07% 106.957ms 4.82% 167.982ms 13.670us 14.887ms 7.06% 14.889ms 1.212us 12288
|
| 4241 |
+
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 14.058ms 6.67% 14.058ms 1.144us 12288
|
| 4242 |
+
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 12.970ms 6.15% 12.970ms 1.045us 12408
|
| 4243 |
+
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 7.857ms 3.73% 7.857ms 1.279us 6144
|
| 4244 |
+
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 7.030ms 3.33% 7.030ms 1.144us 6144
|
| 4245 |
+
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.605ms 1.24% 2.605ms 434.242us 6
|
| 4246 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4247 |
+
Self CPU time total: 3.483s
|
| 4248 |
+
Self CUDA time total: 210.821ms
|
| 4249 |
|
| 4250 |
|
| 4251 |
|
|
|
|
| 4255 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4256 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4257 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4258 |
+
binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 3.725s 1668.35% 3.725s 3.725s 1
|
| 4259 |
+
binned_torch 24.05% 896.242ms 100.00% 3.727s 3.727s 0.000us 0.00% 223.307ms 223.307ms 1
|
| 4260 |
+
aten::item 1.73% 64.547ms 27.53% 1.026s 15.123us 0.000us 0.00% 69.633ms 1.026us 67845
|
| 4261 |
+
aten::_local_scalar_dense 6.19% 230.534ms 25.80% 961.495ms 14.172us 69.631ms 31.18% 69.633ms 1.026us 67845
|
| 4262 |
+
Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 69.632ms 31.18% 69.632ms 1.026us 67841
|
| 4263 |
+
aten::floor_divide 5.09% 189.838ms 12.50% 465.764ms 18.952us 30.442ms 13.63% 30.448ms 1.239us 24576
|
| 4264 |
+
aten::bmm 0.01% 247.707us 0.01% 294.697us 49.116us 29.554ms 13.24% 29.554ms 4.926ms 6
|
| 4265 |
+
ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 29.554ms 13.24% 29.554ms 4.926ms 6
|
| 4266 |
+
aten::copy_ 3.50% 130.326ms 8.36% 311.636ms 12.674us 26.718ms 11.97% 26.719ms 1.087us 24588
|
| 4267 |
+
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 26.715ms 11.96% 26.715ms 1.087us 24581
|
| 4268 |
+
aten::mul 2.92% 108.800ms 5.34% 198.878ms 16.173us 25.547ms 11.44% 25.547ms 2.077us 12297
|
| 4269 |
+
aten::add 3.96% 147.436ms 7.04% 262.447ms 14.081us 22.490ms 10.07% 22.492ms 1.207us 18639
|
| 4270 |
+
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.115ms 9.90% 22.115ms 1.800us 12288
|
| 4271 |
+
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.451ms 7.37% 16.451ms 1.339us 12287
|
| 4272 |
+
aten::remainder 2.81% 104.739ms 4.44% 165.425ms 13.462us 14.805ms 6.63% 14.806ms 1.205us 12288
|
| 4273 |
+
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 13.992ms 6.27% 13.992ms 1.139us 12287
|
| 4274 |
+
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.166ms 5.90% 13.166ms 1.061us 12407
|
| 4275 |
+
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 7.819ms 3.50% 7.819ms 1.273us 6144
|
| 4276 |
+
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.986ms 3.13% 6.986ms 1.137us 6144
|
| 4277 |
+
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.214ms 2.78% 6.214ms 0.998us 6228
|
| 4278 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4279 |
+
Self CPU time total: 3.727s
|
| 4280 |
+
Self CUDA time total: 223.293ms
|
| 4281 |
|
| 4282 |
|
| 4283 |
|
|
|
|
| 4287 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4288 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4289 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4290 |
+
binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 6.919s 1639.48% 6.919s 6.919s 1
|
| 4291 |
+
binned_torch 24.46% 1.695s 100.00% 6.929s 6.929s 0.000us 0.00% 422.036ms 422.036ms 1
|
| 4292 |
+
aten::item 1.67% 115.500ms 26.73% 1.852s 15.089us 0.000us 0.00% 127.102ms 1.035us 122763
|
| 4293 |
+
aten::_local_scalar_dense 5.94% 411.594ms 25.07% 1.737s 14.148us 127.094ms 30.12% 127.102ms 1.035us 122763
|
| 4294 |
+
Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 127.096ms 30.12% 127.096ms 1.035us 122762
|
| 4295 |
+
aten::floor_divide 5.38% 373.026ms 13.30% 921.425ms 18.746us 61.339ms 14.53% 61.343ms 1.248us 49152
|
| 4296 |
+
aten::bmm 0.00% 231.234us 0.00% 280.225us 46.704us 57.287ms 13.57% 57.287ms 9.548ms 6
|
| 4297 |
+
ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 57.287ms 13.57% 57.287ms 9.548ms 6
|
| 4298 |
+
aten::copy_ 3.72% 257.654ms 8.91% 617.063ms 12.553us 53.696ms 12.72% 53.697ms 1.092us 49158
|
| 4299 |
+
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 53.694ms 12.72% 53.694ms 1.092us 49154
|
| 4300 |
+
aten::mul 3.13% 217.096ms 5.68% 393.622ms 16.011us 51.639ms 12.24% 51.644ms 2.101us 24585
|
| 4301 |
+
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 44.676ms 10.59% 44.676ms 1.818us 24576
|
| 4302 |
+
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 33.163ms 7.86% 33.163ms 1.349us 24576
|
| 4303 |
+
aten::add 2.81% 194.866ms 4.91% 340.544ms 13.937us 32.585ms 7.72% 32.588ms 1.334us 24435
|
| 4304 |
+
aten::remainder 3.09% 213.993ms 4.85% 335.801ms 13.664us 29.914ms 7.09% 29.918ms 1.217us 24576
|
| 4305 |
+
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 28.177ms 6.68% 28.177ms 1.147us 24576
|
| 4306 |
+
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 25.921ms 6.14% 25.921ms 1.061us 24431
|
| 4307 |
+
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 15.786ms 3.74% 15.786ms 1.285us 12288
|
| 4308 |
+
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 14.129ms 3.35% 14.129ms 1.150us 12288
|
| 4309 |
+
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 5.239ms 1.24% 5.239ms 873.180us 6
|
| 4310 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4311 |
+
Self CPU time total: 6.929s
|
| 4312 |
+
Self CUDA time total: 422.014ms
|
| 4313 |
|
| 4314 |
|
| 4315 |
|
|
|
|
| 4319 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4320 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4321 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4322 |
+
binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 7.526s 1690.98% 7.526s 7.526s 1
|
| 4323 |
+
binned_torch 24.06% 1.811s 100.00% 7.528s 7.528s 0.000us 0.00% 445.109ms 445.109ms 1
|
| 4324 |
+
aten::item 1.62% 121.583ms 26.84% 2.020s 14.998us 0.000us 0.00% 138.816ms 1.030us 134715
|
| 4325 |
+
aten::_local_scalar_dense 6.12% 460.388ms 25.22% 1.899s 14.095us 138.805ms 31.19% 138.816ms 1.030us 134715
|
| 4326 |
+
Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 138.805ms 31.19% 138.805ms 1.030us 134707
|
| 4327 |
+
aten::floor_divide 5.25% 395.063ms 12.72% 957.555ms 19.482us 61.331ms 13.78% 61.336ms 1.248us 49152
|
| 4328 |
+
aten::bmm 0.00% 238.536us 0.00% 289.618us 48.270us 57.304ms 12.88% 57.304ms 9.551ms 6
|
| 4329 |
+
ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 57.304ms 12.88% 57.304ms 9.551ms 6
|
| 4330 |
+
aten::copy_ 3.62% 272.274ms 8.61% 648.516ms 13.192us 53.873ms 12.10% 53.876ms 1.096us 49158
|
| 4331 |
+
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 53.870ms 12.10% 53.870ms 1.096us 49149
|
| 4332 |
+
aten::mul 3.08% 231.551ms 5.44% 409.269ms 16.647us 51.546ms 11.58% 51.551ms 2.097us 24585
|
| 4333 |
+
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 44.593ms 10.02% 44.593ms 1.814us 24576
|
| 4334 |
+
aten::add 4.08% 306.812ms 7.05% 530.578ms 14.594us 43.966ms 9.88% 43.969ms 1.209us 36357
|
| 4335 |
+
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 33.107ms 7.44% 33.107ms 1.347us 24573
|
| 4336 |
+
aten::remainder 2.97% 223.921ms 4.70% 353.632ms 14.389us 29.770ms 6.69% 29.775ms 1.211us 24577
|
| 4337 |
+
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 28.225ms 6.34% 28.225ms 1.149us 24573
|
| 4338 |
+
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 25.583ms 5.75% 25.583ms 1.047us 24431
|
| 4339 |
+
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 15.722ms 3.53% 15.722ms 1.279us 12288
|
| 4340 |
+
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 14.047ms 3.16% 14.047ms 1.143us 12288
|
| 4341 |
+
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 11.757ms 2.64% 11.757ms 0.986us 11922
|
| 4342 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4343 |
+
Self CPU time total: 7.528s
|
| 4344 |
+
Self CUDA time total: 445.070ms
|
| 4345 |
|
| 4346 |
|
| 4347 |
impl wl p50(ms) ok
|
| 4348 |
+
binned_torch cuda_B1_S1024_E2 367.98 True
|
| 4349 |
+
binned_torch cuda_B1_S1024_E4 396.30 True
|
| 4350 |
+
binned_torch cuda_B1_S512_E2 154.35 True
|
| 4351 |
+
binned_torch cuda_B1_S512_E4 195.55 True
|
| 4352 |
+
binned_torch cuda_B4_S1024_E2 1510.09 True
|
| 4353 |
+
binned_torch cuda_B4_S1024_E4 1618.05 True
|
| 4354 |
+
binned_torch cuda_B4_S512_E2 733.47 True
|
| 4355 |
+
binned_torch cuda_B4_S512_E4 787.61 True
|
| 4356 |
</pre></div>
|
| 4357 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 4358 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4359 |
<div class="uv-logs-content" style="display: none;">
|
| 4360 |
+
Installed 37 packages in 322ms
|
| 4361 |
</div>
|
| 4362 |
</div>
|
| 4363 |
<div class="cell-artifacts">
|
openai_moe/impls/gpt_oss_moe.html
CHANGED
|
@@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 3888 |
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3889 |
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3890 |
</span> |
|
| 3891 |
-
Cell: nv | 0.
|
| 3892 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3893 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3894 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3905,7 +3905,7 @@ Cell: nv | 0.28s
|
|
| 3905 |
</div>
|
| 3906 |
</div>
|
| 3907 |
<div id="output-nv" class="cell-output">
|
| 3908 |
-
<div class="cell-stdout"><pre class="stdout-text">Fri Dec 19
|
| 3909 |
+-----------------------------------------------------------------------------------------+
|
| 3910 |
| NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 |
|
| 3911 |
+-----------------------------------------+------------------------+----------------------+
|
|
@@ -3914,7 +3914,7 @@ Cell: nv | 0.28s
|
|
| 3914 |
| | | MIG M. |
|
| 3915 |
|=========================================+========================+======================|
|
| 3916 |
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
|
| 3917 |
-
| N/A
|
| 3918 |
| | | N/A |
|
| 3919 |
+-----------------------------------------+------------------------+----------------------+
|
| 3920 |
|
|
@@ -3938,7 +3938,7 @@ Cell: nv | 0.28s
|
|
| 3938 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3939 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3940 |
</span> |
|
| 3941 |
-
Cell: benchmark | 21.
|
| 3942 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3943 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3944 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -4042,29 +4042,29 @@ PROFILE TRACE: gpt_oss_experts | cuda_B1_S512_E2
|
|
| 4042 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4043 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4044 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4045 |
-
gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 10.
|
| 4046 |
-
gpt_oss_experts 16.
|
| 4047 |
-
aten::matmul 0.
|
| 4048 |
-
aten::mm 2.
|
| 4049 |
-
ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4050 |
-
void cutlass::Kernel2<cutlass_80_simt_sgemm_128x64_8... 0.00% 0.000us 0.00% 0.000us 0.000us 1.
|
| 4051 |
-
aten::mul 1.
|
| 4052 |
-
aten::add 1.
|
| 4053 |
-
aten::index 1.
|
| 4054 |
-
|
| 4055 |
-
|
| 4056 |
-
void at::native::
|
| 4057 |
-
aten::nonzero 2.
|
| 4058 |
-
aten::clamp 0.
|
| 4059 |
-
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 63.
|
| 4060 |
-
|
| 4061 |
-
|
| 4062 |
-
|
| 4063 |
-
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 56.
|
| 4064 |
-
Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 50.
|
| 4065 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4066 |
-
Self CPU time total: 12.
|
| 4067 |
-
Self CUDA time total: 5.
|
| 4068 |
|
| 4069 |
|
| 4070 |
|
|
@@ -4074,29 +4074,29 @@ PROFILE TRACE: gpt_oss_experts | cuda_B1_S512_E4
|
|
| 4074 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4075 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4076 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4077 |
-
gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 14.
|
| 4078 |
-
gpt_oss_experts 16.
|
| 4079 |
-
aten::matmul 0.
|
| 4080 |
-
aten::mm 2.
|
| 4081 |
-
ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 5.
|
| 4082 |
-
aten::nonzero 2.
|
| 4083 |
-
aten::mul 1.
|
| 4084 |
-
aten::add 2.
|
| 4085 |
-
aten::where 0.07%
|
| 4086 |
-
aten::nonzero_numpy 0.
|
| 4087 |
-
aten::index 2.
|
| 4088 |
-
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 101.
|
| 4089 |
-
Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 91.
|
| 4090 |
-
aten::clamp 1.
|
| 4091 |
-
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4092 |
-
aten::item 0.
|
| 4093 |
-
aten::_local_scalar_dense 1.
|
| 4094 |
-
aten::index_add_ 0.
|
| 4095 |
-
void at::native::indexFuncLargeIndex<float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4096 |
-
void at::native::vectorized_gather_kernel<16, long>(... 0.00% 0.000us 0.00% 0.000us 0.000us 66.
|
| 4097 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4098 |
-
Self CPU time total: 16.
|
| 4099 |
-
Self CUDA time total: 6.
|
| 4100 |
|
| 4101 |
|
| 4102 |
|
|
@@ -4106,29 +4106,29 @@ PROFILE TRACE: gpt_oss_experts | cuda_B1_S1024_E2
|
|
| 4106 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4107 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4108 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4109 |
-
gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 12.
|
| 4110 |
-
gpt_oss_experts 13.
|
| 4111 |
-
aten::matmul 0.18% 23.
|
| 4112 |
-
aten::mm 1.99%
|
| 4113 |
-
void cutlass::Kernel2<cutlass_80_simt_sgemm_256x128_... 0.00% 0.000us 0.00% 0.000us 0.000us 4.
|
| 4114 |
-
ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 1.
|
| 4115 |
-
void cutlass::Kernel2<cutlass_80_simt_sgemm_128x64_8... 0.00% 0.000us 0.00% 0.000us 0.000us 1.
|
| 4116 |
-
aten::mul 1.
|
| 4117 |
-
aten::add 1.
|
| 4118 |
-
aten::index_add_ 0.
|
| 4119 |
-
void at::native::indexFuncLargeIndex<float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4120 |
-
|
| 4121 |
-
|
| 4122 |
-
void at::native::vectorized_gather_kernel<16, long>(... 0.00% 0.000us 0.00% 0.000us 0.000us 116.
|
| 4123 |
-
aten::clamp 0.82% 108.
|
| 4124 |
-
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4125 |
-
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4126 |
-
aten::nonzero 1.
|
| 4127 |
-
aten::where 0.04% 5.
|
| 4128 |
-
aten::nonzero_numpy 0.08% 11.
|
| 4129 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4130 |
-
Self CPU time total: 13.
|
| 4131 |
-
Self CUDA time total: 8.
|
| 4132 |
|
| 4133 |
|
| 4134 |
|
|
@@ -4138,29 +4138,29 @@ PROFILE TRACE: gpt_oss_experts | cuda_B1_S1024_E4
|
|
| 4138 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4139 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4140 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4141 |
-
gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 18.
|
| 4142 |
-
gpt_oss_experts 12.
|
| 4143 |
-
aten::matmul 0.
|
| 4144 |
-
aten::mm 2.
|
| 4145 |
-
ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 4146 |
-
void cutlass::Kernel2<cutlass_80_simt_sgemm_128x64_8... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 4147 |
-
aten::mul 1.
|
| 4148 |
-
aten::add 1.
|
| 4149 |
-
aten::index 1.
|
| 4150 |
-
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4151 |
-
aten::index_add_ 0.
|
| 4152 |
-
void at::native::indexFuncLargeIndex<float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4153 |
-
aten::nonzero 1.
|
| 4154 |
-
void at::native::vectorized_gather_kernel<16, long>(... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4155 |
-
aten::
|
| 4156 |
-
|
| 4157 |
-
aten::
|
| 4158 |
-
|
| 4159 |
-
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4160 |
-
Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 108.
|
| 4161 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4162 |
-
Self CPU time total: 20.
|
| 4163 |
-
Self CUDA time total: 10.
|
| 4164 |
|
| 4165 |
|
| 4166 |
|
|
@@ -4170,29 +4170,29 @@ PROFILE TRACE: gpt_oss_experts | cuda_B4_S512_E2
|
|
| 4170 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4171 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4172 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4173 |
-
gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4174 |
-
gpt_oss_experts 7.
|
| 4175 |
-
aten::matmul 0.10%
|
| 4176 |
-
aten::mm 1.
|
| 4177 |
-
void cutlass::Kernel2<cutlass_80_simt_sgemm_256x128_... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4178 |
-
ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 5.
|
| 4179 |
-
aten::add 0.
|
| 4180 |
-
aten::mul 0.68%
|
| 4181 |
-
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4182 |
-
aten::index_add_ 0.
|
| 4183 |
-
void at::native::indexFuncLargeIndex<float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4184 |
-
aten::clamp 0.
|
| 4185 |
-
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4186 |
-
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4187 |
-
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4188 |
-
aten::index 0.79% 185.
|
| 4189 |
-
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4190 |
-
void at::native::vectorized_gather_kernel<16, long>(... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4191 |
-
aten::sigmoid 0.
|
| 4192 |
-
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4193 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4194 |
-
Self CPU time total: 23.
|
| 4195 |
-
Self CUDA time total: 17.
|
| 4196 |
|
| 4197 |
|
| 4198 |
|
|
@@ -4202,29 +4202,29 @@ PROFILE TRACE: gpt_oss_experts | cuda_B4_S512_E4
|
|
| 4202 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4203 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4204 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4205 |
-
gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 24.
|
| 4206 |
-
gpt_oss_experts 10.
|
| 4207 |
-
aten::matmul 0.17%
|
| 4208 |
-
aten::mm
|
| 4209 |
-
void cutlass::Kernel2<cutlass_80_simt_sgemm_256x128_... 0.00% 0.000us 0.00% 0.000us 0.000us 9.
|
| 4210 |
-
ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4211 |
-
void cutlass::Kernel2<cutlass_80_simt_sgemm_128x64_8... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 4212 |
-
aten::add 1.
|
| 4213 |
-
aten::mul 1.
|
| 4214 |
-
aten::index_add_ 0.
|
| 4215 |
-
void at::native::indexFuncLargeIndex<float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4216 |
-
|
| 4217 |
-
|
| 4218 |
-
void at::native::vectorized_gather_kernel<16, long>(... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4219 |
-
aten::clamp 0.81%
|
| 4220 |
-
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4221 |
-
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4222 |
-
aten::nonzero 1.
|
| 4223 |
-
aten::where 0.04%
|
| 4224 |
-
aten::nonzero_numpy 0.08%
|
| 4225 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4226 |
-
Self CPU time total: 27.
|
| 4227 |
-
Self CUDA time total: 17.
|
| 4228 |
|
| 4229 |
|
| 4230 |
|
|
@@ -4234,29 +4234,29 @@ PROFILE TRACE: gpt_oss_experts | cuda_B4_S1024_E2
|
|
| 4234 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4235 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4236 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4237 |
-
gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 40.
|
| 4238 |
-
gpt_oss_experts 4.
|
| 4239 |
-
aten::matmul 0.05% 22.
|
| 4240 |
-
aten::mm 0.
|
| 4241 |
-
void cutlass::Kernel2<cutlass_80_simt_sgemm_256x128_... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4242 |
-
aten::mul 0.
|
| 4243 |
-
aten::add 0.45% 194.
|
| 4244 |
-
aten::clamp 0.
|
| 4245 |
-
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 4246 |
-
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 1.
|
| 4247 |
-
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 1.
|
| 4248 |
-
aten::index_add_ 0.
|
| 4249 |
-
void at::native::indexFuncLargeIndex<float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4250 |
-
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4251 |
-
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4252 |
-
aten::index 0.
|
| 4253 |
-
void at::native::vectorized_gather_kernel<16, long>(... 0.00% 0.000us 0.00% 0.000us 0.000us 672.
|
| 4254 |
-
aten::sigmoid 0.
|
| 4255 |
-
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4256 |
-
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4257 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4258 |
-
Self CPU time total: 42.
|
| 4259 |
-
Self CUDA time total:
|
| 4260 |
|
| 4261 |
|
| 4262 |
|
|
@@ -4266,40 +4266,40 @@ PROFILE TRACE: gpt_oss_experts | cuda_B4_S1024_E4
|
|
| 4266 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4267 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4268 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4269 |
-
gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4270 |
-
gpt_oss_experts 6.
|
| 4271 |
-
aten::matmul 0.11%
|
| 4272 |
-
aten::mm 1.
|
| 4273 |
-
void cutlass::Kernel2<cutlass_80_simt_sgemm_256x128_... 0.00% 0.000us 0.00% 0.000us 0.000us 20.
|
| 4274 |
-
ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 8.
|
| 4275 |
-
aten::add 0.
|
| 4276 |
-
aten::mul 0.
|
| 4277 |
-
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4278 |
-
aten::index_add_ 0.
|
| 4279 |
-
void at::native::indexFuncLargeIndex<float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4280 |
-
aten::clamp 0.
|
| 4281 |
-
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 772.
|
| 4282 |
-
aten::index 0.
|
| 4283 |
-
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4284 |
-
void at::native::vectorized_gather_kernel<16, long>(... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4285 |
-
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 552.
|
| 4286 |
-
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4287 |
-
aten::sigmoid 0.
|
| 4288 |
-
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4289 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4290 |
-
Self CPU time total: 43.
|
| 4291 |
-
Self CUDA time total:
|
| 4292 |
|
| 4293 |
|
| 4294 |
impl wl p50(ms) ok
|
| 4295 |
-
gpt_oss_experts cuda_B1_S1024_E2 3.
|
| 4296 |
-
gpt_oss_experts cuda_B1_S1024_E4 5.
|
| 4297 |
-
gpt_oss_experts cuda_B1_S512_E2 2.
|
| 4298 |
-
gpt_oss_experts cuda_B1_S512_E4 3.
|
| 4299 |
-
gpt_oss_experts cuda_B4_S1024_E2 13.
|
| 4300 |
-
gpt_oss_experts cuda_B4_S1024_E4 13.
|
| 4301 |
-
gpt_oss_experts cuda_B4_S512_E2 6.
|
| 4302 |
-
gpt_oss_experts cuda_B4_S512_E4 7.
|
| 4303 |
</pre></div>
|
| 4304 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 4305 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
|
@@ -4308,12 +4308,12 @@ gpt_oss_experts cuda_B4_S512_E4 7.52 True
|
|
| 4308 |
Updated https://github.com/huggingface/kernels.git (55b7c980e96bf5f747f0e4136be61c0b089ab76c)
|
| 4309 |
Building kernels @ git+https://github.com/huggingface/kernels.git@55b7c980e96bf5f747f0e4136be61c0b089ab76c
|
| 4310 |
Built kernels @ git+https://github.com/huggingface/kernels.git@55b7c980e96bf5f747f0e4136be61c0b089ab76c
|
| 4311 |
-
Installed 14 packages in
|
| 4312 |
</div>
|
| 4313 |
</div>
|
| 4314 |
<div class="cell-stderr">Fetching 6 files: 0%| | 0/6 [00:00<?, ?it/s]
|
| 4315 |
-
Fetching 6 files: 50%|█████ | 3/6 [00:00<00:00,
|
| 4316 |
-
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00,
|
| 4317 |
<div class="cell-artifacts">
|
| 4318 |
<h4>Artifacts:</h4>
|
| 4319 |
<a href="artifacts/benchmark/openai_moe.jsonl" class="artifact" target="_blank">openai_moe.jsonl</a>
|
|
|
|
| 3888 |
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3889 |
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3890 |
</span> |
|
| 3891 |
+
Cell: nv | 0.25s
|
| 3892 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3893 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3894 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3905 |
</div>
|
| 3906 |
</div>
|
| 3907 |
<div id="output-nv" class="cell-output">
|
| 3908 |
+
<div class="cell-stdout"><pre class="stdout-text">Fri Dec 19 23:00:37 2025
|
| 3909 |
+-----------------------------------------------------------------------------------------+
|
| 3910 |
| NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 |
|
| 3911 |
+-----------------------------------------+------------------------+----------------------+
|
|
|
|
| 3914 |
| | | MIG M. |
|
| 3915 |
|=========================================+========================+======================|
|
| 3916 |
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
|
| 3917 |
+
| N/A 40C P0 84W / 350W | 0MiB / 46068MiB | 60% Default |
|
| 3918 |
| | | N/A |
|
| 3919 |
+-----------------------------------------+------------------------+----------------------+
|
| 3920 |
|
|
|
|
| 3938 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3939 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3940 |
</span> |
|
| 3941 |
+
Cell: benchmark | 21.54s
|
| 3942 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3943 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3944 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 4042 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4043 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4044 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4045 |
+
gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 10.236ms 195.72% 10.236ms 10.236ms 1
|
| 4046 |
+
gpt_oss_experts 16.81% 2.119ms 99.94% 12.602ms 12.602ms 0.000us 0.00% 5.233ms 5.233ms 1
|
| 4047 |
+
aten::matmul 0.21% 26.351us 3.80% 479.051us 39.921us 0.000us 0.00% 4.609ms 384.095us 12
|
| 4048 |
+
aten::mm 2.34% 295.677us 3.59% 452.700us 37.725us 4.609ms 88.13% 4.609ms 384.095us 12
|
| 4049 |
+
ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 3.132ms 59.89% 3.132ms 348.055us 9
|
| 4050 |
+
void cutlass::Kernel2<cutlass_80_simt_sgemm_128x64_8... 0.00% 0.000us 0.00% 0.000us 0.000us 1.470ms 28.11% 1.470ms 490.007us 3
|
| 4051 |
+
aten::mul 1.25% 158.075us 2.09% 263.508us 10.979us 109.535us 2.09% 109.535us 4.564us 24
|
| 4052 |
+
aten::add 1.50% 188.607us 3.77% 475.033us 26.391us 103.232us 1.97% 103.232us 5.735us 18
|
| 4053 |
+
aten::index 1.58% 199.165us 2.64% 332.439us 27.703us 88.193us 1.69% 88.193us 7.349us 12
|
| 4054 |
+
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 80.832us 1.55% 80.832us 6.736us 12
|
| 4055 |
+
aten::index_add_ 0.43% 54.021us 0.70% 88.353us 14.726us 79.361us 1.52% 79.361us 13.227us 6
|
| 4056 |
+
void at::native::indexFuncLargeIndex<float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 79.361us 1.52% 79.361us 13.227us 6
|
| 4057 |
+
aten::nonzero 2.14% 269.616us 6.31% 796.127us 88.459us 63.904us 1.22% 74.560us 8.284us 9
|
| 4058 |
+
aten::clamp 0.90% 113.849us 1.52% 191.573us 15.964us 63.523us 1.21% 63.523us 5.294us 12
|
| 4059 |
+
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 63.523us 1.21% 63.523us 5.294us 12
|
| 4060 |
+
void at::native::vectorized_gather_kernel<16, long>(... 0.00% 0.000us 0.00% 0.000us 0.000us 60.767us 1.16% 60.767us 10.128us 6
|
| 4061 |
+
aten::where 0.06% 7.630us 5.01% 631.874us 105.312us 0.000us 0.00% 60.384us 10.064us 6
|
| 4062 |
+
aten::nonzero_numpy 0.10% 12.751us 4.95% 624.244us 104.041us 0.000us 0.00% 60.384us 10.064us 6
|
| 4063 |
+
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 56.992us 1.09% 56.992us 4.749us 12
|
| 4064 |
+
Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 50.880us 0.97% 50.880us 1.131us 45
|
| 4065 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4066 |
+
Self CPU time total: 12.609ms
|
| 4067 |
+
Self CUDA time total: 5.230ms
|
| 4068 |
|
| 4069 |
|
| 4070 |
|
|
|
|
| 4074 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4075 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4076 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4077 |
+
gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 14.418ms 232.27% 14.418ms 14.418ms 1
|
| 4078 |
+
gpt_oss_experts 16.77% 2.777ms 99.97% 16.548ms 16.548ms 0.000us 0.00% 6.210ms 6.210ms 1
|
| 4079 |
+
aten::matmul 0.29% 47.549us 4.87% 805.573us 33.566us 0.000us 0.00% 5.399ms 224.951us 24
|
| 4080 |
+
aten::mm 2.86% 473.570us 4.58% 758.024us 31.584us 5.399ms 86.98% 5.399ms 224.951us 24
|
| 4081 |
+
ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 5.343ms 86.07% 5.343ms 222.609us 24
|
| 4082 |
+
aten::nonzero 2.46% 406.423us 7.79% 1.290ms 85.983us 112.737us 1.82% 135.233us 9.016us 15
|
| 4083 |
+
aten::mul 1.91% 315.499us 3.27% 541.644us 11.284us 131.458us 2.12% 131.458us 2.739us 48
|
| 4084 |
+
aten::add 2.09% 345.610us 3.58% 592.305us 16.453us 127.137us 2.05% 127.137us 3.532us 36
|
| 4085 |
+
aten::where 0.07% 11.421us 7.35% 1.217ms 101.380us 0.000us 0.00% 121.345us 10.112us 12
|
| 4086 |
+
aten::nonzero_numpy 0.14% 22.419us 7.28% 1.205ms 100.429us 0.000us 0.00% 121.345us 10.112us 12
|
| 4087 |
+
aten::index 2.37% 392.707us 3.98% 658.793us 27.450us 110.560us 1.78% 110.560us 4.607us 24
|
| 4088 |
+
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 101.089us 1.63% 101.089us 4.212us 24
|
| 4089 |
+
Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 91.523us 1.47% 91.523us 1.052us 87
|
| 4090 |
+
aten::clamp 1.31% 216.727us 2.19% 362.649us 15.110us 87.299us 1.41% 87.299us 3.637us 24
|
| 4091 |
+
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 87.299us 1.41% 87.299us 3.637us 24
|
| 4092 |
+
aten::item 0.49% 80.385us 37.67% 6.235ms 86.604us 0.000us 0.00% 75.204us 1.044us 72
|
| 4093 |
+
aten::_local_scalar_dense 1.99% 329.728us 37.18% 6.155ms 85.487us 75.204us 1.21% 75.204us 1.044us 72
|
| 4094 |
+
aten::index_add_ 0.56% 93.084us 0.97% 160.623us 13.385us 71.618us 1.15% 71.618us 5.968us 12
|
| 4095 |
+
void at::native::indexFuncLargeIndex<float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 71.618us 1.15% 71.618us 5.968us 12
|
| 4096 |
+
void at::native::vectorized_gather_kernel<16, long>(... 0.00% 0.000us 0.00% 0.000us 0.000us 66.656us 1.07% 66.656us 5.555us 12
|
| 4097 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4098 |
+
Self CPU time total: 16.554ms
|
| 4099 |
+
Self CUDA time total: 6.207ms
|
| 4100 |
|
| 4101 |
|
| 4102 |
|
|
|
|
| 4106 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4107 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4108 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4109 |
+
gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 12.715ms 148.98% 12.715ms 12.715ms 1
|
| 4110 |
+
gpt_oss_experts 13.24% 1.769ms 99.96% 13.348ms 13.348ms 0.000us 0.00% 8.540ms 8.540ms 1
|
| 4111 |
+
aten::matmul 0.18% 23.619us 3.35% 447.210us 37.267us 0.000us 0.00% 7.511ms 625.895us 12
|
| 4112 |
+
aten::mm 1.99% 265.185us 3.17% 423.591us 35.299us 7.511ms 88.01% 7.511ms 625.895us 12
|
| 4113 |
+
void cutlass::Kernel2<cutlass_80_simt_sgemm_256x128_... 0.00% 0.000us 0.00% 0.000us 0.000us 4.572ms 53.58% 4.572ms 762.082us 6
|
| 4114 |
+
ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 1.497ms 17.54% 1.497ms 498.892us 3
|
| 4115 |
+
void cutlass::Kernel2<cutlass_80_simt_sgemm_128x64_8... 0.00% 0.000us 0.00% 0.000us 0.000us 1.435ms 16.81% 1.435ms 478.305us 3
|
| 4116 |
+
aten::mul 1.21% 162.011us 2.06% 274.994us 11.458us 197.600us 2.32% 197.600us 8.233us 24
|
| 4117 |
+
aten::add 1.32% 176.183us 2.25% 300.545us 16.697us 188.546us 2.21% 188.546us 10.475us 18
|
| 4118 |
+
aten::index_add_ 0.35% 46.949us 0.64% 86.050us 14.342us 164.416us 1.93% 164.416us 27.403us 6
|
| 4119 |
+
void at::native::indexFuncLargeIndex<float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 164.416us 1.93% 164.416us 27.403us 6
|
| 4120 |
+
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 149.442us 1.75% 149.442us 12.453us 12
|
| 4121 |
+
aten::index 1.39% 185.093us 2.39% 318.747us 26.562us 146.144us 1.71% 146.144us 12.179us 12
|
| 4122 |
+
void at::native::vectorized_gather_kernel<16, long>(... 0.00% 0.000us 0.00% 0.000us 0.000us 116.287us 1.36% 116.287us 19.381us 6
|
| 4123 |
+
aten::clamp 0.82% 108.858us 1.40% 187.503us 15.625us 110.850us 1.30% 110.850us 9.238us 12
|
| 4124 |
+
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 110.850us 1.30% 110.850us 9.238us 12
|
| 4125 |
+
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 104.960us 1.23% 104.960us 8.747us 12
|
| 4126 |
+
aten::nonzero 1.82% 243.314us 5.65% 754.927us 83.881us 69.183us 0.81% 80.703us 8.967us 9
|
| 4127 |
+
aten::where 0.04% 5.842us 4.63% 617.944us 102.991us 0.000us 0.00% 66.080us 11.013us 6
|
| 4128 |
+
aten::nonzero_numpy 0.08% 11.348us 4.58% 612.102us 102.017us 0.000us 0.00% 66.080us 11.013us 6
|
| 4129 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4130 |
+
Self CPU time total: 13.354ms
|
| 4131 |
+
Self CUDA time total: 8.534ms
|
| 4132 |
|
| 4133 |
|
| 4134 |
|
|
|
|
| 4138 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4139 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4140 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4141 |
+
gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 18.371ms 173.06% 18.371ms 18.371ms 1
|
| 4142 |
+
gpt_oss_experts 12.78% 2.670ms 99.97% 20.895ms 20.895ms 0.000us 0.00% 10.621ms 10.621ms 1
|
| 4143 |
+
aten::matmul 0.23% 47.482us 3.94% 823.658us 34.319us 0.000us 0.00% 9.337ms 389.038us 24
|
| 4144 |
+
aten::mm 2.27% 474.301us 3.71% 776.176us 32.341us 9.337ms 87.96% 9.337ms 389.038us 24
|
| 4145 |
+
ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 6.375ms 60.06% 6.375ms 354.186us 18
|
| 4146 |
+
void cutlass::Kernel2<cutlass_80_simt_sgemm_128x64_8... 0.00% 0.000us 0.00% 0.000us 0.000us 2.948ms 27.77% 2.948ms 491.399us 6
|
| 4147 |
+
aten::mul 1.63% 341.535us 2.71% 565.653us 11.784us 233.052us 2.20% 233.052us 4.855us 48
|
| 4148 |
+
aten::add 1.65% 343.966us 2.82% 589.773us 16.383us 214.333us 2.02% 214.333us 5.954us 36
|
| 4149 |
+
aten::index 1.71% 356.851us 2.95% 617.053us 25.711us 204.352us 1.93% 204.352us 8.515us 24
|
| 4150 |
+
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 167.774us 1.58% 167.774us 6.991us 24
|
| 4151 |
+
aten::index_add_ 0.45% 94.502us 0.77% 161.933us 13.494us 156.322us 1.47% 156.322us 13.027us 12
|
| 4152 |
+
void at::native::indexFuncLargeIndex<float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 156.322us 1.47% 156.322us 13.027us 12
|
| 4153 |
+
aten::nonzero 1.91% 398.170us 6.16% 1.287ms 85.805us 122.527us 1.15% 147.135us 9.809us 15
|
| 4154 |
+
void at::native::vectorized_gather_kernel<16, long>(... 0.00% 0.000us 0.00% 0.000us 0.000us 146.240us 1.38% 146.240us 12.187us 12
|
| 4155 |
+
aten::clamp 1.04% 217.693us 1.76% 368.516us 15.355us 133.438us 1.26% 133.438us 5.560us 24
|
| 4156 |
+
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 133.438us 1.26% 133.438us 5.560us 24
|
| 4157 |
+
aten::where 0.05% 11.100us 5.81% 1.214ms 101.204us 0.000us 0.00% 132.577us 11.048us 12
|
| 4158 |
+
aten::nonzero_numpy 0.10% 21.341us 5.76% 1.203ms 100.279us 0.000us 0.00% 132.577us 11.048us 12
|
| 4159 |
+
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 119.358us 1.12% 119.358us 4.973us 24
|
| 4160 |
+
Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 108.671us 1.02% 108.671us 1.249us 87
|
| 4161 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4162 |
+
Self CPU time total: 20.901ms
|
| 4163 |
+
Self CUDA time total: 10.615ms
|
| 4164 |
|
| 4165 |
|
| 4166 |
|
|
|
|
| 4170 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4171 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4172 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4173 |
+
gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 21.147ms 120.13% 21.147ms 21.147ms 1
|
| 4174 |
+
gpt_oss_experts 7.48% 1.759ms 99.98% 23.501ms 23.501ms 0.000us 0.00% 17.613ms 17.613ms 1
|
| 4175 |
+
aten::matmul 0.10% 24.413us 1.93% 452.632us 37.719us 0.000us 0.00% 14.754ms 1.229ms 12
|
| 4176 |
+
aten::mm 1.14% 267.578us 1.82% 428.219us 35.685us 14.754ms 83.81% 14.754ms 1.229ms 12
|
| 4177 |
+
void cutlass::Kernel2<cutlass_80_simt_sgemm_256x128_... 0.00% 0.000us 0.00% 0.000us 0.000us 9.005ms 51.15% 9.005ms 1.501ms 6
|
| 4178 |
+
ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 5.740ms 32.61% 5.740ms 956.646us 6
|
| 4179 |
+
aten::add 0.80% 187.171us 1.34% 315.717us 17.540us 774.145us 4.40% 774.145us 43.008us 18
|
| 4180 |
+
aten::mul 0.68% 160.882us 1.16% 272.615us 11.359us 660.967us 3.75% 660.967us 27.540us 24
|
| 4181 |
+
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 494.017us 2.81% 494.017us 41.168us 12
|
| 4182 |
+
aten::index_add_ 0.20% 46.930us 0.35% 82.651us 13.775us 446.818us 2.54% 446.818us 74.470us 6
|
| 4183 |
+
void at::native::indexFuncLargeIndex<float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 446.818us 2.54% 446.818us 74.470us 6
|
| 4184 |
+
aten::clamp 0.49% 114.212us 0.82% 193.704us 16.142us 330.081us 1.88% 330.081us 27.507us 12
|
| 4185 |
+
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 330.081us 1.88% 330.081us 27.507us 12
|
| 4186 |
+
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 303.524us 1.72% 303.524us 50.587us 6
|
| 4187 |
+
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 280.128us 1.59% 280.128us 46.688us 6
|
| 4188 |
+
aten::index 0.79% 185.142us 1.34% 314.927us 26.244us 260.002us 1.48% 260.002us 21.667us 12
|
| 4189 |
+
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 255.139us 1.45% 255.139us 21.262us 12
|
| 4190 |
+
void at::native::vectorized_gather_kernel<16, long>(... 0.00% 0.000us 0.00% 0.000us 0.000us 227.361us 1.29% 227.361us 37.894us 6
|
| 4191 |
+
aten::sigmoid 0.17% 39.139us 0.29% 67.081us 11.180us 175.681us 1.00% 175.681us 29.280us 6
|
| 4192 |
+
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 175.681us 1.00% 175.681us 29.280us 6
|
| 4193 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4194 |
+
Self CPU time total: 23.507ms
|
| 4195 |
+
Self CUDA time total: 17.603ms
|
| 4196 |
|
| 4197 |
|
| 4198 |
|
|
|
|
| 4202 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4203 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4204 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4205 |
+
gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 24.812ms 140.52% 24.812ms 24.812ms 1
|
| 4206 |
+
gpt_oss_experts 10.20% 2.768ms 99.98% 27.139ms 27.139ms 0.000us 0.00% 17.668ms 17.668ms 1
|
| 4207 |
+
aten::matmul 0.17% 47.070us 3.25% 881.530us 36.730us 0.000us 0.00% 15.436ms 643.168us 24
|
| 4208 |
+
aten::mm 1.94% 525.958us 3.07% 834.460us 34.769us 15.436ms 87.42% 15.436ms 643.168us 24
|
| 4209 |
+
void cutlass::Kernel2<cutlass_80_simt_sgemm_256x128_... 0.00% 0.000us 0.00% 0.000us 0.000us 9.298ms 52.66% 9.298ms 774.816us 12
|
| 4210 |
+
ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 3.185ms 18.04% 3.185ms 530.803us 6
|
| 4211 |
+
void cutlass::Kernel2<cutlass_80_simt_sgemm_128x64_8... 0.00% 0.000us 0.00% 0.000us 0.000us 2.939ms 16.65% 2.939ms 489.897us 6
|
| 4212 |
+
aten::add 1.32% 358.751us 2.25% 610.989us 16.972us 429.537us 2.43% 429.537us 11.932us 36
|
| 4213 |
+
aten::mul 1.17% 318.045us 2.01% 546.157us 11.378us 419.555us 2.38% 419.555us 8.741us 48
|
| 4214 |
+
aten::index_add_ 0.35% 93.791us 0.61% 165.384us 13.782us 375.712us 2.13% 375.712us 31.309us 12
|
| 4215 |
+
void at::native::indexFuncLargeIndex<float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 375.712us 2.13% 375.712us 31.309us 12
|
| 4216 |
+
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 344.672us 1.95% 344.672us 14.361us 24
|
| 4217 |
+
aten::index 1.36% 368.555us 2.35% 637.581us 26.566us 343.779us 1.95% 343.779us 14.324us 24
|
| 4218 |
+
void at::native::vectorized_gather_kernel<16, long>(... 0.00% 0.000us 0.00% 0.000us 0.000us 279.524us 1.58% 279.524us 23.294us 12
|
| 4219 |
+
aten::clamp 0.81% 220.839us 1.38% 373.627us 15.568us 232.100us 1.31% 232.100us 9.671us 24
|
| 4220 |
+
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 232.100us 1.31% 232.100us 9.671us 24
|
| 4221 |
+
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 222.273us 1.26% 222.273us 9.261us 24
|
| 4222 |
+
aten::nonzero 1.49% 404.133us 4.81% 1.304ms 86.953us 129.285us 0.73% 155.591us 10.373us 15
|
| 4223 |
+
aten::where 0.04% 11.801us 4.54% 1.232ms 102.652us 0.000us 0.00% 140.134us 11.678us 12
|
| 4224 |
+
aten::nonzero_numpy 0.08% 22.919us 4.49% 1.220ms 101.669us 0.000us 0.00% 140.134us 11.678us 12
|
| 4225 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4226 |
+
Self CPU time total: 27.144ms
|
| 4227 |
+
Self CUDA time total: 17.658ms
|
| 4228 |
|
| 4229 |
|
| 4230 |
|
|
|
|
| 4234 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4235 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4236 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4237 |
+
gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 40.716ms 109.44% 40.716ms 40.716ms 1
|
| 4238 |
+
gpt_oss_experts 4.15% 1.782ms 99.82% 42.848ms 42.848ms 0.000us 0.00% 37.235ms 37.235ms 1
|
| 4239 |
+
aten::matmul 0.05% 22.008us 1.00% 427.588us 35.632us 0.000us 0.00% 27.249ms 2.271ms 12
|
| 4240 |
+
aten::mm 0.64% 276.436us 0.94% 405.580us 33.798us 27.249ms 73.24% 27.249ms 2.271ms 12
|
| 4241 |
+
void cutlass::Kernel2<cutlass_80_simt_sgemm_256x128_... 0.00% 0.000us 0.00% 0.000us 0.000us 27.245ms 73.23% 27.245ms 2.270ms 12
|
| 4242 |
+
aten::mul 0.38% 162.893us 0.65% 277.866us 11.578us 2.967ms 7.97% 2.967ms 123.619us 24
|
| 4243 |
+
aten::add 0.45% 194.205us 1.07% 458.802us 25.489us 2.398ms 6.45% 2.398ms 133.242us 18
|
| 4244 |
+
aten::clamp 0.26% 112.402us 0.45% 191.453us 15.954us 2.384ms 6.41% 2.384ms 198.708us 12
|
| 4245 |
+
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 2.384ms 6.41% 2.384ms 198.708us 12
|
| 4246 |
+
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 1.985ms 5.34% 1.985ms 165.412us 12
|
| 4247 |
+
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 1.626ms 4.37% 1.626ms 135.484us 12
|
| 4248 |
+
aten::index_add_ 0.11% 46.550us 0.19% 83.331us 13.889us 923.493us 2.48% 923.493us 153.916us 6
|
| 4249 |
+
void at::native::indexFuncLargeIndex<float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 923.493us 2.48% 923.493us 153.916us 6
|
| 4250 |
+
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 772.550us 2.08% 772.550us 128.758us 6
|
| 4251 |
+
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 736.421us 1.98% 736.421us 122.737us 6
|
| 4252 |
+
aten::index 0.43% 184.050us 0.73% 314.765us 26.230us 705.700us 1.90% 705.700us 58.808us 12
|
| 4253 |
+
void at::native::vectorized_gather_kernel<16, long>(... 0.00% 0.000us 0.00% 0.000us 0.000us 672.068us 1.81% 672.068us 112.011us 6
|
| 4254 |
+
aten::sigmoid 0.09% 40.702us 0.16% 68.501us 11.417us 324.705us 0.87% 324.705us 54.117us 6
|
| 4255 |
+
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 324.705us 0.87% 324.705us 54.117us 6
|
| 4256 |
+
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 245.504us 0.66% 245.504us 40.917us 6
|
| 4257 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4258 |
+
Self CPU time total: 42.926ms
|
| 4259 |
+
Self CUDA time total: 37.203ms
|
| 4260 |
|
| 4261 |
|
| 4262 |
|
|
|
|
| 4266 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4267 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4268 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4269 |
+
gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 41.326ms 117.97% 41.326ms 41.326ms 1
|
| 4270 |
+
gpt_oss_experts 6.48% 2.843ms 99.99% 43.865ms 43.865ms 0.000us 0.00% 35.050ms 35.050ms 1
|
| 4271 |
+
aten::matmul 0.11% 47.091us 2.05% 900.896us 37.537us 0.000us 0.00% 29.086ms 1.212ms 24
|
| 4272 |
+
aten::mm 1.22% 537.124us 1.95% 853.805us 35.575us 29.086ms 83.03% 29.086ms 1.212ms 24
|
| 4273 |
+
void cutlass::Kernel2<cutlass_80_simt_sgemm_256x128_... 0.00% 0.000us 0.00% 0.000us 0.000us 20.524ms 58.59% 20.524ms 1.368ms 15
|
| 4274 |
+
ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 8.546ms 24.39% 8.546ms 949.503us 9
|
| 4275 |
+
aten::add 0.83% 362.842us 1.41% 616.516us 17.125us 1.481ms 4.23% 1.481ms 41.132us 36
|
| 4276 |
+
aten::mul 0.72% 316.599us 1.22% 535.905us 11.165us 1.379ms 3.94% 1.379ms 28.736us 48
|
| 4277 |
+
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 928.582us 2.65% 928.582us 38.691us 24
|
| 4278 |
+
aten::index_add_ 0.22% 95.553us 0.38% 168.433us 14.036us 914.346us 2.61% 914.346us 76.195us 12
|
| 4279 |
+
void at::native::indexFuncLargeIndex<float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 914.346us 2.61% 914.346us 76.195us 12
|
| 4280 |
+
aten::clamp 0.51% 224.207us 0.87% 380.890us 15.870us 772.996us 2.21% 772.996us 32.208us 24
|
| 4281 |
+
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 772.996us 2.21% 772.996us 32.208us 24
|
| 4282 |
+
aten::index 0.86% 378.436us 1.47% 642.801us 26.783us 657.670us 1.88% 657.670us 27.403us 24
|
| 4283 |
+
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 653.092us 1.86% 653.092us 54.424us 12
|
| 4284 |
+
void at::native::vectorized_gather_kernel<16, long>(... 0.00% 0.000us 0.00% 0.000us 0.000us 586.630us 1.67% 586.630us 48.886us 12
|
| 4285 |
+
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 552.162us 1.58% 552.162us 46.014us 12
|
| 4286 |
+
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 522.306us 1.49% 522.306us 21.763us 24
|
| 4287 |
+
aten::sigmoid 0.20% 86.392us 0.33% 145.153us 12.096us 354.306us 1.01% 354.306us 29.525us 12
|
| 4288 |
+
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 354.306us 1.01% 354.306us 29.525us 12
|
| 4289 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4290 |
+
Self CPU time total: 43.870ms
|
| 4291 |
+
Self CUDA time total: 35.030ms
|
| 4292 |
|
| 4293 |
|
| 4294 |
impl wl p50(ms) ok
|
| 4295 |
+
gpt_oss_experts cuda_B1_S1024_E2 3.87 True
|
| 4296 |
+
gpt_oss_experts cuda_B1_S1024_E4 5.34 True
|
| 4297 |
+
gpt_oss_experts cuda_B1_S512_E2 2.66 True
|
| 4298 |
+
gpt_oss_experts cuda_B1_S512_E4 3.95 True
|
| 4299 |
+
gpt_oss_experts cuda_B4_S1024_E2 13.39 True
|
| 4300 |
+
gpt_oss_experts cuda_B4_S1024_E4 13.41 True
|
| 4301 |
+
gpt_oss_experts cuda_B4_S512_E2 6.80 True
|
| 4302 |
+
gpt_oss_experts cuda_B4_S512_E4 7.53 True
|
| 4303 |
</pre></div>
|
| 4304 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 4305 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
|
|
|
| 4308 |
Updated https://github.com/huggingface/kernels.git (55b7c980e96bf5f747f0e4136be61c0b089ab76c)
|
| 4309 |
Building kernels @ git+https://github.com/huggingface/kernels.git@55b7c980e96bf5f747f0e4136be61c0b089ab76c
|
| 4310 |
Built kernels @ git+https://github.com/huggingface/kernels.git@55b7c980e96bf5f747f0e4136be61c0b089ab76c
|
| 4311 |
+
Installed 14 packages in 4ms
|
| 4312 |
</div>
|
| 4313 |
</div>
|
| 4314 |
<div class="cell-stderr">Fetching 6 files: 0%| | 0/6 [00:00<?, ?it/s]
|
| 4315 |
+
Fetching 6 files: 50%|█████ | 3/6 [00:00<00:00, 5.85it/s]
|
| 4316 |
+
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 11.70it/s]</div>
|
| 4317 |
<div class="cell-artifacts">
|
| 4318 |
<h4>Artifacts:</h4>
|
| 4319 |
<a href="artifacts/benchmark/openai_moe.jsonl" class="artifact" target="_blank">openai_moe.jsonl</a>
|
openai_moe/results/artifacts/combine/latency.svg
CHANGED
|
|
Git LFS Details
|
|
|
Git LFS Details
|
openai_moe/results/combined_results.html
CHANGED
|
@@ -3889,7 +3889,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 3889 |
<rdf:RDF>
|
| 3890 |
<ns2:Work>
|
| 3891 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 3892 |
-
<dc:date>2025-12-
|
| 3893 |
<dc:format>image/svg+xml</dc:format>
|
| 3894 |
<dc:creator>
|
| 3895 |
<ns2:Agent>
|
|
@@ -3908,294 +3908,320 @@ body[data-tool="eraser"] .main-content {
|
|
| 3908 |
</g>
|
| 3909 |
<g id="axes--1" class="axes">
|
| 3910 |
<g id="patch_2">
|
| 3911 |
-
<path d="M 57.
|
| 3912 |
</g>
|
| 3913 |
<g id="matplotlib.axis_1">
|
| 3914 |
<g id="xtick_1">
|
| 3915 |
<g id="grid-x--1" class="grid grid-x">
|
| 3916 |
-
<path d="M 93.
|
| 3917 |
</g>
|
| 3918 |
<g id="line2d_1">
|
| 3919 |
<defs>
|
| 3920 |
<path id="mafb3703e5b" d="M 0 0 L 0 3.5 " style="stroke: #000000; stroke-width: 0.8" />
|
| 3921 |
</defs>
|
| 3922 |
<g>
|
| 3923 |
-
<use ns4:href="#mafb3703e5b" x="93.
|
| 3924 |
</g>
|
| 3925 |
</g>
|
| 3926 |
<g id="text_1">
|
| 3927 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(62.
|
| 3928 |
</g>
|
| 3929 |
</g>
|
| 3930 |
<g id="xtick_2">
|
| 3931 |
<g id="grid-x--2" class="grid grid-x">
|
| 3932 |
-
<path d="M 195.
|
| 3933 |
</g>
|
| 3934 |
<g id="line2d_2">
|
| 3935 |
<g>
|
| 3936 |
-
<use ns4:href="#mafb3703e5b" x="195.
|
| 3937 |
</g>
|
| 3938 |
</g>
|
| 3939 |
<g id="text_2">
|
| 3940 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(165.
|
| 3941 |
</g>
|
| 3942 |
</g>
|
| 3943 |
<g id="xtick_3">
|
| 3944 |
<g id="grid-x--3" class="grid grid-x">
|
| 3945 |
-
<path d="M 297.
|
| 3946 |
</g>
|
| 3947 |
<g id="line2d_3">
|
| 3948 |
<g>
|
| 3949 |
-
<use ns4:href="#mafb3703e5b" x="297.
|
| 3950 |
</g>
|
| 3951 |
</g>
|
| 3952 |
<g id="text_3">
|
| 3953 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(265.
|
| 3954 |
</g>
|
| 3955 |
</g>
|
| 3956 |
<g id="xtick_4">
|
| 3957 |
<g id="grid-x--4" class="grid grid-x">
|
| 3958 |
-
<path d="M 400.
|
| 3959 |
</g>
|
| 3960 |
<g id="line2d_4">
|
| 3961 |
<g>
|
| 3962 |
-
<use ns4:href="#mafb3703e5b" x="400.
|
| 3963 |
</g>
|
| 3964 |
</g>
|
| 3965 |
<g id="text_4">
|
| 3966 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(367.
|
| 3967 |
</g>
|
| 3968 |
</g>
|
| 3969 |
<g id="xtick_5">
|
| 3970 |
<g id="grid-x--5" class="grid grid-x">
|
| 3971 |
-
<path d="M 502.
|
| 3972 |
</g>
|
| 3973 |
<g id="line2d_5">
|
| 3974 |
<g>
|
| 3975 |
-
<use ns4:href="#mafb3703e5b" x="502.
|
| 3976 |
</g>
|
| 3977 |
</g>
|
| 3978 |
<g id="text_5">
|
| 3979 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(472.
|
| 3980 |
</g>
|
| 3981 |
</g>
|
| 3982 |
<g id="xtick_6">
|
| 3983 |
<g id="grid-x--6" class="grid grid-x">
|
| 3984 |
-
<path d="M 605.
|
| 3985 |
</g>
|
| 3986 |
<g id="line2d_6">
|
| 3987 |
<g>
|
| 3988 |
-
<use ns4:href="#mafb3703e5b" x="605.
|
| 3989 |
</g>
|
| 3990 |
</g>
|
| 3991 |
<g id="text_6">
|
| 3992 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(574.
|
| 3993 |
</g>
|
| 3994 |
</g>
|
| 3995 |
<g id="xtick_7">
|
| 3996 |
<g id="grid-x--7" class="grid grid-x">
|
| 3997 |
-
<path d="M 707.
|
| 3998 |
</g>
|
| 3999 |
<g id="line2d_7">
|
| 4000 |
<g>
|
| 4001 |
-
<use ns4:href="#mafb3703e5b" x="707.
|
| 4002 |
</g>
|
| 4003 |
</g>
|
| 4004 |
<g id="text_7">
|
| 4005 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(675.
|
| 4006 |
</g>
|
| 4007 |
</g>
|
| 4008 |
<g id="xtick_8">
|
| 4009 |
<g id="grid-x--8" class="grid grid-x">
|
| 4010 |
-
<path d="M 809.
|
| 4011 |
</g>
|
| 4012 |
<g id="line2d_8">
|
| 4013 |
<g>
|
| 4014 |
-
<use ns4:href="#mafb3703e5b" x="809.
|
| 4015 |
</g>
|
| 4016 |
</g>
|
| 4017 |
<g id="text_8">
|
| 4018 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(777.
|
| 4019 |
</g>
|
| 4020 |
</g>
|
| 4021 |
<g id="label--x" class="xlabel">
|
| 4022 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="451.
|
| 4023 |
</g>
|
| 4024 |
</g>
|
| 4025 |
<g id="matplotlib.axis_2">
|
| 4026 |
<g id="ytick_1">
|
| 4027 |
<g id="grid-y--2" class="grid grid-y">
|
| 4028 |
-
<path d="M 57.
|
| 4029 |
</g>
|
| 4030 |
<g id="line2d_9">
|
| 4031 |
<defs>
|
| 4032 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4033 |
</defs>
|
| 4034 |
<g>
|
| 4035 |
-
<use ns4:href="#m0fca2865ba" x="57.
|
| 4036 |
</g>
|
| 4037 |
</g>
|
| 4038 |
<g id="text_9">
|
| 4039 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.
|
| 4040 |
</g>
|
| 4041 |
</g>
|
| 4042 |
<g id="ytick_2">
|
| 4043 |
<g id="grid-y--3" class="grid grid-y">
|
| 4044 |
-
<path d="M 57.
|
| 4045 |
</g>
|
| 4046 |
<g id="line2d_10">
|
| 4047 |
<g>
|
| 4048 |
-
<use ns4:href="#m0fca2865ba" x="57.
|
| 4049 |
</g>
|
| 4050 |
</g>
|
| 4051 |
<g id="text_10">
|
| 4052 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.
|
| 4053 |
</g>
|
| 4054 |
</g>
|
| 4055 |
<g id="ytick_3">
|
| 4056 |
<g id="grid-y--4" class="grid grid-y">
|
| 4057 |
-
<path d="M 57.
|
| 4058 |
</g>
|
| 4059 |
<g id="line2d_11">
|
| 4060 |
<g>
|
| 4061 |
-
<use ns4:href="#m0fca2865ba" x="57.
|
| 4062 |
</g>
|
| 4063 |
</g>
|
| 4064 |
<g id="text_11">
|
| 4065 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.
|
| 4066 |
</g>
|
| 4067 |
</g>
|
| 4068 |
<g id="ytick_4">
|
| 4069 |
<g id="grid-y--5" class="grid grid-y">
|
| 4070 |
-
<path d="M 57.
|
| 4071 |
</g>
|
| 4072 |
<g id="line2d_12">
|
| 4073 |
<g>
|
| 4074 |
-
<use ns4:href="#m0fca2865ba" x="57.
|
| 4075 |
</g>
|
| 4076 |
</g>
|
| 4077 |
<g id="text_12">
|
| 4078 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.
|
| 4079 |
</g>
|
| 4080 |
</g>
|
| 4081 |
<g id="ytick_5">
|
| 4082 |
<g id="grid-y--6" class="grid grid-y">
|
| 4083 |
-
<path d="M 57.
|
| 4084 |
</g>
|
| 4085 |
<g id="line2d_13">
|
| 4086 |
<g>
|
| 4087 |
-
<use ns4:href="#m0fca2865ba" x="57.
|
| 4088 |
</g>
|
| 4089 |
</g>
|
| 4090 |
<g id="text_13">
|
| 4091 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.
|
| 4092 |
</g>
|
| 4093 |
</g>
|
| 4094 |
<g id="ytick_6">
|
| 4095 |
<g id="grid-y--7" class="grid grid-y">
|
| 4096 |
-
<path d="M 57.
|
| 4097 |
</g>
|
| 4098 |
<g id="line2d_14">
|
| 4099 |
<g>
|
| 4100 |
-
<use ns4:href="#m0fca2865ba" x="57.
|
| 4101 |
</g>
|
| 4102 |
</g>
|
| 4103 |
<g id="text_14">
|
| 4104 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.
|
| 4105 |
</g>
|
| 4106 |
</g>
|
| 4107 |
<g id="ytick_7">
|
| 4108 |
<g id="grid-y--8" class="grid grid-y">
|
| 4109 |
-
<path d="M 57.
|
| 4110 |
</g>
|
| 4111 |
<g id="line2d_15">
|
| 4112 |
<g>
|
| 4113 |
-
<use ns4:href="#m0fca2865ba" x="57.
|
| 4114 |
</g>
|
| 4115 |
</g>
|
| 4116 |
<g id="text_15">
|
| 4117 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4118 |
</g>
|
| 4119 |
</g>
|
| 4120 |
<g id="label--y" class="ylabel">
|
| 4121 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="18.
|
| 4122 |
</g>
|
| 4123 |
</g>
|
| 4124 |
<g id="series--binned-torch" class="series">
|
| 4125 |
-
<path d="M 93.
|
| 4126 |
<defs>
|
| 4127 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4128 |
</defs>
|
| 4129 |
-
<g clip-path="url(#
|
| 4130 |
-
<use ns4:href="#md7efaf3aec" x="93.
|
| 4131 |
-
<use ns4:href="#md7efaf3aec" x="195.
|
| 4132 |
-
<use ns4:href="#md7efaf3aec" x="297.
|
| 4133 |
-
<use ns4:href="#md7efaf3aec" x="400.
|
| 4134 |
-
<use ns4:href="#md7efaf3aec" x="502.
|
| 4135 |
-
<use ns4:href="#md7efaf3aec" x="605.
|
| 4136 |
-
<use ns4:href="#md7efaf3aec" x="707.
|
| 4137 |
-
<use ns4:href="#md7efaf3aec" x="809.
|
| 4138 |
</g>
|
| 4139 |
</g>
|
| 4140 |
<g id="series--gpt-oss-experts" class="series">
|
| 4141 |
-
<path d="M 93.
|
| 4142 |
<defs>
|
| 4143 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4144 |
</defs>
|
| 4145 |
-
<g clip-path="url(#
|
| 4146 |
-
<use ns4:href="#m9b8c54d372" x="93.
|
| 4147 |
-
<use ns4:href="#m9b8c54d372" x="195.
|
| 4148 |
-
<use ns4:href="#m9b8c54d372" x="297.
|
| 4149 |
-
<use ns4:href="#m9b8c54d372" x="400.
|
| 4150 |
-
<use ns4:href="#m9b8c54d372" x="502.
|
| 4151 |
-
<use ns4:href="#m9b8c54d372" x="605.
|
| 4152 |
-
<use ns4:href="#m9b8c54d372" x="707.
|
| 4153 |
-
<use ns4:href="#m9b8c54d372" x="809.
|
| 4154 |
</g>
|
| 4155 |
</g>
|
| 4156 |
<g id="patch_3">
|
| 4157 |
-
<path d="M 57.
|
| 4158 |
</g>
|
| 4159 |
<g id="patch_4">
|
| 4160 |
<path d="M 845.766818 468.317269 L 845.766818 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
|
| 4161 |
</g>
|
| 4162 |
<g id="patch_5">
|
| 4163 |
-
<path d="M 57.
|
| 4164 |
</g>
|
| 4165 |
<g id="patch_6">
|
| 4166 |
-
<path d="M 57.
|
| 4167 |
</g>
|
| 4168 |
-
<g id="
|
| 4169 |
-
<text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="451.
|
| 4170 |
</g>
|
| 4171 |
<g id="legend" class="legend">
|
| 4172 |
<g id="patch_7">
|
| 4173 |
-
<path d="M 64.
|
| 4174 |
</g>
|
| 4175 |
-
<g id="
|
| 4176 |
-
<path d="M 66.
|
| 4177 |
<g>
|
| 4178 |
-
<use ns4:href="#md7efaf3aec" x="76.
|
| 4179 |
</g>
|
| 4180 |
</g>
|
| 4181 |
<g id="legend-label--binned-torch" class="legend">
|
| 4182 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="94.
|
| 4183 |
</g>
|
| 4184 |
-
<g id="
|
| 4185 |
-
<path d="M 66.
|
| 4186 |
<g>
|
| 4187 |
-
<use ns4:href="#m9b8c54d372" x="76.
|
| 4188 |
</g>
|
| 4189 |
</g>
|
| 4190 |
<g id="legend-label--gpt-oss-experts" class="legend">
|
| 4191 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="94.
|
| 4192 |
</g>
|
| 4193 |
</g>
|
| 4194 |
</g>
|
| 4195 |
</g>
|
| 4196 |
<defs>
|
| 4197 |
-
<clipPath id="
|
| 4198 |
-
<rect x="57.
|
| 4199 |
</clipPath>
|
| 4200 |
</defs>
|
| 4201 |
</svg>
|
|
@@ -4208,7 +4234,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 4208 |
<span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
|
| 4209 |
<span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4210 |
</span> |
|
| 4211 |
-
Cell: combine | 4.
|
| 4212 |
| <button class="run-btn" onclick="runCell('combine')">▶ run</button>
|
| 4213 |
<button class="copy-btn" onclick="copyCell('combine')">Copy</button>
|
| 4214 |
<a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -4297,22 +4323,22 @@ Summary: 2 found, 0 skipped, 0 missing
|
|
| 4297 |
COMBINED BENCHMARK SUMMARY
|
| 4298 |
|
| 4299 |
impl wl p50(ms) ok
|
| 4300 |
-
binned_torch cuda_B1_S1024_E2
|
| 4301 |
-
binned_torch cuda_B1_S1024_E4
|
| 4302 |
-
binned_torch cuda_B1_S512_E2
|
| 4303 |
-
binned_torch cuda_B1_S512_E4
|
| 4304 |
-
binned_torch cuda_B4_S1024_E2
|
| 4305 |
-
binned_torch cuda_B4_S1024_E4
|
| 4306 |
-
binned_torch cuda_B4_S512_E2
|
| 4307 |
-
binned_torch cuda_B4_S512_E4
|
| 4308 |
-
gpt_oss_experts cuda_B1_S1024_E2 3.
|
| 4309 |
-
gpt_oss_experts cuda_B1_S1024_E4 5.
|
| 4310 |
-
gpt_oss_experts cuda_B1_S512_E2 2.
|
| 4311 |
-
gpt_oss_experts cuda_B1_S512_E4 3.
|
| 4312 |
-
gpt_oss_experts cuda_B4_S1024_E2 13.
|
| 4313 |
-
gpt_oss_experts cuda_B4_S1024_E4 13.
|
| 4314 |
-
gpt_oss_experts cuda_B4_S512_E2 6.
|
| 4315 |
-
gpt_oss_experts cuda_B4_S512_E4 7.
|
| 4316 |
|
| 4317 |
GENERATING COMBINED VISUALIZATION
|
| 4318 |
|
|
@@ -4332,7 +4358,7 @@ Implementations included:
|
|
| 4332 |
<div class="uv-install-logs" id="uv-logs-combine">
|
| 4333 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4334 |
<div class="uv-logs-content" style="display: none;">
|
| 4335 |
-
Installed 37 packages in
|
| 4336 |
</div>
|
| 4337 |
</div>
|
| 4338 |
<div class="cell-artifacts">
|
|
@@ -4345,7 +4371,7 @@ Installed 37 packages in 205ms
|
|
| 4345 |
<rdf:RDF>
|
| 4346 |
<ns2:Work>
|
| 4347 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 4348 |
-
<dc:date>2025-12-
|
| 4349 |
<dc:format>image/svg+xml</dc:format>
|
| 4350 |
<dc:creator>
|
| 4351 |
<ns2:Agent>
|
|
@@ -4364,294 +4390,320 @@ Installed 37 packages in 205ms
|
|
| 4364 |
</g>
|
| 4365 |
<g id="axes--1" class="axes">
|
| 4366 |
<g id="patch_2">
|
| 4367 |
-
<path d="M 57.
|
| 4368 |
</g>
|
| 4369 |
<g id="matplotlib.axis_1">
|
| 4370 |
<g id="xtick_1">
|
| 4371 |
<g id="grid-x--1" class="grid grid-x">
|
| 4372 |
-
<path d="M 93.
|
| 4373 |
</g>
|
| 4374 |
<g id="line2d_1">
|
| 4375 |
<defs>
|
| 4376 |
<path id="mafb3703e5b" d="M 0 0 L 0 3.5 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4377 |
</defs>
|
| 4378 |
<g>
|
| 4379 |
-
<use ns4:href="#mafb3703e5b" x="93.
|
| 4380 |
</g>
|
| 4381 |
</g>
|
| 4382 |
<g id="text_1">
|
| 4383 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(62.
|
| 4384 |
</g>
|
| 4385 |
</g>
|
| 4386 |
<g id="xtick_2">
|
| 4387 |
<g id="grid-x--2" class="grid grid-x">
|
| 4388 |
-
<path d="M 195.
|
| 4389 |
</g>
|
| 4390 |
<g id="line2d_2">
|
| 4391 |
<g>
|
| 4392 |
-
<use ns4:href="#mafb3703e5b" x="195.
|
| 4393 |
</g>
|
| 4394 |
</g>
|
| 4395 |
<g id="text_2">
|
| 4396 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(165.
|
| 4397 |
</g>
|
| 4398 |
</g>
|
| 4399 |
<g id="xtick_3">
|
| 4400 |
<g id="grid-x--3" class="grid grid-x">
|
| 4401 |
-
<path d="M 297.
|
| 4402 |
</g>
|
| 4403 |
<g id="line2d_3">
|
| 4404 |
<g>
|
| 4405 |
-
<use ns4:href="#mafb3703e5b" x="297.
|
| 4406 |
</g>
|
| 4407 |
</g>
|
| 4408 |
<g id="text_3">
|
| 4409 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(265.
|
| 4410 |
</g>
|
| 4411 |
</g>
|
| 4412 |
<g id="xtick_4">
|
| 4413 |
<g id="grid-x--4" class="grid grid-x">
|
| 4414 |
-
<path d="M 400.
|
| 4415 |
</g>
|
| 4416 |
<g id="line2d_4">
|
| 4417 |
<g>
|
| 4418 |
-
<use ns4:href="#mafb3703e5b" x="400.
|
| 4419 |
</g>
|
| 4420 |
</g>
|
| 4421 |
<g id="text_4">
|
| 4422 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(367.
|
| 4423 |
</g>
|
| 4424 |
</g>
|
| 4425 |
<g id="xtick_5">
|
| 4426 |
<g id="grid-x--5" class="grid grid-x">
|
| 4427 |
-
<path d="M 502.
|
| 4428 |
</g>
|
| 4429 |
<g id="line2d_5">
|
| 4430 |
<g>
|
| 4431 |
-
<use ns4:href="#mafb3703e5b" x="502.
|
| 4432 |
</g>
|
| 4433 |
</g>
|
| 4434 |
<g id="text_5">
|
| 4435 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(472.
|
| 4436 |
</g>
|
| 4437 |
</g>
|
| 4438 |
<g id="xtick_6">
|
| 4439 |
<g id="grid-x--6" class="grid grid-x">
|
| 4440 |
-
<path d="M 605.
|
| 4441 |
</g>
|
| 4442 |
<g id="line2d_6">
|
| 4443 |
<g>
|
| 4444 |
-
<use ns4:href="#mafb3703e5b" x="605.
|
| 4445 |
</g>
|
| 4446 |
</g>
|
| 4447 |
<g id="text_6">
|
| 4448 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(574.
|
| 4449 |
</g>
|
| 4450 |
</g>
|
| 4451 |
<g id="xtick_7">
|
| 4452 |
<g id="grid-x--7" class="grid grid-x">
|
| 4453 |
-
<path d="M 707.
|
| 4454 |
</g>
|
| 4455 |
<g id="line2d_7">
|
| 4456 |
<g>
|
| 4457 |
-
<use ns4:href="#mafb3703e5b" x="707.
|
| 4458 |
</g>
|
| 4459 |
</g>
|
| 4460 |
<g id="text_7">
|
| 4461 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(675.
|
| 4462 |
</g>
|
| 4463 |
</g>
|
| 4464 |
<g id="xtick_8">
|
| 4465 |
<g id="grid-x--8" class="grid grid-x">
|
| 4466 |
-
<path d="M 809.
|
| 4467 |
</g>
|
| 4468 |
<g id="line2d_8">
|
| 4469 |
<g>
|
| 4470 |
-
<use ns4:href="#mafb3703e5b" x="809.
|
| 4471 |
</g>
|
| 4472 |
</g>
|
| 4473 |
<g id="text_8">
|
| 4474 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(777.
|
| 4475 |
</g>
|
| 4476 |
</g>
|
| 4477 |
<g id="label--x" class="xlabel">
|
| 4478 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="451.
|
| 4479 |
</g>
|
| 4480 |
</g>
|
| 4481 |
<g id="matplotlib.axis_2">
|
| 4482 |
<g id="ytick_1">
|
| 4483 |
<g id="grid-y--2" class="grid grid-y">
|
| 4484 |
-
<path d="M 57.
|
| 4485 |
</g>
|
| 4486 |
<g id="line2d_9">
|
| 4487 |
<defs>
|
| 4488 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4489 |
</defs>
|
| 4490 |
<g>
|
| 4491 |
-
<use ns4:href="#m0fca2865ba" x="57.
|
| 4492 |
</g>
|
| 4493 |
</g>
|
| 4494 |
<g id="text_9">
|
| 4495 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.
|
| 4496 |
</g>
|
| 4497 |
</g>
|
| 4498 |
<g id="ytick_2">
|
| 4499 |
<g id="grid-y--3" class="grid grid-y">
|
| 4500 |
-
<path d="M 57.
|
| 4501 |
</g>
|
| 4502 |
<g id="line2d_10">
|
| 4503 |
<g>
|
| 4504 |
-
<use ns4:href="#m0fca2865ba" x="57.
|
| 4505 |
</g>
|
| 4506 |
</g>
|
| 4507 |
<g id="text_10">
|
| 4508 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.
|
| 4509 |
</g>
|
| 4510 |
</g>
|
| 4511 |
<g id="ytick_3">
|
| 4512 |
<g id="grid-y--4" class="grid grid-y">
|
| 4513 |
-
<path d="M 57.
|
| 4514 |
</g>
|
| 4515 |
<g id="line2d_11">
|
| 4516 |
<g>
|
| 4517 |
-
<use ns4:href="#m0fca2865ba" x="57.
|
| 4518 |
</g>
|
| 4519 |
</g>
|
| 4520 |
<g id="text_11">
|
| 4521 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.
|
| 4522 |
</g>
|
| 4523 |
</g>
|
| 4524 |
<g id="ytick_4">
|
| 4525 |
<g id="grid-y--5" class="grid grid-y">
|
| 4526 |
-
<path d="M 57.
|
| 4527 |
</g>
|
| 4528 |
<g id="line2d_12">
|
| 4529 |
<g>
|
| 4530 |
-
<use ns4:href="#m0fca2865ba" x="57.
|
| 4531 |
</g>
|
| 4532 |
</g>
|
| 4533 |
<g id="text_12">
|
| 4534 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.
|
| 4535 |
</g>
|
| 4536 |
</g>
|
| 4537 |
<g id="ytick_5">
|
| 4538 |
<g id="grid-y--6" class="grid grid-y">
|
| 4539 |
-
<path d="M 57.
|
| 4540 |
</g>
|
| 4541 |
<g id="line2d_13">
|
| 4542 |
<g>
|
| 4543 |
-
<use ns4:href="#m0fca2865ba" x="57.
|
| 4544 |
</g>
|
| 4545 |
</g>
|
| 4546 |
<g id="text_13">
|
| 4547 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.
|
| 4548 |
</g>
|
| 4549 |
</g>
|
| 4550 |
<g id="ytick_6">
|
| 4551 |
<g id="grid-y--7" class="grid grid-y">
|
| 4552 |
-
<path d="M 57.
|
| 4553 |
</g>
|
| 4554 |
<g id="line2d_14">
|
| 4555 |
<g>
|
| 4556 |
-
<use ns4:href="#m0fca2865ba" x="57.
|
| 4557 |
</g>
|
| 4558 |
</g>
|
| 4559 |
<g id="text_14">
|
| 4560 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.
|
| 4561 |
</g>
|
| 4562 |
</g>
|
| 4563 |
<g id="ytick_7">
|
| 4564 |
<g id="grid-y--8" class="grid grid-y">
|
| 4565 |
-
<path d="M 57.
|
| 4566 |
</g>
|
| 4567 |
<g id="line2d_15">
|
| 4568 |
<g>
|
| 4569 |
-
<use ns4:href="#m0fca2865ba" x="57.
|
| 4570 |
</g>
|
| 4571 |
</g>
|
| 4572 |
<g id="text_15">
|
| 4573 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4574 |
</g>
|
| 4575 |
</g>
|
| 4576 |
<g id="label--y" class="ylabel">
|
| 4577 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="18.
|
| 4578 |
</g>
|
| 4579 |
</g>
|
| 4580 |
<g id="series--binned-torch" class="series">
|
| 4581 |
-
<path d="M 93.
|
| 4582 |
<defs>
|
| 4583 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4584 |
</defs>
|
| 4585 |
-
<g clip-path="url(#
|
| 4586 |
-
<use ns4:href="#md7efaf3aec" x="93.
|
| 4587 |
-
<use ns4:href="#md7efaf3aec" x="195.
|
| 4588 |
-
<use ns4:href="#md7efaf3aec" x="297.
|
| 4589 |
-
<use ns4:href="#md7efaf3aec" x="400.
|
| 4590 |
-
<use ns4:href="#md7efaf3aec" x="502.
|
| 4591 |
-
<use ns4:href="#md7efaf3aec" x="605.
|
| 4592 |
-
<use ns4:href="#md7efaf3aec" x="707.
|
| 4593 |
-
<use ns4:href="#md7efaf3aec" x="809.
|
| 4594 |
</g>
|
| 4595 |
</g>
|
| 4596 |
<g id="series--gpt-oss-experts" class="series">
|
| 4597 |
-
<path d="M 93.
|
| 4598 |
<defs>
|
| 4599 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4600 |
</defs>
|
| 4601 |
-
<g clip-path="url(#
|
| 4602 |
-
<use ns4:href="#m9b8c54d372" x="93.
|
| 4603 |
-
<use ns4:href="#m9b8c54d372" x="195.
|
| 4604 |
-
<use ns4:href="#m9b8c54d372" x="297.
|
| 4605 |
-
<use ns4:href="#m9b8c54d372" x="400.
|
| 4606 |
-
<use ns4:href="#m9b8c54d372" x="502.
|
| 4607 |
-
<use ns4:href="#m9b8c54d372" x="605.
|
| 4608 |
-
<use ns4:href="#m9b8c54d372" x="707.
|
| 4609 |
-
<use ns4:href="#m9b8c54d372" x="809.
|
| 4610 |
</g>
|
| 4611 |
</g>
|
| 4612 |
<g id="patch_3">
|
| 4613 |
-
<path d="M 57.
|
| 4614 |
</g>
|
| 4615 |
<g id="patch_4">
|
| 4616 |
<path d="M 845.766818 468.317269 L 845.766818 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
|
| 4617 |
</g>
|
| 4618 |
<g id="patch_5">
|
| 4619 |
-
<path d="M 57.
|
| 4620 |
</g>
|
| 4621 |
<g id="patch_6">
|
| 4622 |
-
<path d="M 57.
|
| 4623 |
</g>
|
| 4624 |
-
<g id="
|
| 4625 |
-
<text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="451.
|
| 4626 |
</g>
|
| 4627 |
<g id="legend" class="legend">
|
| 4628 |
<g id="patch_7">
|
| 4629 |
-
<path d="M 64.
|
| 4630 |
</g>
|
| 4631 |
-
<g id="
|
| 4632 |
-
<path d="M 66.
|
| 4633 |
<g>
|
| 4634 |
-
<use ns4:href="#md7efaf3aec" x="76.
|
| 4635 |
</g>
|
| 4636 |
</g>
|
| 4637 |
<g id="legend-label--binned-torch" class="legend">
|
| 4638 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="94.
|
| 4639 |
</g>
|
| 4640 |
-
<g id="
|
| 4641 |
-
<path d="M 66.
|
| 4642 |
<g>
|
| 4643 |
-
<use ns4:href="#m9b8c54d372" x="76.
|
| 4644 |
</g>
|
| 4645 |
</g>
|
| 4646 |
<g id="legend-label--gpt-oss-experts" class="legend">
|
| 4647 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="94.
|
| 4648 |
</g>
|
| 4649 |
</g>
|
| 4650 |
</g>
|
| 4651 |
</g>
|
| 4652 |
<defs>
|
| 4653 |
-
<clipPath id="
|
| 4654 |
-
<rect x="57.
|
| 4655 |
</clipPath>
|
| 4656 |
</defs>
|
| 4657 |
</svg>
|
|
|
|
| 3889 |
<rdf:RDF>
|
| 3890 |
<ns2:Work>
|
| 3891 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 3892 |
+
<dc:date>2025-12-19T23:02:40.893386</dc:date>
|
| 3893 |
<dc:format>image/svg+xml</dc:format>
|
| 3894 |
<dc:creator>
|
| 3895 |
<ns2:Agent>
|
|
|
|
| 3908 |
</g>
|
| 3909 |
<g id="axes--1" class="axes">
|
| 3910 |
<g id="patch_2">
|
| 3911 |
+
<path d="M 57.26 468.317269 L 845.766818 468.317269 L 845.766818 26.88 L 57.26 26.88 L 57.26 468.317269 z " style="fill: none" />
|
| 3912 |
</g>
|
| 3913 |
<g id="matplotlib.axis_1">
|
| 3914 |
<g id="xtick_1">
|
| 3915 |
<g id="grid-x--1" class="grid grid-x">
|
| 3916 |
+
<path d="M 93.101219 468.317269 L 93.101219 26.88 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 3917 |
</g>
|
| 3918 |
<g id="line2d_1">
|
| 3919 |
<defs>
|
| 3920 |
<path id="mafb3703e5b" d="M 0 0 L 0 3.5 " style="stroke: #000000; stroke-width: 0.8" />
|
| 3921 |
</defs>
|
| 3922 |
<g>
|
| 3923 |
+
<use ns4:href="#mafb3703e5b" x="93.101219" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
|
| 3924 |
</g>
|
| 3925 |
</g>
|
| 3926 |
<g id="text_1">
|
| 3927 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(62.90334 544.791615) rotate(-45)">cuda_B1_S512_E2</text>
|
| 3928 |
</g>
|
| 3929 |
</g>
|
| 3930 |
<g id="xtick_2">
|
| 3931 |
<g id="grid-x--2" class="grid grid-x">
|
| 3932 |
+
<path d="M 195.504702 468.317269 L 195.504702 26.88 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 3933 |
</g>
|
| 3934 |
<g id="line2d_2">
|
| 3935 |
<g>
|
| 3936 |
+
<use ns4:href="#mafb3703e5b" x="195.504702" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
|
| 3937 |
</g>
|
| 3938 |
</g>
|
| 3939 |
<g id="text_2">
|
| 3940 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(165.306823 544.791615) rotate(-45)">cuda_B1_S512_E4</text>
|
| 3941 |
</g>
|
| 3942 |
</g>
|
| 3943 |
<g id="xtick_3">
|
| 3944 |
<g id="grid-x--3" class="grid grid-x">
|
| 3945 |
+
<path d="M 297.908185 468.317269 L 297.908185 26.88 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 3946 |
</g>
|
| 3947 |
<g id="line2d_3">
|
| 3948 |
<g>
|
| 3949 |
+
<use ns4:href="#mafb3703e5b" x="297.908185" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
|
| 3950 |
</g>
|
| 3951 |
</g>
|
| 3952 |
<g id="text_3">
|
| 3953 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(265.460822 549.290582) rotate(-45)">cuda_B1_S1024_E2</text>
|
| 3954 |
</g>
|
| 3955 |
</g>
|
| 3956 |
<g id="xtick_4">
|
| 3957 |
<g id="grid-x--4" class="grid grid-x">
|
| 3958 |
+
<path d="M 400.311668 468.317269 L 400.311668 26.88 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 3959 |
</g>
|
| 3960 |
<g id="line2d_4">
|
| 3961 |
<g>
|
| 3962 |
+
<use ns4:href="#mafb3703e5b" x="400.311668" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
|
| 3963 |
</g>
|
| 3964 |
</g>
|
| 3965 |
<g id="text_4">
|
| 3966 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(367.864305 549.290582) rotate(-45)">cuda_B1_S1024_E4</text>
|
| 3967 |
</g>
|
| 3968 |
</g>
|
| 3969 |
<g id="xtick_5">
|
| 3970 |
<g id="grid-x--5" class="grid grid-x">
|
| 3971 |
+
<path d="M 502.71515 468.317269 L 502.71515 26.88 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 3972 |
</g>
|
| 3973 |
<g id="line2d_5">
|
| 3974 |
<g>
|
| 3975 |
+
<use ns4:href="#mafb3703e5b" x="502.71515" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
|
| 3976 |
</g>
|
| 3977 |
</g>
|
| 3978 |
<g id="text_5">
|
| 3979 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(472.517271 544.791615) rotate(-45)">cuda_B4_S512_E2</text>
|
| 3980 |
</g>
|
| 3981 |
</g>
|
| 3982 |
<g id="xtick_6">
|
| 3983 |
<g id="grid-x--6" class="grid grid-x">
|
| 3984 |
+
<path d="M 605.118633 468.317269 L 605.118633 26.88 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 3985 |
</g>
|
| 3986 |
<g id="line2d_6">
|
| 3987 |
<g>
|
| 3988 |
+
<use ns4:href="#mafb3703e5b" x="605.118633" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
|
| 3989 |
</g>
|
| 3990 |
</g>
|
| 3991 |
<g id="text_6">
|
| 3992 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(574.920754 544.791615) rotate(-45)">cuda_B4_S512_E4</text>
|
| 3993 |
</g>
|
| 3994 |
</g>
|
| 3995 |
<g id="xtick_7">
|
| 3996 |
<g id="grid-x--7" class="grid grid-x">
|
| 3997 |
+
<path d="M 707.522116 468.317269 L 707.522116 26.88 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 3998 |
</g>
|
| 3999 |
<g id="line2d_7">
|
| 4000 |
<g>
|
| 4001 |
+
<use ns4:href="#mafb3703e5b" x="707.522116" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
|
| 4002 |
</g>
|
| 4003 |
</g>
|
| 4004 |
<g id="text_7">
|
| 4005 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(675.074754 549.290582) rotate(-45)">cuda_B4_S1024_E2</text>
|
| 4006 |
</g>
|
| 4007 |
</g>
|
| 4008 |
<g id="xtick_8">
|
| 4009 |
<g id="grid-x--8" class="grid grid-x">
|
| 4010 |
+
<path d="M 809.925599 468.317269 L 809.925599 26.88 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4011 |
</g>
|
| 4012 |
<g id="line2d_8">
|
| 4013 |
<g>
|
| 4014 |
+
<use ns4:href="#mafb3703e5b" x="809.925599" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
|
| 4015 |
</g>
|
| 4016 |
</g>
|
| 4017 |
<g id="text_8">
|
| 4018 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(777.478237 549.290582) rotate(-45)">cuda_B4_S1024_E4</text>
|
| 4019 |
</g>
|
| 4020 |
</g>
|
| 4021 |
<g id="label--x" class="xlabel">
|
| 4022 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="451.513409" y="562.556245" transform="rotate(-0 451.513409 562.556245)">Workload</text>
|
| 4023 |
</g>
|
| 4024 |
</g>
|
| 4025 |
<g id="matplotlib.axis_2">
|
| 4026 |
<g id="ytick_1">
|
| 4027 |
<g id="grid-y--2" class="grid grid-y">
|
| 4028 |
+
<path d="M 57.26 448.91253 L 845.766818 448.91253 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4029 |
</g>
|
| 4030 |
<g id="line2d_9">
|
| 4031 |
<defs>
|
| 4032 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4033 |
</defs>
|
| 4034 |
<g>
|
| 4035 |
+
<use ns4:href="#m0fca2865ba" x="57.26" y="448.91253" style="stroke: #000000; stroke-width: 0.8" />
|
| 4036 |
</g>
|
| 4037 |
</g>
|
| 4038 |
<g id="text_9">
|
| 4039 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="452.711749" transform="rotate(-0 50.26 452.711749)">0</text>
|
| 4040 |
</g>
|
| 4041 |
</g>
|
| 4042 |
<g id="ytick_2">
|
| 4043 |
<g id="grid-y--3" class="grid grid-y">
|
| 4044 |
+
<path d="M 57.26 399.227119 L 845.766818 399.227119 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4045 |
</g>
|
| 4046 |
<g id="line2d_10">
|
| 4047 |
<g>
|
| 4048 |
+
<use ns4:href="#m0fca2865ba" x="57.26" y="399.227119" style="stroke: #000000; stroke-width: 0.8" />
|
| 4049 |
</g>
|
| 4050 |
</g>
|
| 4051 |
<g id="text_10">
|
| 4052 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="403.026338" transform="rotate(-0 50.26 403.026338)">200</text>
|
| 4053 |
</g>
|
| 4054 |
</g>
|
| 4055 |
<g id="ytick_3">
|
| 4056 |
<g id="grid-y--4" class="grid grid-y">
|
| 4057 |
+
<path d="M 57.26 349.541708 L 845.766818 349.541708 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4058 |
</g>
|
| 4059 |
<g id="line2d_11">
|
| 4060 |
<g>
|
| 4061 |
+
<use ns4:href="#m0fca2865ba" x="57.26" y="349.541708" style="stroke: #000000; stroke-width: 0.8" />
|
| 4062 |
</g>
|
| 4063 |
</g>
|
| 4064 |
<g id="text_11">
|
| 4065 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="353.340926" transform="rotate(-0 50.26 353.340926)">400</text>
|
| 4066 |
</g>
|
| 4067 |
</g>
|
| 4068 |
<g id="ytick_4">
|
| 4069 |
<g id="grid-y--5" class="grid grid-y">
|
| 4070 |
+
<path d="M 57.26 299.856296 L 845.766818 299.856296 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4071 |
</g>
|
| 4072 |
<g id="line2d_12">
|
| 4073 |
<g>
|
| 4074 |
+
<use ns4:href="#m0fca2865ba" x="57.26" y="299.856296" style="stroke: #000000; stroke-width: 0.8" />
|
| 4075 |
</g>
|
| 4076 |
</g>
|
| 4077 |
<g id="text_12">
|
| 4078 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="303.655515" transform="rotate(-0 50.26 303.655515)">600</text>
|
| 4079 |
</g>
|
| 4080 |
</g>
|
| 4081 |
<g id="ytick_5">
|
| 4082 |
<g id="grid-y--6" class="grid grid-y">
|
| 4083 |
+
<path d="M 57.26 250.170885 L 845.766818 250.170885 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4084 |
</g>
|
| 4085 |
<g id="line2d_13">
|
| 4086 |
<g>
|
| 4087 |
+
<use ns4:href="#m0fca2865ba" x="57.26" y="250.170885" style="stroke: #000000; stroke-width: 0.8" />
|
| 4088 |
</g>
|
| 4089 |
</g>
|
| 4090 |
<g id="text_13">
|
| 4091 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="253.970104" transform="rotate(-0 50.26 253.970104)">800</text>
|
| 4092 |
</g>
|
| 4093 |
</g>
|
| 4094 |
<g id="ytick_6">
|
| 4095 |
<g id="grid-y--7" class="grid grid-y">
|
| 4096 |
+
<path d="M 57.26 200.485474 L 845.766818 200.485474 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4097 |
</g>
|
| 4098 |
<g id="line2d_14">
|
| 4099 |
<g>
|
| 4100 |
+
<use ns4:href="#m0fca2865ba" x="57.26" y="200.485474" style="stroke: #000000; stroke-width: 0.8" />
|
| 4101 |
</g>
|
| 4102 |
</g>
|
| 4103 |
<g id="text_14">
|
| 4104 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="204.284693" transform="rotate(-0 50.26 204.284693)">1000</text>
|
| 4105 |
</g>
|
| 4106 |
</g>
|
| 4107 |
<g id="ytick_7">
|
| 4108 |
<g id="grid-y--8" class="grid grid-y">
|
| 4109 |
+
<path d="M 57.26 150.800062 L 845.766818 150.800062 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4110 |
</g>
|
| 4111 |
<g id="line2d_15">
|
| 4112 |
<g>
|
| 4113 |
+
<use ns4:href="#m0fca2865ba" x="57.26" y="150.800062" style="stroke: #000000; stroke-width: 0.8" />
|
| 4114 |
</g>
|
| 4115 |
</g>
|
| 4116 |
<g id="text_15">
|
| 4117 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="154.599281" transform="rotate(-0 50.26 154.599281)">1200</text>
|
| 4118 |
+
</g>
|
| 4119 |
+
</g>
|
| 4120 |
+
<g id="ytick_8">
|
| 4121 |
+
<g id="grid-y--9" class="grid grid-y">
|
| 4122 |
+
<path d="M 57.26 101.114651 L 845.766818 101.114651 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4123 |
+
</g>
|
| 4124 |
+
<g id="line2d_16">
|
| 4125 |
+
<g>
|
| 4126 |
+
<use ns4:href="#m0fca2865ba" x="57.26" y="101.114651" style="stroke: #000000; stroke-width: 0.8" />
|
| 4127 |
+
</g>
|
| 4128 |
+
</g>
|
| 4129 |
+
<g id="text_16">
|
| 4130 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="104.91387" transform="rotate(-0 50.26 104.91387)">1400</text>
|
| 4131 |
+
</g>
|
| 4132 |
+
</g>
|
| 4133 |
+
<g id="ytick_9">
|
| 4134 |
+
<g id="grid-y--10" class="grid grid-y">
|
| 4135 |
+
<path d="M 57.26 51.42924 L 845.766818 51.42924 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4136 |
+
</g>
|
| 4137 |
+
<g id="line2d_17">
|
| 4138 |
+
<g>
|
| 4139 |
+
<use ns4:href="#m0fca2865ba" x="57.26" y="51.42924" style="stroke: #000000; stroke-width: 0.8" />
|
| 4140 |
+
</g>
|
| 4141 |
+
</g>
|
| 4142 |
+
<g id="text_17">
|
| 4143 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="55.228459" transform="rotate(-0 50.26 55.228459)">1600</text>
|
| 4144 |
</g>
|
| 4145 |
</g>
|
| 4146 |
<g id="label--y" class="ylabel">
|
| 4147 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="18.730313" y="247.598635" transform="rotate(-90 18.730313 247.598635)">Latency P50 (ms)</text>
|
| 4148 |
</g>
|
| 4149 |
</g>
|
| 4150 |
<g id="series--binned-torch" class="series">
|
| 4151 |
+
<path d="M 93.101219 410.567585 L 195.504702 400.332462 L 297.908185 357.497295 L 400.311668 350.461107 L 502.71515 266.697997 L 605.118633 253.249871 L 707.522116 73.76468 L 809.925599 46.94533 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 4152 |
<defs>
|
| 4153 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4154 |
</defs>
|
| 4155 |
+
<g clip-path="url(#pef1bcf59f7)">
|
| 4156 |
+
<use ns4:href="#md7efaf3aec" x="93.101219" y="410.567585" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4157 |
+
<use ns4:href="#md7efaf3aec" x="195.504702" y="400.332462" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4158 |
+
<use ns4:href="#md7efaf3aec" x="297.908185" y="357.497295" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4159 |
+
<use ns4:href="#md7efaf3aec" x="400.311668" y="350.461107" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4160 |
+
<use ns4:href="#md7efaf3aec" x="502.71515" y="266.697997" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4161 |
+
<use ns4:href="#md7efaf3aec" x="605.118633" y="253.249871" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4162 |
+
<use ns4:href="#md7efaf3aec" x="707.522116" y="73.76468" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4163 |
+
<use ns4:href="#md7efaf3aec" x="809.925599" y="46.94533" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4164 |
</g>
|
| 4165 |
</g>
|
| 4166 |
<g id="series--gpt-oss-experts" class="series">
|
| 4167 |
+
<path d="M 93.101219 448.251939 L 195.504702 447.930293 L 297.908185 447.951398 L 400.311668 447.585894 L 502.71515 447.222062 L 605.118633 447.041869 L 707.522116 445.587165 L 809.925599 445.581303 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 4168 |
<defs>
|
| 4169 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4170 |
</defs>
|
| 4171 |
+
<g clip-path="url(#pef1bcf59f7)">
|
| 4172 |
+
<use ns4:href="#m9b8c54d372" x="93.101219" y="448.251939" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4173 |
+
<use ns4:href="#m9b8c54d372" x="195.504702" y="447.930293" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4174 |
+
<use ns4:href="#m9b8c54d372" x="297.908185" y="447.951398" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4175 |
+
<use ns4:href="#m9b8c54d372" x="400.311668" y="447.585894" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4176 |
+
<use ns4:href="#m9b8c54d372" x="502.71515" y="447.222062" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4177 |
+
<use ns4:href="#m9b8c54d372" x="605.118633" y="447.041869" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4178 |
+
<use ns4:href="#m9b8c54d372" x="707.522116" y="445.587165" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4179 |
+
<use ns4:href="#m9b8c54d372" x="809.925599" y="445.581303" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4180 |
</g>
|
| 4181 |
</g>
|
| 4182 |
<g id="patch_3">
|
| 4183 |
+
<path d="M 57.26 468.317269 L 57.26 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
|
| 4184 |
</g>
|
| 4185 |
<g id="patch_4">
|
| 4186 |
<path d="M 845.766818 468.317269 L 845.766818 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
|
| 4187 |
</g>
|
| 4188 |
<g id="patch_5">
|
| 4189 |
+
<path d="M 57.26 468.317269 L 845.766818 468.317269 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
|
| 4190 |
</g>
|
| 4191 |
<g id="patch_6">
|
| 4192 |
+
<path d="M 57.26 26.88 L 845.766818 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
|
| 4193 |
</g>
|
| 4194 |
+
<g id="text_18">
|
| 4195 |
+
<text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="451.513409" y="20.88" transform="rotate(-0 451.513409 20.88)">Attention Implementation Latency</text>
|
| 4196 |
</g>
|
| 4197 |
<g id="legend" class="legend">
|
| 4198 |
<g id="patch_7">
|
| 4199 |
+
<path d="M 64.26 64.7925 L 177.05375 64.7925 Q 179.05375 64.7925 179.05375 62.7925 L 179.05375 33.88 Q 179.05375 31.88 177.05375 31.88 L 64.26 31.88 Q 62.26 31.88 62.26 33.88 L 62.26 62.7925 Q 62.26 64.7925 64.26 64.7925 L 64.26 64.7925 z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
|
| 4200 |
</g>
|
| 4201 |
+
<g id="line2d_18">
|
| 4202 |
+
<path d="M 66.26 39.978438 L 76.26 39.978438 L 86.26 39.978438 " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 4203 |
<g>
|
| 4204 |
+
<use ns4:href="#md7efaf3aec" x="76.26" y="39.978438" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4205 |
</g>
|
| 4206 |
</g>
|
| 4207 |
<g id="legend-label--binned-torch" class="legend">
|
| 4208 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="94.26" y="43.478438" transform="rotate(-0 94.26 43.478438)">binned_torch</text>
|
| 4209 |
</g>
|
| 4210 |
+
<g id="line2d_19">
|
| 4211 |
+
<path d="M 66.26 54.934687 L 76.26 54.934687 L 86.26 54.934687 " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 4212 |
<g>
|
| 4213 |
+
<use ns4:href="#m9b8c54d372" x="76.26" y="54.934687" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4214 |
</g>
|
| 4215 |
</g>
|
| 4216 |
<g id="legend-label--gpt-oss-experts" class="legend">
|
| 4217 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="94.26" y="58.434687" transform="rotate(-0 94.26 58.434687)">gpt_oss_experts</text>
|
| 4218 |
</g>
|
| 4219 |
</g>
|
| 4220 |
</g>
|
| 4221 |
</g>
|
| 4222 |
<defs>
|
| 4223 |
+
<clipPath id="pef1bcf59f7">
|
| 4224 |
+
<rect x="57.26" y="26.88" width="788.506818" height="441.437269" />
|
| 4225 |
</clipPath>
|
| 4226 |
</defs>
|
| 4227 |
</svg>
|
|
|
|
| 4234 |
<span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
|
| 4235 |
<span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4236 |
</span> |
|
| 4237 |
+
Cell: combine | 4.45s
|
| 4238 |
| <button class="run-btn" onclick="runCell('combine')">▶ run</button>
|
| 4239 |
<button class="copy-btn" onclick="copyCell('combine')">Copy</button>
|
| 4240 |
<a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 4323 |
COMBINED BENCHMARK SUMMARY
|
| 4324 |
|
| 4325 |
impl wl p50(ms) ok
|
| 4326 |
+
binned_torch cuda_B1_S1024_E2 367.98 True
|
| 4327 |
+
binned_torch cuda_B1_S1024_E4 396.30 True
|
| 4328 |
+
binned_torch cuda_B1_S512_E2 154.35 True
|
| 4329 |
+
binned_torch cuda_B1_S512_E4 195.55 True
|
| 4330 |
+
binned_torch cuda_B4_S1024_E2 1510.09 True
|
| 4331 |
+
binned_torch cuda_B4_S1024_E4 1618.05 True
|
| 4332 |
+
binned_torch cuda_B4_S512_E2 733.47 True
|
| 4333 |
+
binned_torch cuda_B4_S512_E4 787.61 True
|
| 4334 |
+
gpt_oss_experts cuda_B1_S1024_E2 3.87 True
|
| 4335 |
+
gpt_oss_experts cuda_B1_S1024_E4 5.34 True
|
| 4336 |
+
gpt_oss_experts cuda_B1_S512_E2 2.66 True
|
| 4337 |
+
gpt_oss_experts cuda_B1_S512_E4 3.95 True
|
| 4338 |
+
gpt_oss_experts cuda_B4_S1024_E2 13.39 True
|
| 4339 |
+
gpt_oss_experts cuda_B4_S1024_E4 13.41 True
|
| 4340 |
+
gpt_oss_experts cuda_B4_S512_E2 6.80 True
|
| 4341 |
+
gpt_oss_experts cuda_B4_S512_E4 7.53 True
|
| 4342 |
|
| 4343 |
GENERATING COMBINED VISUALIZATION
|
| 4344 |
|
|
|
|
| 4358 |
<div class="uv-install-logs" id="uv-logs-combine">
|
| 4359 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4360 |
<div class="uv-logs-content" style="display: none;">
|
| 4361 |
+
Installed 37 packages in 266ms
|
| 4362 |
</div>
|
| 4363 |
</div>
|
| 4364 |
<div class="cell-artifacts">
|
|
|
|
| 4371 |
<rdf:RDF>
|
| 4372 |
<ns2:Work>
|
| 4373 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 4374 |
+
<dc:date>2025-12-19T23:02:40.893386</dc:date>
|
| 4375 |
<dc:format>image/svg+xml</dc:format>
|
| 4376 |
<dc:creator>
|
| 4377 |
<ns2:Agent>
|
|
|
|
| 4390 |
</g>
|
| 4391 |
<g id="axes--1" class="axes">
|
| 4392 |
<g id="patch_2">
|
| 4393 |
+
<path d="M 57.26 468.317269 L 845.766818 468.317269 L 845.766818 26.88 L 57.26 26.88 L 57.26 468.317269 z " style="fill: none" />
|
| 4394 |
</g>
|
| 4395 |
<g id="matplotlib.axis_1">
|
| 4396 |
<g id="xtick_1">
|
| 4397 |
<g id="grid-x--1" class="grid grid-x">
|
| 4398 |
+
<path d="M 93.101219 468.317269 L 93.101219 26.88 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4399 |
</g>
|
| 4400 |
<g id="line2d_1">
|
| 4401 |
<defs>
|
| 4402 |
<path id="mafb3703e5b" d="M 0 0 L 0 3.5 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4403 |
</defs>
|
| 4404 |
<g>
|
| 4405 |
+
<use ns4:href="#mafb3703e5b" x="93.101219" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
|
| 4406 |
</g>
|
| 4407 |
</g>
|
| 4408 |
<g id="text_1">
|
| 4409 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(62.90334 544.791615) rotate(-45)">cuda_B1_S512_E2</text>
|
| 4410 |
</g>
|
| 4411 |
</g>
|
| 4412 |
<g id="xtick_2">
|
| 4413 |
<g id="grid-x--2" class="grid grid-x">
|
| 4414 |
+
<path d="M 195.504702 468.317269 L 195.504702 26.88 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4415 |
</g>
|
| 4416 |
<g id="line2d_2">
|
| 4417 |
<g>
|
| 4418 |
+
<use ns4:href="#mafb3703e5b" x="195.504702" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
|
| 4419 |
</g>
|
| 4420 |
</g>
|
| 4421 |
<g id="text_2">
|
| 4422 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(165.306823 544.791615) rotate(-45)">cuda_B1_S512_E4</text>
|
| 4423 |
</g>
|
| 4424 |
</g>
|
| 4425 |
<g id="xtick_3">
|
| 4426 |
<g id="grid-x--3" class="grid grid-x">
|
| 4427 |
+
<path d="M 297.908185 468.317269 L 297.908185 26.88 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4428 |
</g>
|
| 4429 |
<g id="line2d_3">
|
| 4430 |
<g>
|
| 4431 |
+
<use ns4:href="#mafb3703e5b" x="297.908185" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
|
| 4432 |
</g>
|
| 4433 |
</g>
|
| 4434 |
<g id="text_3">
|
| 4435 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(265.460822 549.290582) rotate(-45)">cuda_B1_S1024_E2</text>
|
| 4436 |
</g>
|
| 4437 |
</g>
|
| 4438 |
<g id="xtick_4">
|
| 4439 |
<g id="grid-x--4" class="grid grid-x">
|
| 4440 |
+
<path d="M 400.311668 468.317269 L 400.311668 26.88 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4441 |
</g>
|
| 4442 |
<g id="line2d_4">
|
| 4443 |
<g>
|
| 4444 |
+
<use ns4:href="#mafb3703e5b" x="400.311668" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
|
| 4445 |
</g>
|
| 4446 |
</g>
|
| 4447 |
<g id="text_4">
|
| 4448 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(367.864305 549.290582) rotate(-45)">cuda_B1_S1024_E4</text>
|
| 4449 |
</g>
|
| 4450 |
</g>
|
| 4451 |
<g id="xtick_5">
|
| 4452 |
<g id="grid-x--5" class="grid grid-x">
|
| 4453 |
+
<path d="M 502.71515 468.317269 L 502.71515 26.88 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4454 |
</g>
|
| 4455 |
<g id="line2d_5">
|
| 4456 |
<g>
|
| 4457 |
+
<use ns4:href="#mafb3703e5b" x="502.71515" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
|
| 4458 |
</g>
|
| 4459 |
</g>
|
| 4460 |
<g id="text_5">
|
| 4461 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(472.517271 544.791615) rotate(-45)">cuda_B4_S512_E2</text>
|
| 4462 |
</g>
|
| 4463 |
</g>
|
| 4464 |
<g id="xtick_6">
|
| 4465 |
<g id="grid-x--6" class="grid grid-x">
|
| 4466 |
+
<path d="M 605.118633 468.317269 L 605.118633 26.88 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4467 |
</g>
|
| 4468 |
<g id="line2d_6">
|
| 4469 |
<g>
|
| 4470 |
+
<use ns4:href="#mafb3703e5b" x="605.118633" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
|
| 4471 |
</g>
|
| 4472 |
</g>
|
| 4473 |
<g id="text_6">
|
| 4474 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(574.920754 544.791615) rotate(-45)">cuda_B4_S512_E4</text>
|
| 4475 |
</g>
|
| 4476 |
</g>
|
| 4477 |
<g id="xtick_7">
|
| 4478 |
<g id="grid-x--7" class="grid grid-x">
|
| 4479 |
+
<path d="M 707.522116 468.317269 L 707.522116 26.88 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4480 |
</g>
|
| 4481 |
<g id="line2d_7">
|
| 4482 |
<g>
|
| 4483 |
+
<use ns4:href="#mafb3703e5b" x="707.522116" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
|
| 4484 |
</g>
|
| 4485 |
</g>
|
| 4486 |
<g id="text_7">
|
| 4487 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(675.074754 549.290582) rotate(-45)">cuda_B4_S1024_E2</text>
|
| 4488 |
</g>
|
| 4489 |
</g>
|
| 4490 |
<g id="xtick_8">
|
| 4491 |
<g id="grid-x--8" class="grid grid-x">
|
| 4492 |
+
<path d="M 809.925599 468.317269 L 809.925599 26.88 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4493 |
</g>
|
| 4494 |
<g id="line2d_8">
|
| 4495 |
<g>
|
| 4496 |
+
<use ns4:href="#mafb3703e5b" x="809.925599" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
|
| 4497 |
</g>
|
| 4498 |
</g>
|
| 4499 |
<g id="text_8">
|
| 4500 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(777.478237 549.290582) rotate(-45)">cuda_B4_S1024_E4</text>
|
| 4501 |
</g>
|
| 4502 |
</g>
|
| 4503 |
<g id="label--x" class="xlabel">
|
| 4504 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="451.513409" y="562.556245" transform="rotate(-0 451.513409 562.556245)">Workload</text>
|
| 4505 |
</g>
|
| 4506 |
</g>
|
| 4507 |
<g id="matplotlib.axis_2">
|
| 4508 |
<g id="ytick_1">
|
| 4509 |
<g id="grid-y--2" class="grid grid-y">
|
| 4510 |
+
<path d="M 57.26 448.91253 L 845.766818 448.91253 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4511 |
</g>
|
| 4512 |
<g id="line2d_9">
|
| 4513 |
<defs>
|
| 4514 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4515 |
</defs>
|
| 4516 |
<g>
|
| 4517 |
+
<use ns4:href="#m0fca2865ba" x="57.26" y="448.91253" style="stroke: #000000; stroke-width: 0.8" />
|
| 4518 |
</g>
|
| 4519 |
</g>
|
| 4520 |
<g id="text_9">
|
| 4521 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="452.711749" transform="rotate(-0 50.26 452.711749)">0</text>
|
| 4522 |
</g>
|
| 4523 |
</g>
|
| 4524 |
<g id="ytick_2">
|
| 4525 |
<g id="grid-y--3" class="grid grid-y">
|
| 4526 |
+
<path d="M 57.26 399.227119 L 845.766818 399.227119 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4527 |
</g>
|
| 4528 |
<g id="line2d_10">
|
| 4529 |
<g>
|
| 4530 |
+
<use ns4:href="#m0fca2865ba" x="57.26" y="399.227119" style="stroke: #000000; stroke-width: 0.8" />
|
| 4531 |
</g>
|
| 4532 |
</g>
|
| 4533 |
<g id="text_10">
|
| 4534 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="403.026338" transform="rotate(-0 50.26 403.026338)">200</text>
|
| 4535 |
</g>
|
| 4536 |
</g>
|
| 4537 |
<g id="ytick_3">
|
| 4538 |
<g id="grid-y--4" class="grid grid-y">
|
| 4539 |
+
<path d="M 57.26 349.541708 L 845.766818 349.541708 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4540 |
</g>
|
| 4541 |
<g id="line2d_11">
|
| 4542 |
<g>
|
| 4543 |
+
<use ns4:href="#m0fca2865ba" x="57.26" y="349.541708" style="stroke: #000000; stroke-width: 0.8" />
|
| 4544 |
</g>
|
| 4545 |
</g>
|
| 4546 |
<g id="text_11">
|
| 4547 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="353.340926" transform="rotate(-0 50.26 353.340926)">400</text>
|
| 4548 |
</g>
|
| 4549 |
</g>
|
| 4550 |
<g id="ytick_4">
|
| 4551 |
<g id="grid-y--5" class="grid grid-y">
|
| 4552 |
+
<path d="M 57.26 299.856296 L 845.766818 299.856296 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4553 |
</g>
|
| 4554 |
<g id="line2d_12">
|
| 4555 |
<g>
|
| 4556 |
+
<use ns4:href="#m0fca2865ba" x="57.26" y="299.856296" style="stroke: #000000; stroke-width: 0.8" />
|
| 4557 |
</g>
|
| 4558 |
</g>
|
| 4559 |
<g id="text_12">
|
| 4560 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="303.655515" transform="rotate(-0 50.26 303.655515)">600</text>
|
| 4561 |
</g>
|
| 4562 |
</g>
|
| 4563 |
<g id="ytick_5">
|
| 4564 |
<g id="grid-y--6" class="grid grid-y">
|
| 4565 |
+
<path d="M 57.26 250.170885 L 845.766818 250.170885 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4566 |
</g>
|
| 4567 |
<g id="line2d_13">
|
| 4568 |
<g>
|
| 4569 |
+
<use ns4:href="#m0fca2865ba" x="57.26" y="250.170885" style="stroke: #000000; stroke-width: 0.8" />
|
| 4570 |
</g>
|
| 4571 |
</g>
|
| 4572 |
<g id="text_13">
|
| 4573 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="253.970104" transform="rotate(-0 50.26 253.970104)">800</text>
|
| 4574 |
</g>
|
| 4575 |
</g>
|
| 4576 |
<g id="ytick_6">
|
| 4577 |
<g id="grid-y--7" class="grid grid-y">
|
| 4578 |
+
<path d="M 57.26 200.485474 L 845.766818 200.485474 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4579 |
</g>
|
| 4580 |
<g id="line2d_14">
|
| 4581 |
<g>
|
| 4582 |
+
<use ns4:href="#m0fca2865ba" x="57.26" y="200.485474" style="stroke: #000000; stroke-width: 0.8" />
|
| 4583 |
</g>
|
| 4584 |
</g>
|
| 4585 |
<g id="text_14">
|
| 4586 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="204.284693" transform="rotate(-0 50.26 204.284693)">1000</text>
|
| 4587 |
</g>
|
| 4588 |
</g>
|
| 4589 |
<g id="ytick_7">
|
| 4590 |
<g id="grid-y--8" class="grid grid-y">
|
| 4591 |
+
<path d="M 57.26 150.800062 L 845.766818 150.800062 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4592 |
</g>
|
| 4593 |
<g id="line2d_15">
|
| 4594 |
<g>
|
| 4595 |
+
<use ns4:href="#m0fca2865ba" x="57.26" y="150.800062" style="stroke: #000000; stroke-width: 0.8" />
|
| 4596 |
</g>
|
| 4597 |
</g>
|
| 4598 |
<g id="text_15">
|
| 4599 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="154.599281" transform="rotate(-0 50.26 154.599281)">1200</text>
|
| 4600 |
+
</g>
|
| 4601 |
+
</g>
|
| 4602 |
+
<g id="ytick_8">
|
| 4603 |
+
<g id="grid-y--9" class="grid grid-y">
|
| 4604 |
+
<path d="M 57.26 101.114651 L 845.766818 101.114651 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4605 |
+
</g>
|
| 4606 |
+
<g id="line2d_16">
|
| 4607 |
+
<g>
|
| 4608 |
+
<use ns4:href="#m0fca2865ba" x="57.26" y="101.114651" style="stroke: #000000; stroke-width: 0.8" />
|
| 4609 |
+
</g>
|
| 4610 |
+
</g>
|
| 4611 |
+
<g id="text_16">
|
| 4612 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="104.91387" transform="rotate(-0 50.26 104.91387)">1400</text>
|
| 4613 |
+
</g>
|
| 4614 |
+
</g>
|
| 4615 |
+
<g id="ytick_9">
|
| 4616 |
+
<g id="grid-y--10" class="grid grid-y">
|
| 4617 |
+
<path d="M 57.26 51.42924 L 845.766818 51.42924 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4618 |
+
</g>
|
| 4619 |
+
<g id="line2d_17">
|
| 4620 |
+
<g>
|
| 4621 |
+
<use ns4:href="#m0fca2865ba" x="57.26" y="51.42924" style="stroke: #000000; stroke-width: 0.8" />
|
| 4622 |
+
</g>
|
| 4623 |
+
</g>
|
| 4624 |
+
<g id="text_17">
|
| 4625 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="55.228459" transform="rotate(-0 50.26 55.228459)">1600</text>
|
| 4626 |
</g>
|
| 4627 |
</g>
|
| 4628 |
<g id="label--y" class="ylabel">
|
| 4629 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="18.730313" y="247.598635" transform="rotate(-90 18.730313 247.598635)">Latency P50 (ms)</text>
|
| 4630 |
</g>
|
| 4631 |
</g>
|
| 4632 |
<g id="series--binned-torch" class="series">
|
| 4633 |
+
<path d="M 93.101219 410.567585 L 195.504702 400.332462 L 297.908185 357.497295 L 400.311668 350.461107 L 502.71515 266.697997 L 605.118633 253.249871 L 707.522116 73.76468 L 809.925599 46.94533 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 4634 |
<defs>
|
| 4635 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4636 |
</defs>
|
| 4637 |
+
<g clip-path="url(#pef1bcf59f7)">
|
| 4638 |
+
<use ns4:href="#md7efaf3aec" x="93.101219" y="410.567585" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4639 |
+
<use ns4:href="#md7efaf3aec" x="195.504702" y="400.332462" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4640 |
+
<use ns4:href="#md7efaf3aec" x="297.908185" y="357.497295" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4641 |
+
<use ns4:href="#md7efaf3aec" x="400.311668" y="350.461107" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4642 |
+
<use ns4:href="#md7efaf3aec" x="502.71515" y="266.697997" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4643 |
+
<use ns4:href="#md7efaf3aec" x="605.118633" y="253.249871" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4644 |
+
<use ns4:href="#md7efaf3aec" x="707.522116" y="73.76468" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4645 |
+
<use ns4:href="#md7efaf3aec" x="809.925599" y="46.94533" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4646 |
</g>
|
| 4647 |
</g>
|
| 4648 |
<g id="series--gpt-oss-experts" class="series">
|
| 4649 |
+
<path d="M 93.101219 448.251939 L 195.504702 447.930293 L 297.908185 447.951398 L 400.311668 447.585894 L 502.71515 447.222062 L 605.118633 447.041869 L 707.522116 445.587165 L 809.925599 445.581303 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 4650 |
<defs>
|
| 4651 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4652 |
</defs>
|
| 4653 |
+
<g clip-path="url(#pef1bcf59f7)">
|
| 4654 |
+
<use ns4:href="#m9b8c54d372" x="93.101219" y="448.251939" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4655 |
+
<use ns4:href="#m9b8c54d372" x="195.504702" y="447.930293" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4656 |
+
<use ns4:href="#m9b8c54d372" x="297.908185" y="447.951398" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4657 |
+
<use ns4:href="#m9b8c54d372" x="400.311668" y="447.585894" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4658 |
+
<use ns4:href="#m9b8c54d372" x="502.71515" y="447.222062" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4659 |
+
<use ns4:href="#m9b8c54d372" x="605.118633" y="447.041869" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4660 |
+
<use ns4:href="#m9b8c54d372" x="707.522116" y="445.587165" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4661 |
+
<use ns4:href="#m9b8c54d372" x="809.925599" y="445.581303" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4662 |
</g>
|
| 4663 |
</g>
|
| 4664 |
<g id="patch_3">
|
| 4665 |
+
<path d="M 57.26 468.317269 L 57.26 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
|
| 4666 |
</g>
|
| 4667 |
<g id="patch_4">
|
| 4668 |
<path d="M 845.766818 468.317269 L 845.766818 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
|
| 4669 |
</g>
|
| 4670 |
<g id="patch_5">
|
| 4671 |
+
<path d="M 57.26 468.317269 L 845.766818 468.317269 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
|
| 4672 |
</g>
|
| 4673 |
<g id="patch_6">
|
| 4674 |
+
<path d="M 57.26 26.88 L 845.766818 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
|
| 4675 |
</g>
|
| 4676 |
+
<g id="text_18">
|
| 4677 |
+
<text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="451.513409" y="20.88" transform="rotate(-0 451.513409 20.88)">Attention Implementation Latency</text>
|
| 4678 |
</g>
|
| 4679 |
<g id="legend" class="legend">
|
| 4680 |
<g id="patch_7">
|
| 4681 |
+
<path d="M 64.26 64.7925 L 177.05375 64.7925 Q 179.05375 64.7925 179.05375 62.7925 L 179.05375 33.88 Q 179.05375 31.88 177.05375 31.88 L 64.26 31.88 Q 62.26 31.88 62.26 33.88 L 62.26 62.7925 Q 62.26 64.7925 64.26 64.7925 L 64.26 64.7925 z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
|
| 4682 |
</g>
|
| 4683 |
+
<g id="line2d_18">
|
| 4684 |
+
<path d="M 66.26 39.978438 L 76.26 39.978438 L 86.26 39.978438 " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 4685 |
<g>
|
| 4686 |
+
<use ns4:href="#md7efaf3aec" x="76.26" y="39.978438" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4687 |
</g>
|
| 4688 |
</g>
|
| 4689 |
<g id="legend-label--binned-torch" class="legend">
|
| 4690 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="94.26" y="43.478438" transform="rotate(-0 94.26 43.478438)">binned_torch</text>
|
| 4691 |
</g>
|
| 4692 |
+
<g id="line2d_19">
|
| 4693 |
+
<path d="M 66.26 54.934687 L 76.26 54.934687 L 86.26 54.934687 " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 4694 |
<g>
|
| 4695 |
+
<use ns4:href="#m9b8c54d372" x="76.26" y="54.934687" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4696 |
</g>
|
| 4697 |
</g>
|
| 4698 |
<g id="legend-label--gpt-oss-experts" class="legend">
|
| 4699 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="94.26" y="58.434687" transform="rotate(-0 94.26 58.434687)">gpt_oss_experts</text>
|
| 4700 |
</g>
|
| 4701 |
</g>
|
| 4702 |
</g>
|
| 4703 |
</g>
|
| 4704 |
<defs>
|
| 4705 |
+
<clipPath id="pef1bcf59f7">
|
| 4706 |
+
<rect x="57.26" y="26.88" width="788.506818" height="441.437269" />
|
| 4707 |
</clipPath>
|
| 4708 |
</defs>
|
| 4709 |
</svg>
|
rotary/impls/artifacts/benchmark/rotary.jsonl
CHANGED
|
@@ -1,24 +1,24 @@
|
|
| 1 |
-
{"ts": "2025-12-
|
| 2 |
-
{"ts": "2025-12-
|
| 3 |
-
{"ts": "2025-12-
|
| 4 |
-
{"ts": "2025-12-
|
| 5 |
-
{"ts": "2025-12-
|
| 6 |
-
{"ts": "2025-12-
|
| 7 |
-
{"ts": "2025-12-
|
| 8 |
-
{"ts": "2025-12-
|
| 9 |
-
{"ts": "2025-12-
|
| 10 |
-
{"ts": "2025-12-
|
| 11 |
-
{"ts": "2025-12-
|
| 12 |
-
{"ts": "2025-12-
|
| 13 |
-
{"ts": "2025-12-
|
| 14 |
-
{"ts": "2025-12-
|
| 15 |
-
{"ts": "2025-12-
|
| 16 |
-
{"ts": "2025-12-
|
| 17 |
-
{"ts": "2025-12-
|
| 18 |
-
{"ts": "2025-12-
|
| 19 |
-
{"ts": "2025-12-
|
| 20 |
-
{"ts": "2025-12-
|
| 21 |
-
{"ts": "2025-12-
|
| 22 |
-
{"ts": "2025-12-
|
| 23 |
-
{"ts": "2025-12-
|
| 24 |
-
{"ts": "2025-12-
|
|
|
|
| 1 |
+
{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H8_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0716920001195831, "p50": 0.07250199996633455, "p90": 0.07283199988705746, "mean": 0.07240760000968294, "iqr": 0.0009109999155043624, "raw_times": [0.07250199996633455, 0.07283199988705746, 0.0716920001195831, 0.0719209999715531, 0.07309100010388647], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08089199991445639, "peak_bytes": 3178496, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.590452924915553e-08, "mae_k": 1.5487040982975486e-08, "mse_q": 2.5241010080938753e-15, "mse_k": 2.364223539299626e-15, "ref": "rotary_torch"}, "err": null}
|
| 2 |
+
{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H8_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08928200008995191, "p50": 0.08997100007945846, "p90": 0.0909519999368058, "mean": 0.09690580000096816, "iqr": 0.001269999984288006, "raw_times": [0.08928200008995191, 0.08997100007945846, 0.0896819999525178, 0.0909519999368058, 0.12464199994610681], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.10248300009152445, "peak_bytes": 6356992, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5508486939097565e-08, "mae_k": 1.567566698668088e-08, "mse_q": 2.3630110116356316e-15, "mse_k": 2.416562128626943e-15, "ref": "rotary_torch"}, "err": null}
|
| 3 |
+
{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H32_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08800199998404423, "p50": 0.08953199994721217, "p90": 0.09005199990497204, "mean": 0.08934599995882309, "iqr": 0.0015899997833912494, "raw_times": [0.08800199998404423, 0.0906819998363062, 0.08953199994721217, 0.08846200012158079, 0.09005199990497204], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08988199988380075, "peak_bytes": 12615680, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5856898016863852e-08, "mae_k": 1.572981211950264e-08, "mse_q": 2.4771055025978386e-15, "mse_k": 2.4544071371937915e-15, "ref": "rotary_torch"}, "err": null}
|
| 4 |
+
{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H32_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08559200000490819, "p50": 0.08698200008439017, "p90": 0.08820199991532718, "mean": 0.08761399999457353, "iqr": 0.0018399998680251883, "raw_times": [0.08820199991532718, 0.08698200008439017, 0.09093199992094014, 0.08559200000490819, 0.08636200004730199], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09196199994221388, "peak_bytes": 25231360, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5617658277733426e-08, "mae_k": 1.5788685914230882e-08, "mse_q": 2.4549424620164562e-15, "mse_k": 2.492823469483563e-15, "ref": "rotary_torch"}, "err": null}
|
| 5 |
+
{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H8_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0869620000685245, "p50": 0.08914199997889227, "p90": 0.08982199983620376, "mean": 0.08910199994716095, "iqr": 0.0019899998733308166, "raw_times": [0.08783199996287294, 0.08914199997889227, 0.08982199983620376, 0.0869620000685245, 0.09175199988931126], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08990199989966641, "peak_bytes": 12779520, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5962712041073246e-08, "mae_k": 1.5743363945830424e-08, "mse_q": 2.534145124782417e-15, "mse_k": 2.451281585618423e-15, "ref": "rotary_torch"}, "err": null}
|
| 6 |
+
{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H8_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08547199990971421, "p50": 0.08752200005801569, "p90": 0.08800199998404423, "mean": 0.08713399993212079, "iqr": 0.002420000100755715, "raw_times": [0.08558199988328852, 0.08909199982554128, 0.08752200005801569, 0.08547199990971421, 0.08800199998404423], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0911110000743065, "peak_bytes": 25427968, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.578730035589615e-08, "mae_k": 1.5859711766097462e-08, "mse_q": 2.440287521479536e-15, "mse_k": 2.477901290051784e-15, "ref": "rotary_torch"}, "err": null}
|
| 7 |
+
{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H32_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08664199981467391, "p50": 0.0881319999734842, "p90": 0.08973199987849512, "mean": 0.08822799991321517, "iqr": 0.0029299999368959107, "raw_times": [0.08664199981467391, 0.08983199995782343, 0.0881319999734842, 0.0868019999415992, 0.08973199987849512], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08937200004766055, "peak_bytes": 50462720, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5775295736375483e-08, "mae_k": 1.5847881229547056e-08, "mse_q": 2.471039476146077e-15, "mse_k": 2.472378635235686e-15, "ref": "rotary_torch"}, "err": null}
|
| 8 |
+
{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H32_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08743200010030705, "p50": 0.08830199999465549, "p90": 0.08903200000531797, "mean": 0.08846400005495525, "iqr": 0.0009099999260797631, "raw_times": [0.08830199999465549, 0.08903200000531797, 0.08943200009525754, 0.0881220000792382, 0.08743200010030705], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09105200001613412, "peak_bytes": 100925440, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5959869870130206e-08, "mae_k": 1.588083975434529e-08, "mse_q": 2.510663677418633e-15, "mse_k": 2.502786271009168e-15, "ref": "rotary_torch"}, "err": null}
|
| 9 |
+
{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H8_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08645200000501063, "p50": 0.08729199998924742, "p90": 0.08832200001052115, "mean": 0.08765380002841994, "iqr": 0.0011610000001383014, "raw_times": [0.08645200000501063, 0.08904200012693764, 0.08832200001052115, 0.08716100001038285, 0.08729199998924742], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08923200016397459, "peak_bytes": 51118080, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5894533689220225e-08, "mae_k": 1.5873395042831362e-08, "mse_q": 2.5093181655819197e-15, "mse_k": 2.488611809911578e-15, "ref": "rotary_torch"}, "err": null}
|
| 10 |
+
{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H8_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08737200005271006, "p50": 0.08886199998414668, "p90": 0.08923200016397459, "mean": 0.08870620004017837, "iqr": 0.0006990001111262245, "raw_times": [0.08737200005271006, 0.08953199994721217, 0.08886199998414668, 0.08853300005284837, 0.08923200016397459], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08981199994195777, "peak_bytes": 101711872, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5936768349433805e-08, "mae_k": 1.5960043953100467e-08, "mse_q": 2.51039008577667e-15, "mse_k": 2.5111253103748867e-15, "ref": "rotary_torch"}, "err": null}
|
| 11 |
+
{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H32_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08698100009496557, "p50": 0.0875120001637697, "p90": 0.08985099998426449, "mean": 0.08861960000103863, "iqr": 0.0027790001695393585, "raw_times": [0.0875120001637697, 0.08985099998426449, 0.08698100009496557, 0.09168199994746828, 0.08707199981472513], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09252200015907874, "peak_bytes": 201850880, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 1.9073486328125e-06, "absmax_k": 9.5367431640625e-07, "mae_q": 1.586510300910504e-08, "mae_k": 1.5813935050346117e-08, "mse_q": 2.499836478770355e-15, "mse_k": 2.4755639026338358e-15, "ref": "rotary_torch"}, "err": null}
|
| 12 |
+
{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H32_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2610050000839692, "p50": 0.261626000110482, "p90": 0.26182599981439125, "mean": 0.261735599997337, "iqr": 0.0002109998149535386, "raw_times": [0.2616149999994377, 0.2610050000839692, 0.261626000110482, 0.26182599981439125, 0.2626059999784047], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.26071599995702854, "peak_bytes": 403701760, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.581049247079136e-08, "mae_k": 1.5861061797295406e-08, "mse_q": 2.4735094242202705e-15, "mse_k": 2.486832828964107e-15, "ref": "rotary_torch"}, "err": null}
|
| 13 |
+
{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H8_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08662100003675732, "p50": 0.08793199981482758, "p90": 0.08872200010046072, "mean": 0.08787560000200756, "iqr": 0.0014409999948838959, "raw_times": [0.08662100003675732, 0.08872200010046072, 0.08882199995241535, 0.08793199981482758, 0.08728100010557682], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0905719998627319, "peak_bytes": 137396224, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5824980437173508e-08, "mae_k": 1.5615324144846454e-08, "mse_q": 2.488090249374306e-15, "mse_k": 2.425079044911585e-15, "ref": "rotary_torch"}, "err": null}
|
| 14 |
+
{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H8_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08488200001011137, "p50": 0.08702200011612149, "p90": 0.08860200000526675, "mean": 0.08870799997566792, "iqr": 0.002460000132487039, "raw_times": [0.08488200001011137, 0.08702200011612149, 0.08860200000526675, 0.0861419998727797, 0.09689199987406028], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09050199992088892, "peak_bytes": 12648448, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5683587761827766e-08, "mae_k": 1.574532682013796e-08, "mse_q": 2.4310271220254415e-15, "mse_k": 2.4601385856313877e-15, "ref": "rotary_torch"}, "err": null}
|
| 15 |
+
{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H32_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08632200001557067, "p50": 0.08908199993129529, "p90": 0.08911199984140694, "mean": 0.08852599994497723, "iqr": 0.0001399998836859595, "raw_times": [0.08632200001557067, 0.08914199997889227, 0.08911199984140694, 0.08908199993129529, 0.08897199995772098], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09170199996333395, "peak_bytes": 25198592, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5835009747888762e-08, "mae_k": 1.572560215379326e-08, "mse_q": 2.478222950813504e-15, "mse_k": 2.4541699679685603e-15, "ref": "rotary_torch"}, "err": null}
|
| 16 |
+
{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H32_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08825200006867817, "p50": 0.08947200012698886, "p90": 0.08949199991548085, "mean": 0.08924200005822058, "iqr": 0.00026999987312592566, "raw_times": [0.08825200006867817, 0.08977200013760012, 0.08949199991548085, 0.08947200012698886, 0.08922200004235492], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09157099998446938, "peak_bytes": 50397184, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5876850056883995e-08, "mae_k": 1.5927410501603845e-08, "mse_q": 2.504224532953606e-15, "mse_k": 2.503892919554756e-15, "ref": "rotary_torch"}, "err": null}
|
| 17 |
+
{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H8_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08679200004735321, "p50": 0.08709200005796447, "p90": 0.08838100006869354, "mean": 0.08772180003688845, "iqr": 0.0012890000107290689, "raw_times": [0.08709200005796447, 0.08679200004735321, 0.08925199995246658, 0.08709200005796447, 0.08838100006869354], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09222199992109381, "peak_bytes": 25362432, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5820052823301012e-08, "mae_k": 1.580205122309053e-08, "mse_q": 2.4876468276264184e-15, "mse_k": 2.4866062476507165e-15, "ref": "rotary_torch"}, "err": null}
|
| 18 |
+
{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H8_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08846200012158079, "p50": 0.08976200001598045, "p90": 0.09003199988910637, "mean": 0.0897020000138582, "iqr": 0.00085999977272877, "raw_times": [0.08846200012158079, 0.09003199988910637, 0.08976200001598045, 0.0891720001163776, 0.09108199992624577], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0899420001587714, "peak_bytes": 50593792, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5823172105911e-08, "mae_k": 1.582038855474366e-08, "mse_q": 2.464257071579175e-15, "mse_k": 2.4775099608301526e-15, "ref": "rotary_torch"}, "err": null}
|
| 19 |
+
{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H32_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08725199995751609, "p50": 0.08807200015326089, "p90": 0.08955199996307783, "mean": 0.08866800003488606, "iqr": 0.0017499999103165464, "raw_times": [0.08780200005276129, 0.08807200015326089, 0.08955199996307783, 0.08725199995751609, 0.09066200004781422], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.10307200000170269, "peak_bytes": 100794368, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5888783622131086e-08, "mae_k": 1.5861886026868888e-08, "mse_q": 2.4766798685418433e-15, "mse_k": 2.475923891636419e-15, "ref": "rotary_torch"}, "err": null}
|
| 20 |
+
{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H32_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0861920000261307, "p50": 0.08893199992598966, "p90": 0.08899199997358664, "mean": 0.08807199997136195, "iqr": 0.0017600000319362152, "raw_times": [0.0861920000261307, 0.08723199994165043, 0.08893199992598966, 0.0890119999894523, 0.08899199997358664], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09088199999496283, "peak_bytes": 201588736, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5826390864503992e-08, "mae_k": 1.5792682717119533e-08, "mse_q": 2.480465258783123e-15, "mse_k": 2.475580631534544e-15, "ref": "rotary_torch"}, "err": null}
|
| 21 |
+
{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H8_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08800199998404423, "p50": 0.08883200007403502, "p90": 0.08927299995775684, "mean": 0.08933019998949021, "iqr": 0.0010710000424296595, "raw_times": [0.08800199998404423, 0.08883200007403502, 0.09234200001628778, 0.08820199991532718, 0.08927299995775684], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09017200000016601, "peak_bytes": 101449728, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.592899323554775e-08, "mae_k": 1.5925031959795888e-08, "mse_q": 2.50783882253954e-15, "mse_k": 2.5015648494992274e-15, "ref": "rotary_torch"}, "err": null}
|
| 22 |
+
{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H8_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08575199990445981, "p50": 0.08742199997868738, "p90": 0.08754200007388135, "mean": 0.08730620002097567, "iqr": 0.00023999996301427018, "raw_times": [0.08754200007388135, 0.08730200011086708, 0.08575199990445981, 0.0885130000369827, 0.08742199997868738], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0908419999632315, "peak_bytes": 202375168, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.590209919299923e-08, "mae_k": 1.590130160877834e-08, "mse_q": 2.4971026799330918e-15, "mse_k": 2.506967649153289e-15, "ref": "rotary_torch"}, "err": null}
|
| 23 |
+
{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H32_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.25617599999350205, "p50": 0.2583450000201992, "p90": 0.2584750000096392, "mean": 0.2583615999810718, "iqr": 0.0005590000000665896, "raw_times": [0.2579160000095726, 0.2583450000201992, 0.25617599999350205, 0.2608959998724458, 0.2584750000096392], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2549850000832521, "peak_bytes": 403177472, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5847520629108658e-08, "mae_k": 1.5862454461057496e-08, "mse_q": 2.4917348203881045e-15, "mse_k": 2.491306009958557e-15, "ref": "rotary_torch"}, "err": null}
|
| 24 |
+
{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H32_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.8457680000901746, "p50": 0.8511980001912889, "p90": 0.8513080001648632, "mean": 0.8505584000886302, "iqr": 0.003619000153776142, "raw_times": [0.8513080001648632, 0.8457680000901746, 0.847689000011087, 0.8511980001912889, 0.856828999985737], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.8516880000115634, "peak_bytes": 806354944, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.585225106737198e-08, "mae_k": 1.581303976649906e-08, "mse_q": 2.4866460581992374e-15, "mse_k": 2.4721545950211372e-15, "ref": "rotary_torch"}, "err": null}
|
rotary/impls/hf_kernels_rotary.html
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
rotary/impls/torch_rotary.html
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
rotary/index.html
CHANGED
|
@@ -3874,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 3874 |
<div class="system-info">
|
| 3875 |
<div class="system-info-header">Generated on:</div>
|
| 3876 |
<div class="system-info-content">
|
| 3877 |
-
|
| 3878 |
</div>
|
| 3879 |
</div>
|
| 3880 |
|
|
|
|
| 3874 |
<div class="system-info">
|
| 3875 |
<div class="system-info-header">Generated on:</div>
|
| 3876 |
<div class="system-info-content">
|
| 3877 |
+
Darwin arm64 | macOS-15.7.2-arm64-arm-64bit
|
| 3878 |
</div>
|
| 3879 |
</div>
|
| 3880 |
|