Upload folder using huggingface_hub
Browse files- activation/impls/artifacts/benchmark/activation.jsonl +9 -9
- activation/impls/cells/benchmark.py +13 -7
- activation/impls/hf_kernels_swiglu.html +99 -99
- activation/impls/torch_swiglu.html +130 -124
- activation/results/artifacts/combine/latency.svg +1 -1
- activation/results/combined_results.html +79 -79
- causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl +24 -24
- causal_conv1d/impls/hf_kernels_causal_conv1d.html +0 -0
- causal_conv1d/impls/torch_causal_conv1d.html +0 -0
- causal_conv1d/results/artifacts/combine/latency.svg +2 -2
- causal_conv1d/results/combined_results.html +134 -134
- deformable_detr/impls/artifacts/benchmark/deformable_detr.jsonl +4 -4
- deformable_detr/impls/cells/benchmark.py +18 -94
- deformable_detr/impls/hf_kernels_deformable_detr.html +81 -79
- deformable_detr/impls/torch_deformable_detr.html +105 -99
- deformable_detr/results/artifacts/combine/latency.svg +2 -2
- deformable_detr/results/combined_results.html +232 -128
- flash_attn/impls/artifacts/benchmark/attention.jsonl +6 -6
- flash_attn/impls/cells/benchmark.py +9 -8
- flash_attn/impls/flash_attention.html +144 -144
- flash_attn/impls/hf_kernels_flash_attn.html +98 -101
- flash_attn/impls/hf_kernels_flash_attn3.html +89 -85
- flash_attn/impls/mem_efficient_attention.html +134 -134
- flash_attn/impls/sage_attention.html +11 -11
- flash_attn/impls/xformers.html +94 -94
- flash_attn/results/artifacts/combine/latency.svg +2 -2
- flash_attn/results/cells/combine.py +1 -0
- flash_attn/results/combined_results.html +154 -152
- index.html +1 -1
- layer_norm/impls/artifacts/benchmark/layer_norm.jsonl +4 -4
- layer_norm/impls/hf_kernels_layer_norm.html +54 -54
- layer_norm/impls/torch_layer_norm.html +56 -56
- layer_norm/results/artifacts/combine/latency.svg +2 -2
- layer_norm/results/combined_results.html +55 -55
- openai_moe/impls/artifacts/benchmark/openai_moe.jsonl +8 -8
- openai_moe/impls/binned_torch.html +186 -186
- openai_moe/impls/gpt_oss_moe.html +199 -197
- openai_moe/results/artifacts/combine/latency.svg +2 -2
- openai_moe/results/combined_results.html +191 -243
- rotary/impls/artifacts/benchmark/rotary.jsonl +24 -24
- rotary/impls/cells/benchmark.py +21 -12
- rotary/impls/hf_kernels_rotary.html +0 -0
- rotary/impls/torch_rotary.html +0 -0
- rotary/index.html +1 -1
- rotary/results/artifacts/combine/latency.svg +2 -2
- rotary/results/combined_results.html +167 -167
activation/impls/artifacts/benchmark/activation.jsonl
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
-
{"ts": "2025-
|
| 2 |
-
{"ts": "2025-
|
| 3 |
-
{"ts": "2025-
|
| 4 |
-
{"ts": "2025-
|
| 5 |
-
{"ts": "2025-
|
| 6 |
-
{"ts": "2025-
|
| 7 |
-
{"ts": "2025-
|
| 8 |
-
{"ts": "2025-
|
| 9 |
-
{"ts": "2025-
|
|
|
|
| 1 |
+
{"ts": "2025-12-19T19:09:36Z", "run": "8f147eb9175a4bb285586e9acc883123", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D768", "num_tokens": 128, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.024160000066331122, "p50": 0.024919999987105257, "p90": 0.025289999939559493, "mean": 0.025252000023101573, "iqr": 0.0006499999471998308, "raw_times": [0.025289999939559493, 0.02725000013015233, 0.024639999992359662, 0.024919999987105257, 0.024160000066331122], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.030839999908494065, "peak_bytes": 1966080, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
|
| 2 |
+
{"ts": "2025-12-19T19:09:36Z", "run": "8f147eb9175a4bb285586e9acc883123", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D1024", "num_tokens": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.027569999929255573, "p50": 0.029069999982311856, "p90": 0.029229999881863478, "mean": 0.029034399949523504, "iqr": 0.0008489998890581774, "raw_times": [0.027569999929255573, 0.030920999961381312, 0.0283809999928053, 0.029229999881863478, 0.029069999982311856], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03184999991390214, "peak_bytes": 2621440, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
|
| 3 |
+
{"ts": "2025-12-19T19:09:36Z", "run": "8f147eb9175a4bb285586e9acc883123", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D2048", "num_tokens": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0278800000614865, "p50": 0.02896099999816215, "p90": 0.029151000035199104, "mean": 0.02896060000239231, "iqr": 0.0004910000370728085, "raw_times": [0.028659999998126295, 0.03015099991898751, 0.029151000035199104, 0.02896099999816215, 0.0278800000614865], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03205000007255876, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
|
| 4 |
+
{"ts": "2025-12-19T19:09:36Z", "run": "8f147eb9175a4bb285586e9acc883123", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D768", "num_tokens": 256, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.027180999950360274, "p50": 0.028851000024587847, "p90": 0.029309999945326126, "mean": 0.02889839993258647, "iqr": 0.000470000031782547, "raw_times": [0.027180999950360274, 0.028851000024587847, 0.02883999991354358, 0.03030999982911453, 0.029309999945326126], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.030620000188719132, "peak_bytes": 3932160, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
|
| 5 |
+
{"ts": "2025-12-19T19:09:36Z", "run": "8f147eb9175a4bb285586e9acc883123", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D1024", "num_tokens": 256, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.027590999934545835, "p50": 0.028819999897677917, "p90": 0.02953100010927301, "mean": 0.02878659997804789, "iqr": 0.0017000002117129043, "raw_times": [0.027590999934545835, 0.02953100010927301, 0.027830999897560105, 0.03016000005118258, 0.028819999897677917], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.031159999934970983, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
|
| 6 |
+
{"ts": "2025-12-19T19:09:36Z", "run": "8f147eb9175a4bb285586e9acc883123", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D2048", "num_tokens": 256, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.026730000172392465, "p50": 0.028800999871236854, "p90": 0.02885999992940924, "mean": 0.028368599987516063, "iqr": 0.0005089998467155965, "raw_times": [0.026730000172392465, 0.02885999992940924, 0.02910099988184811, 0.028800999871236854, 0.028351000082693645], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.030940999977246975, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
|
| 7 |
+
{"ts": "2025-12-19T19:09:36Z", "run": "8f147eb9175a4bb285586e9acc883123", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D768", "num_tokens": 512, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02737999989221862, "p50": 0.0283800000033807, "p90": 0.02853099999811093, "mean": 0.028162599983261316, "iqr": 0.0007899998308857903, "raw_times": [0.02737999989221862, 0.0283800000033807, 0.028780999855371192, 0.02774100016722514, 0.02853099999811093], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.034010999797828845, "peak_bytes": 7864320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
|
| 8 |
+
{"ts": "2025-12-19T19:09:36Z", "run": "8f147eb9175a4bb285586e9acc883123", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D1024", "num_tokens": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02824100010911934, "p50": 0.028820000125051592, "p90": 0.02886099991883384, "mean": 0.029222400007711258, "iqr": 0.00022099993657320738, "raw_times": [0.02824100010911934, 0.02886099991883384, 0.028639999982260633, 0.03154999990329088, 0.028820000125051592], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.029901000061727245, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
|
| 9 |
+
{"ts": "2025-12-19T19:09:36Z", "run": "8f147eb9175a4bb285586e9acc883123", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D2048", "num_tokens": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02627100002428051, "p50": 0.02855000002455199, "p90": 0.02863100007743924, "mean": 0.028174600083730184, "iqr": 0.0002599999788799323, "raw_times": [0.02627100002428051, 0.028371000098559307, 0.02863100007743924, 0.02905000019381987, 0.02855000002455199], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.02980999988722033, "peak_bytes": 20971520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
|
activation/impls/cells/benchmark.py
CHANGED
|
@@ -4,6 +4,7 @@
|
|
| 4 |
# "numpy",
|
| 5 |
# "torch==2.8.0",
|
| 6 |
# "kernels-benchmark-tools",
|
|
|
|
| 7 |
# ]
|
| 8 |
#
|
| 9 |
# [tool.uv.sources]
|
|
@@ -12,17 +13,22 @@
|
|
| 12 |
import torch
|
| 13 |
import sys
|
| 14 |
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
|
| 15 |
-
|
| 16 |
|
|
|
|
|
|
|
| 17 |
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
|
|
|
|
|
|
|
|
|
| 21 |
|
| 22 |
|
| 23 |
run_benchmark(
|
| 24 |
kernel_type=KernelTypeEnum.ACTIVATION,
|
| 25 |
-
impl_name="
|
| 26 |
-
impl_tags={"family":"hf-kernels", "backend":"
|
| 27 |
-
impl_func=
|
| 28 |
)
|
|
|
|
| 4 |
# "numpy",
|
| 5 |
# "torch==2.8.0",
|
| 6 |
# "kernels-benchmark-tools",
|
| 7 |
+
# "kernels",
|
| 8 |
# ]
|
| 9 |
#
|
| 10 |
# [tool.uv.sources]
|
|
|
|
| 13 |
import torch
|
| 14 |
import sys
|
| 15 |
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
|
| 16 |
+
from kernels import get_kernel
|
| 17 |
|
| 18 |
+
# Load the activation kernel
|
| 19 |
+
activation = get_kernel("kernels-community/activation")
|
| 20 |
|
| 21 |
+
|
| 22 |
+
def hf_kernels_swiglu(input_tensor):
|
| 23 |
+
hidden_dim = input_tensor.shape[-1] // 2
|
| 24 |
+
out_shape = input_tensor.shape[:-1] + (hidden_dim,)
|
| 25 |
+
out = torch.empty(out_shape, dtype=input_tensor.dtype, device=input_tensor.device)
|
| 26 |
+
return activation.silu_and_mul(out, input_tensor)
|
| 27 |
|
| 28 |
|
| 29 |
run_benchmark(
|
| 30 |
kernel_type=KernelTypeEnum.ACTIVATION,
|
| 31 |
+
impl_name="hf_kernels_swiglu",
|
| 32 |
+
impl_tags={"family": "hf-kernels", "backend": "cuda"},
|
| 33 |
+
impl_func=hf_kernels_swiglu,
|
| 34 |
)
|
activation/impls/hf_kernels_swiglu.html
CHANGED
|
@@ -3874,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 3874 |
<div class="system-info">
|
| 3875 |
<div class="system-info-header">Generated on:</div>
|
| 3876 |
<div class="system-info-content">
|
| 3877 |
-
Linux x86_64 | Linux-6.12.
|
| 3878 |
</div>
|
| 3879 |
</div>
|
| 3880 |
|
|
@@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 3888 |
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3889 |
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3890 |
</span> |
|
| 3891 |
-
Cell: nv | 0.
|
| 3892 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3893 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3894 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3905,16 +3905,16 @@ Cell: nv | 0.22s
|
|
| 3905 |
</div>
|
| 3906 |
</div>
|
| 3907 |
<div id="output-nv" class="cell-output">
|
| 3908 |
-
<div class="cell-stdout"><pre class="stdout-text">
|
| 3909 |
+-----------------------------------------------------------------------------------------+
|
| 3910 |
-
| NVIDIA-SMI 580.
|
| 3911 |
+-----------------------------------------+------------------------+----------------------+
|
| 3912 |
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
|
| 3913 |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
|
| 3914 |
| | | MIG M. |
|
| 3915 |
|=========================================+========================+======================|
|
| 3916 |
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
|
| 3917 |
-
| N/A
|
| 3918 |
| | | N/A |
|
| 3919 |
+-----------------------------------------+------------------------+----------------------+
|
| 3920 |
|
|
@@ -3938,7 +3938,7 @@ Cell: nv | 0.22s
|
|
| 3938 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3939 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3940 |
</span> |
|
| 3941 |
-
Cell: benchmark |
|
| 3942 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3943 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3944 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3995,17 +3995,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D768
|
|
| 3995 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3996 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3997 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3998 |
-
hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 3999 |
-
hf_kernels_swiglu
|
| 4000 |
-
|
| 4001 |
-
void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.
|
| 4002 |
-
Activity Buffer Request
|
| 4003 |
-
aten::empty
|
| 4004 |
-
cudaLaunchKernel
|
| 4005 |
-
cudaDeviceSynchronize 0.
|
| 4006 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4007 |
-
Self CPU time total: 2.
|
| 4008 |
-
Self CUDA time total: 4.
|
| 4009 |
|
| 4010 |
|
| 4011 |
|
|
@@ -4015,17 +4015,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D1024
|
|
| 4015 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4016 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4017 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4018 |
-
hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4019 |
-
hf_kernels_swiglu
|
| 4020 |
-
|
| 4021 |
-
void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4022 |
-
Activity Buffer Request
|
| 4023 |
-
aten::empty 1.
|
| 4024 |
-
cudaLaunchKernel 1.
|
| 4025 |
-
cudaDeviceSynchronize 0.31%
|
| 4026 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4027 |
-
Self CPU time total: 1.
|
| 4028 |
-
Self CUDA time total: 3.
|
| 4029 |
|
| 4030 |
|
| 4031 |
|
|
@@ -4035,17 +4035,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D2048
|
|
| 4035 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4036 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4037 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4038 |
-
hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4039 |
-
hf_kernels_swiglu
|
| 4040 |
-
|
| 4041 |
-
void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.
|
| 4042 |
-
Activity Buffer Request
|
| 4043 |
-
aten::empty
|
| 4044 |
-
cudaLaunchKernel 1.
|
| 4045 |
-
cudaDeviceSynchronize 0.28% 5.
|
| 4046 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4047 |
-
Self CPU time total: 1.
|
| 4048 |
-
Self CUDA time total: 4.
|
| 4049 |
|
| 4050 |
|
| 4051 |
|
|
@@ -4055,17 +4055,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D768
|
|
| 4055 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4056 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4057 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4058 |
-
hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4059 |
-
hf_kernels_swiglu
|
| 4060 |
-
|
| 4061 |
-
void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.
|
| 4062 |
-
Activity Buffer Request 83
|
| 4063 |
-
aten::empty 0.
|
| 4064 |
-
cudaLaunchKernel 10.
|
| 4065 |
-
cudaDeviceSynchronize 0.
|
| 4066 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4067 |
-
Self CPU time total: 2.
|
| 4068 |
-
Self CUDA time total: 4.
|
| 4069 |
|
| 4070 |
|
| 4071 |
|
|
@@ -4075,17 +4075,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D1024
|
|
| 4075 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4076 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4077 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4078 |
-
hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4079 |
-
hf_kernels_swiglu
|
| 4080 |
-
|
| 4081 |
-
void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 5.
|
| 4082 |
-
Activity Buffer Request
|
| 4083 |
-
aten::empty
|
| 4084 |
-
cudaLaunchKernel
|
| 4085 |
-
cudaDeviceSynchronize 0.
|
| 4086 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4087 |
-
Self CPU time total:
|
| 4088 |
-
Self CUDA time total: 5.
|
| 4089 |
|
| 4090 |
|
| 4091 |
|
|
@@ -4095,17 +4095,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D2048
|
|
| 4095 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4096 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4097 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4098 |
-
hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4099 |
-
hf_kernels_swiglu
|
| 4100 |
-
|
| 4101 |
-
void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 7.
|
| 4102 |
-
Activity Buffer Request
|
| 4103 |
-
aten::empty
|
| 4104 |
-
cudaLaunchKernel
|
| 4105 |
-
cudaDeviceSynchronize
|
| 4106 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4107 |
-
Self CPU time total:
|
| 4108 |
-
Self CUDA time total: 7.
|
| 4109 |
|
| 4110 |
|
| 4111 |
|
|
@@ -4115,17 +4115,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D768
|
|
| 4115 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4116 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4117 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4118 |
-
hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 63.
|
| 4119 |
-
hf_kernels_swiglu
|
| 4120 |
-
|
| 4121 |
-
void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 4122 |
-
Activity Buffer Request
|
| 4123 |
-
aten::empty
|
| 4124 |
-
cudaLaunchKernel
|
| 4125 |
-
cudaDeviceSynchronize
|
| 4126 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4127 |
-
Self CPU time total:
|
| 4128 |
-
Self CUDA time total: 6.
|
| 4129 |
|
| 4130 |
|
| 4131 |
|
|
@@ -4135,17 +4135,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D1024
|
|
| 4135 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4136 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4137 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4138 |
-
hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4139 |
-
hf_kernels_swiglu
|
| 4140 |
-
|
| 4141 |
-
void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 9.
|
| 4142 |
-
Activity Buffer Request
|
| 4143 |
-
aten::empty
|
| 4144 |
-
cudaLaunchKernel
|
| 4145 |
-
cudaDeviceSynchronize 0.
|
| 4146 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4147 |
-
Self CPU time total:
|
| 4148 |
-
Self CUDA time total: 9.
|
| 4149 |
|
| 4150 |
|
| 4151 |
|
|
@@ -4155,17 +4155,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D2048
|
|
| 4155 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4156 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4157 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4158 |
-
hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4159 |
-
hf_kernels_swiglu
|
| 4160 |
-
|
| 4161 |
-
void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4162 |
-
Activity Buffer Request
|
| 4163 |
-
aten::empty
|
| 4164 |
-
cudaLaunchKernel
|
| 4165 |
-
cudaDeviceSynchronize
|
| 4166 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4167 |
-
Self CPU time total:
|
| 4168 |
-
Self CUDA time total:
|
| 4169 |
|
| 4170 |
|
| 4171 |
impl wl p50(ms) ok
|
|
@@ -4182,12 +4182,12 @@ hf_kernels_swiglu cuda_T512_D768 0.03 True
|
|
| 4182 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 4183 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4184 |
<div class="uv-logs-content" style="display: none;">
|
| 4185 |
-
Installed
|
| 4186 |
</div>
|
| 4187 |
</div>
|
| 4188 |
<div class="cell-stderr">Fetching 7 files: 0%| | 0/7 [00:00<?, ?it/s]
|
| 4189 |
-
Fetching 7 files:
|
| 4190 |
-
Fetching 7 files: 100%|██████████| 7/7 [00:00<00:00,
|
| 4191 |
<div class="cell-artifacts">
|
| 4192 |
<h4>Artifacts:</h4>
|
| 4193 |
<a href="artifacts/benchmark/activation.jsonl" class="artifact" target="_blank">activation.jsonl</a>
|
|
|
|
| 3874 |
<div class="system-info">
|
| 3875 |
<div class="system-info-header">Generated on:</div>
|
| 3876 |
<div class="system-info-content">
|
| 3877 |
+
Linux x86_64 | Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35
|
| 3878 |
</div>
|
| 3879 |
</div>
|
| 3880 |
|
|
|
|
| 3888 |
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3889 |
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3890 |
</span> |
|
| 3891 |
+
Cell: nv | 0.24s
|
| 3892 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3893 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3894 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3905 |
</div>
|
| 3906 |
</div>
|
| 3907 |
<div id="output-nv" class="cell-output">
|
| 3908 |
+
<div class="cell-stdout"><pre class="stdout-text">Fri Dec 19 18:56:03 2025
|
| 3909 |
+-----------------------------------------------------------------------------------------+
|
| 3910 |
+
| NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 |
|
| 3911 |
+-----------------------------------------+------------------------+----------------------+
|
| 3912 |
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
|
| 3913 |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
|
| 3914 |
| | | MIG M. |
|
| 3915 |
|=========================================+========================+======================|
|
| 3916 |
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
|
| 3917 |
+
| N/A 30C P0 77W / 350W | 0MiB / 46068MiB | 10% Default |
|
| 3918 |
| | | N/A |
|
| 3919 |
+-----------------------------------------+------------------------+----------------------+
|
| 3920 |
|
|
|
|
| 3938 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3939 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3940 |
</span> |
|
| 3941 |
+
Cell: benchmark | 4.62s
|
| 3942 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3943 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3944 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3995 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3996 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3997 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3998 |
+
hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 72.704us 1747.69% 72.704us 72.704us 1
|
| 3999 |
+
hf_kernels_swiglu 10.22% 211.154us 99.32% 2.053ms 2.053ms 0.000us 0.00% 5.600us 5.600us 1
|
| 4000 |
+
_activation_23bf3fb::silu_and_mul 1.00% 20.580us 87.11% 1.800ms 600.140us 4.160us 100.00% 5.600us 1.867us 3
|
| 4001 |
+
void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.160us 100.00% 4.160us 1.387us 3
|
| 4002 |
+
Activity Buffer Request 84.13% 1.739ms 84.13% 1.739ms 1.739ms 1.440us 34.62% 1.440us 1.440us 1
|
| 4003 |
+
aten::empty 1.99% 41.071us 1.99% 41.071us 13.690us 0.000us 0.00% 0.000us 0.000us 3
|
| 4004 |
+
cudaLaunchKernel 1.99% 41.111us 1.99% 41.111us 13.704us 0.000us 0.00% 0.000us 0.000us 3
|
| 4005 |
+
cudaDeviceSynchronize 0.68% 14.100us 0.68% 14.100us 14.100us 0.000us 0.00% 0.000us 0.000us 1
|
| 4006 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4007 |
+
Self CPU time total: 2.067ms
|
| 4008 |
+
Self CUDA time total: 4.160us
|
| 4009 |
|
| 4010 |
|
| 4011 |
|
|
|
|
| 4015 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4016 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4017 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4018 |
+
hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 105.278us 2696.67% 105.278us 105.278us 1
|
| 4019 |
+
hf_kernels_swiglu 7.13% 139.913us 99.69% 1.957ms 1.957ms 0.000us 0.00% 5.216us 5.216us 1
|
| 4020 |
+
_activation_23bf3fb::silu_and_mul 1.22% 23.859us 91.38% 1.794ms 598.043us 3.904us 100.00% 5.216us 1.739us 3
|
| 4021 |
+
void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 3.904us 100.00% 3.904us 1.301us 3
|
| 4022 |
+
Activity Buffer Request 88.47% 1.737ms 88.47% 1.737ms 1.737ms 1.312us 33.61% 1.312us 1.312us 1
|
| 4023 |
+
aten::empty 1.19% 23.420us 1.19% 23.420us 7.807us 0.000us 0.00% 0.000us 0.000us 3
|
| 4024 |
+
cudaLaunchKernel 1.70% 33.281us 1.70% 33.281us 11.094us 0.000us 0.00% 0.000us 0.000us 3
|
| 4025 |
+
cudaDeviceSynchronize 0.31% 6.000us 0.31% 6.000us 6.000us 0.000us 0.00% 0.000us 0.000us 1
|
| 4026 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4027 |
+
Self CPU time total: 1.963ms
|
| 4028 |
+
Self CUDA time total: 3.904us
|
| 4029 |
|
| 4030 |
|
| 4031 |
|
|
|
|
| 4035 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4036 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4037 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4038 |
+
hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 62.849us 1275.09% 62.849us 62.849us 1
|
| 4039 |
+
hf_kernels_swiglu 5.51% 105.232us 99.72% 1.903ms 1.903ms 0.000us 0.00% 6.594us 6.594us 1
|
| 4040 |
+
_activation_23bf3fb::silu_and_mul 1.04% 19.839us 93.23% 1.779ms 593.100us 4.929us 100.00% 6.594us 2.198us 3
|
| 4041 |
+
void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.929us 100.00% 4.929us 1.643us 3
|
| 4042 |
+
Activity Buffer Request 90.86% 1.734ms 90.86% 1.734ms 1.734ms 1.665us 33.78% 1.665us 1.665us 1
|
| 4043 |
+
aten::empty 0.98% 18.730us 0.98% 18.730us 6.243us 0.000us 0.00% 0.000us 0.000us 3
|
| 4044 |
+
cudaLaunchKernel 1.33% 25.362us 1.33% 25.362us 8.454us 0.000us 0.00% 0.000us 0.000us 3
|
| 4045 |
+
cudaDeviceSynchronize 0.28% 5.330us 0.28% 5.330us 5.330us 0.000us 0.00% 0.000us 0.000us 1
|
| 4046 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4047 |
+
Self CPU time total: 1.909ms
|
| 4048 |
+
Self CUDA time total: 4.929us
|
| 4049 |
|
| 4050 |
|
| 4051 |
|
|
|
|
| 4055 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4056 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4057 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4058 |
+
hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 64.512us 1515.79% 64.512us 64.512us 1
|
| 4059 |
+
hf_kernels_swiglu 5.00% 107.783us 99.78% 2.152ms 2.152ms 0.000us 0.00% 5.696us 5.696us 1
|
| 4060 |
+
_activation_23bf3fb::silu_and_mul 0.93% 20.060us 93.90% 2.025ms 675.114us 4.256us 100.00% 5.696us 1.899us 3
|
| 4061 |
+
void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.256us 100.00% 4.256us 1.419us 3
|
| 4062 |
+
Activity Buffer Request 82.83% 1.787ms 82.83% 1.787ms 1.787ms 1.440us 33.83% 1.440us 1.440us 1
|
| 4063 |
+
aten::empty 0.89% 19.099us 0.89% 19.099us 6.366us 0.000us 0.00% 0.000us 0.000us 3
|
| 4064 |
+
cudaLaunchKernel 10.14% 218.744us 10.14% 218.744us 72.915us 0.000us 0.00% 0.000us 0.000us 3
|
| 4065 |
+
cudaDeviceSynchronize 0.22% 4.671us 0.22% 4.671us 4.671us 0.000us 0.00% 0.000us 0.000us 1
|
| 4066 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4067 |
+
Self CPU time total: 2.157ms
|
| 4068 |
+
Self CUDA time total: 4.256us
|
| 4069 |
|
| 4070 |
|
| 4071 |
|
|
|
|
| 4075 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4076 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4077 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4078 |
+
hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 60.319us 1029.86% 60.319us 60.319us 1
|
| 4079 |
+
hf_kernels_swiglu 13.59% 83.190us 99.22% 607.209us 607.209us 0.000us 0.00% 7.809us 7.809us 1
|
| 4080 |
+
_activation_23bf3fb::silu_and_mul 3.33% 20.351us 82.60% 505.509us 168.503us 5.857us 100.00% 7.809us 2.603us 3
|
| 4081 |
+
void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 5.857us 100.00% 5.857us 1.952us 3
|
| 4082 |
+
Activity Buffer Request 46.13% 282.314us 46.13% 282.314us 282.314us 1.952us 33.33% 1.952us 1.952us 1
|
| 4083 |
+
aten::empty 3.02% 18.510us 3.02% 18.510us 6.170us 0.000us 0.00% 0.000us 0.000us 3
|
| 4084 |
+
cudaLaunchKernel 33.14% 202.844us 33.14% 202.844us 67.615us 0.000us 0.00% 0.000us 0.000us 3
|
| 4085 |
+
cudaDeviceSynchronize 0.78% 4.791us 0.78% 4.791us 4.791us 0.000us 0.00% 0.000us 0.000us 1
|
| 4086 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4087 |
+
Self CPU time total: 612.000us
|
| 4088 |
+
Self CUDA time total: 5.857us
|
| 4089 |
|
| 4090 |
|
| 4091 |
|
|
|
|
| 4095 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4096 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4097 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4098 |
+
hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 69.087us 899.57% 69.087us 69.087us 1
|
| 4099 |
+
hf_kernels_swiglu 5.09% 105.021us 99.75% 2.059ms 2.059ms 0.000us 0.00% 10.240us 10.240us 1
|
| 4100 |
+
_activation_23bf3fb::silu_and_mul 0.96% 19.861us 93.70% 1.934ms 644.594us 7.680us 100.00% 10.240us 3.413us 3
|
| 4101 |
+
void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 7.680us 100.00% 7.680us 2.560us 3
|
| 4102 |
+
Activity Buffer Request 83.16% 1.716ms 83.16% 1.716ms 1.716ms 2.560us 33.33% 2.560us 2.560us 1
|
| 4103 |
+
aten::empty 0.96% 19.840us 0.96% 19.840us 6.613us 0.000us 0.00% 0.000us 0.000us 3
|
| 4104 |
+
cudaLaunchKernel 9.57% 197.533us 9.57% 197.533us 65.844us 0.000us 0.00% 0.000us 0.000us 3
|
| 4105 |
+
cudaDeviceSynchronize 0.25% 5.209us 0.25% 5.209us 5.209us 0.000us 0.00% 0.000us 0.000us 1
|
| 4106 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4107 |
+
Self CPU time total: 2.064ms
|
| 4108 |
+
Self CUDA time total: 7.680us
|
| 4109 |
|
| 4110 |
|
| 4111 |
|
|
|
|
| 4115 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4116 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4117 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4118 |
+
hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 63.615us 969.59% 63.615us 63.615us 1
|
| 4119 |
+
hf_kernels_swiglu 4.67% 99.430us 99.78% 2.123ms 2.123ms 0.000us 0.00% 8.769us 8.769us 1
|
| 4120 |
+
_activation_23bf3fb::silu_and_mul 0.94% 19.910us 94.25% 2.005ms 668.341us 6.561us 100.00% 8.769us 2.923us 3
|
| 4121 |
+
void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 6.561us 100.00% 6.561us 2.187us 3
|
| 4122 |
+
Activity Buffer Request 84.26% 1.793ms 84.26% 1.793ms 1.793ms 2.208us 33.65% 2.208us 2.208us 1
|
| 4123 |
+
aten::empty 0.86% 18.221us 0.86% 18.221us 6.074us 0.000us 0.00% 0.000us 0.000us 3
|
| 4124 |
+
cudaLaunchKernel 9.05% 192.544us 9.05% 192.544us 64.181us 0.000us 0.00% 0.000us 0.000us 3
|
| 4125 |
+
cudaDeviceSynchronize 0.22% 4.771us 0.22% 4.771us 4.771us 0.000us 0.00% 0.000us 0.000us 1
|
| 4126 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4127 |
+
Self CPU time total: 2.127ms
|
| 4128 |
+
Self CUDA time total: 6.561us
|
| 4129 |
|
| 4130 |
|
| 4131 |
|
|
|
|
| 4135 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4136 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4137 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4138 |
+
hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 58.655us 627.73% 58.655us 58.655us 1
|
| 4139 |
+
hf_kernels_swiglu 14.96% 80.683us 99.03% 533.948us 533.948us 0.000us 0.00% 12.480us 12.480us 1
|
| 4140 |
+
_activation_23bf3fb::silu_and_mul 3.95% 21.299us 80.75% 435.406us 145.135us 9.344us 100.00% 12.480us 4.160us 3
|
| 4141 |
+
void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 9.344us 100.00% 9.344us 3.115us 3
|
| 4142 |
+
Activity Buffer Request 41.04% 221.264us 41.04% 221.264us 221.264us 3.136us 33.56% 3.136us 3.136us 1
|
| 4143 |
+
aten::empty 3.31% 17.859us 3.31% 17.859us 5.953us 0.000us 0.00% 0.000us 0.000us 3
|
| 4144 |
+
cudaLaunchKernel 35.77% 192.843us 35.77% 192.843us 64.281us 0.000us 0.00% 0.000us 0.000us 3
|
| 4145 |
+
cudaDeviceSynchronize 0.97% 5.240us 0.97% 5.240us 5.240us 0.000us 0.00% 0.000us 0.000us 1
|
| 4146 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4147 |
+
Self CPU time total: 539.188us
|
| 4148 |
+
Self CUDA time total: 9.344us
|
| 4149 |
|
| 4150 |
|
| 4151 |
|
|
|
|
| 4155 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4156 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4157 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4158 |
+
hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 60.863us 469.62% 60.863us 60.863us 1
|
| 4159 |
+
hf_kernels_swiglu 16.50% 95.821us 99.18% 576.059us 576.059us 0.000us 0.00% 17.312us 17.312us 1
|
| 4160 |
+
_activation_23bf3fb::silu_and_mul 3.50% 20.301us 79.69% 462.858us 154.286us 12.960us 100.00% 17.312us 5.771us 3
|
| 4161 |
+
void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 12.960us 100.00% 12.960us 4.320us 3
|
| 4162 |
+
Activity Buffer Request 43.18% 250.794us 43.18% 250.794us 250.794us 4.352us 33.58% 4.352us 4.352us 1
|
| 4163 |
+
aten::empty 2.99% 17.380us 2.99% 17.380us 5.793us 0.000us 0.00% 0.000us 0.000us 3
|
| 4164 |
+
cudaLaunchKernel 33.01% 191.763us 33.01% 191.763us 63.921us 0.000us 0.00% 0.000us 0.000us 3
|
| 4165 |
+
cudaDeviceSynchronize 0.82% 4.790us 0.82% 4.790us 4.790us 0.000us 0.00% 0.000us 0.000us 1
|
| 4166 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4167 |
+
Self CPU time total: 580.849us
|
| 4168 |
+
Self CUDA time total: 12.960us
|
| 4169 |
|
| 4170 |
|
| 4171 |
impl wl p50(ms) ok
|
|
|
|
| 4182 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 4183 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4184 |
<div class="uv-logs-content" style="display: none;">
|
| 4185 |
+
Installed 14 packages in 12ms
|
| 4186 |
</div>
|
| 4187 |
</div>
|
| 4188 |
<div class="cell-stderr">Fetching 7 files: 0%| | 0/7 [00:00<?, ?it/s]
|
| 4189 |
+
Fetching 7 files: 14%|█▍ | 1/7 [00:00<00:02, 3.00it/s]
|
| 4190 |
+
Fetching 7 files: 100%|██████████| 7/7 [00:00<00:00, 17.15it/s]</div>
|
| 4191 |
<div class="cell-artifacts">
|
| 4192 |
<h4>Artifacts:</h4>
|
| 4193 |
<a href="artifacts/benchmark/activation.jsonl" class="artifact" target="_blank">activation.jsonl</a>
|
activation/impls/torch_swiglu.html
CHANGED
|
@@ -3874,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 3874 |
<div class="system-info">
|
| 3875 |
<div class="system-info-header">Generated on:</div>
|
| 3876 |
<div class="system-info-content">
|
| 3877 |
-
Linux x86_64 | Linux-6.12.
|
| 3878 |
</div>
|
| 3879 |
</div>
|
| 3880 |
|
|
@@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 3888 |
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3889 |
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3890 |
</span> |
|
| 3891 |
-
Cell: nv | 0.
|
| 3892 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3893 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3894 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3904,16 +3904,16 @@ Cell: nv | 0.22s
|
|
| 3904 |
</div>
|
| 3905 |
</div>
|
| 3906 |
<div id="output-nv" class="cell-output">
|
| 3907 |
-
<div class="cell-stdout"><pre class="stdout-text">
|
| 3908 |
+-----------------------------------------------------------------------------------------+
|
| 3909 |
-
| NVIDIA-SMI 580.
|
| 3910 |
+-----------------------------------------+------------------------+----------------------+
|
| 3911 |
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
|
| 3912 |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
|
| 3913 |
| | | MIG M. |
|
| 3914 |
|=========================================+========================+======================|
|
| 3915 |
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
|
| 3916 |
-
| N/A
|
| 3917 |
| | | N/A |
|
| 3918 |
+-----------------------------------------+------------------------+----------------------+
|
| 3919 |
|
|
@@ -3935,9 +3935,9 @@ Cell: nv | 0.22s
|
|
| 3935 |
<span class="collapse-indicators">
|
| 3936 |
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
|
| 3937 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3938 |
-
<span id="uv-indicator-benchmark" style="cursor:
|
| 3939 |
</span> |
|
| 3940 |
-
Cell: benchmark |
|
| 3941 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3942 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3943 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3987,20 +3987,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D768
|
|
| 3987 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3988 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3989 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3990 |
-
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 3991 |
-
torch_eager 8.
|
| 3992 |
-
aten::silu 2.
|
| 3993 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 3994 |
-
aten::mul 1.
|
| 3995 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 3996 |
-
Activity Buffer Request 81.
|
| 3997 |
-
aten::slice 1.
|
| 3998 |
-
aten::as_strided 0.44%
|
| 3999 |
-
cudaLaunchKernel
|
| 4000 |
-
cudaDeviceSynchronize 0.
|
| 4001 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4002 |
-
Self CPU time total: 2.
|
| 4003 |
-
Self CUDA time total: 12.
|
| 4004 |
|
| 4005 |
|
| 4006 |
|
|
@@ -4010,20 +4010,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D1024
|
|
| 4010 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4011 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4012 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4013 |
-
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4014 |
-
torch_eager
|
| 4015 |
-
aten::silu
|
| 4016 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 4017 |
-
aten::mul 1.
|
| 4018 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.
|
| 4019 |
-
Activity Buffer Request 86.
|
| 4020 |
-
aten::slice 1.
|
| 4021 |
-
aten::as_strided 0.27% 5.
|
| 4022 |
-
cudaLaunchKernel 2.
|
| 4023 |
-
cudaDeviceSynchronize 0.
|
| 4024 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4025 |
-
Self CPU time total:
|
| 4026 |
-
Self CUDA time total: 12.
|
| 4027 |
|
| 4028 |
|
| 4029 |
|
|
@@ -4033,20 +4033,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D2048
|
|
| 4033 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4034 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4035 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4036 |
-
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4037 |
-
torch_eager
|
| 4038 |
-
aten::silu
|
| 4039 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 4040 |
-
aten::mul 1.
|
| 4041 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 4042 |
-
Activity Buffer Request
|
| 4043 |
-
aten::slice 1.
|
| 4044 |
-
aten::as_strided 0.26% 5.
|
| 4045 |
-
cudaLaunchKernel 2.
|
| 4046 |
-
cudaDeviceSynchronize 0.
|
| 4047 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4048 |
-
Self CPU time total: 1.
|
| 4049 |
-
Self CUDA time total: 13.
|
| 4050 |
|
| 4051 |
|
| 4052 |
|
|
@@ -4056,20 +4056,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D768
|
|
| 4056 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4057 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4058 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4059 |
-
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4060 |
-
torch_eager
|
| 4061 |
-
aten::silu
|
| 4062 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 4063 |
-
aten::mul 1.
|
| 4064 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 4065 |
-
Activity Buffer Request
|
| 4066 |
-
aten::slice 1.
|
| 4067 |
-
aten::as_strided 0.
|
| 4068 |
-
cudaLaunchKernel
|
| 4069 |
-
cudaDeviceSynchronize 0.
|
| 4070 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4071 |
-
Self CPU time total:
|
| 4072 |
-
Self CUDA time total: 12.
|
| 4073 |
|
| 4074 |
|
| 4075 |
|
|
@@ -4079,20 +4079,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D1024
|
|
| 4079 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4080 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4081 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4082 |
-
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4083 |
-
torch_eager
|
| 4084 |
-
aten::silu 1.
|
| 4085 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 4086 |
-
aten::mul 1.
|
| 4087 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 4088 |
-
Activity Buffer Request 80.
|
| 4089 |
-
aten::slice 1.
|
| 4090 |
-
aten::as_strided 0.
|
| 4091 |
-
cudaLaunchKernel
|
| 4092 |
-
cudaDeviceSynchronize 0.
|
| 4093 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4094 |
-
Self CPU time total: 2.
|
| 4095 |
-
Self CUDA time total: 13.
|
| 4096 |
|
| 4097 |
|
| 4098 |
|
|
@@ -4102,20 +4102,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D2048
|
|
| 4102 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4103 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4104 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4105 |
-
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4106 |
-
torch_eager
|
| 4107 |
-
aten::silu
|
| 4108 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.
|
| 4109 |
-
aten::mul
|
| 4110 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.
|
| 4111 |
-
Activity Buffer Request
|
| 4112 |
-
aten::slice
|
| 4113 |
-
aten::as_strided 0.
|
| 4114 |
-
cudaLaunchKernel
|
| 4115 |
-
cudaDeviceSynchronize 0.
|
| 4116 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4117 |
-
Self CPU time total:
|
| 4118 |
-
Self CUDA time total: 15.
|
| 4119 |
|
| 4120 |
|
| 4121 |
|
|
@@ -4125,20 +4125,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D768
|
|
| 4125 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4126 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4127 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4128 |
-
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4129 |
-
torch_eager 4.
|
| 4130 |
-
aten::silu 1.
|
| 4131 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.
|
| 4132 |
-
aten::mul 1.
|
| 4133 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4134 |
-
Activity Buffer Request
|
| 4135 |
-
aten::slice 1.
|
| 4136 |
-
aten::as_strided 0.
|
| 4137 |
-
cudaLaunchKernel
|
| 4138 |
-
cudaDeviceSynchronize 0.24% 5.
|
| 4139 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4140 |
-
Self CPU time total: 2.
|
| 4141 |
-
Self CUDA time total: 14.
|
| 4142 |
|
| 4143 |
|
| 4144 |
|
|
@@ -4148,20 +4148,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D1024
|
|
| 4148 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4149 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4150 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4151 |
-
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4152 |
-
torch_eager
|
| 4153 |
-
aten::silu
|
| 4154 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.
|
| 4155 |
-
aten::mul
|
| 4156 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.
|
| 4157 |
-
Activity Buffer Request
|
| 4158 |
-
aten::slice
|
| 4159 |
-
aten::as_strided 0.
|
| 4160 |
-
cudaLaunchKernel
|
| 4161 |
-
cudaDeviceSynchronize 0.
|
| 4162 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4163 |
-
Self CPU time total:
|
| 4164 |
-
Self CUDA time total: 15.
|
| 4165 |
|
| 4166 |
|
| 4167 |
|
|
@@ -4171,20 +4171,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D2048
|
|
| 4171 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4172 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4173 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4174 |
-
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4175 |
-
torch_eager
|
| 4176 |
-
aten::silu
|
| 4177 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 11.
|
| 4178 |
-
aten::mul
|
| 4179 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.
|
| 4180 |
-
Activity Buffer Request
|
| 4181 |
-
aten::slice
|
| 4182 |
-
aten::as_strided 0.
|
| 4183 |
-
cudaLaunchKernel
|
| 4184 |
-
cudaDeviceSynchronize 0.
|
| 4185 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4186 |
-
Self CPU time total:
|
| 4187 |
-
Self CUDA time total: 22.
|
| 4188 |
|
| 4189 |
|
| 4190 |
impl wl p50(ms) ok
|
|
@@ -4198,6 +4198,12 @@ torch_eager cuda_T512_D1024 0.05 True
|
|
| 4198 |
torch_eager cuda_T512_D2048 0.05 True
|
| 4199 |
torch_eager cuda_T512_D768 0.05 True
|
| 4200 |
</pre></div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4201 |
<div class="cell-artifacts">
|
| 4202 |
<h4>Artifacts:</h4>
|
| 4203 |
<a href="artifacts/benchmark/activation.jsonl" class="artifact" target="_blank">activation.jsonl</a>
|
|
|
|
| 3874 |
<div class="system-info">
|
| 3875 |
<div class="system-info-header">Generated on:</div>
|
| 3876 |
<div class="system-info-content">
|
| 3877 |
+
Linux x86_64 | Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35
|
| 3878 |
</div>
|
| 3879 |
</div>
|
| 3880 |
|
|
|
|
| 3888 |
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3889 |
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3890 |
</span> |
|
| 3891 |
+
Cell: nv | 0.24s
|
| 3892 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3893 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3894 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3904 |
</div>
|
| 3905 |
</div>
|
| 3906 |
<div id="output-nv" class="cell-output">
|
| 3907 |
+
<div class="cell-stdout"><pre class="stdout-text">Fri Dec 19 18:56:03 2025
|
| 3908 |
+-----------------------------------------------------------------------------------------+
|
| 3909 |
+
| NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 |
|
| 3910 |
+-----------------------------------------+------------------------+----------------------+
|
| 3911 |
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
|
| 3912 |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
|
| 3913 |
| | | MIG M. |
|
| 3914 |
|=========================================+========================+======================|
|
| 3915 |
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
|
| 3916 |
+
| N/A 30C P0 77W / 350W | 0MiB / 46068MiB | 10% Default |
|
| 3917 |
| | | N/A |
|
| 3918 |
+-----------------------------------------+------------------------+----------------------+
|
| 3919 |
|
|
|
|
| 3935 |
<span class="collapse-indicators">
|
| 3936 |
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
|
| 3937 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3938 |
+
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3939 |
</span> |
|
| 3940 |
+
Cell: benchmark | 7.23s
|
| 3941 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3942 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3943 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3987 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3988 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3989 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3990 |
+
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 170.752us 1351.10% 170.752us 170.752us 1
|
| 3991 |
+
torch_eager 8.36% 195.202us 99.35% 2.320ms 2.320ms 0.000us 0.00% 14.941us 14.941us 1
|
| 3992 |
+
aten::silu 2.60% 60.811us 86.31% 2.016ms 671.908us 6.463us 51.14% 8.766us 2.922us 3
|
| 3993 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.463us 51.14% 6.463us 2.154us 3
|
| 3994 |
+
aten::mul 1.36% 31.870us 2.27% 52.962us 17.654us 6.175us 48.86% 6.175us 2.058us 3
|
| 3995 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.175us 48.86% 6.175us 2.058us 3
|
| 3996 |
+
Activity Buffer Request 81.78% 1.910ms 81.78% 1.910ms 1.910ms 2.303us 18.22% 2.303us 2.303us 1
|
| 3997 |
+
aten::slice 1.97% 46.103us 2.42% 56.432us 9.405us 0.000us 0.00% 0.000us 0.000us 6
|
| 3998 |
+
aten::as_strided 0.44% 10.329us 0.44% 10.329us 1.721us 0.000us 0.00% 0.000us 0.000us 6
|
| 3999 |
+
cudaLaunchKernel 2.83% 66.203us 2.83% 66.203us 11.034us 0.000us 0.00% 0.000us 0.000us 6
|
| 4000 |
+
cudaDeviceSynchronize 0.65% 15.081us 0.65% 15.081us 15.081us 0.000us 0.00% 0.000us 0.000us 1
|
| 4001 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4002 |
+
Self CPU time total: 2.335ms
|
| 4003 |
+
Self CUDA time total: 12.638us
|
| 4004 |
|
| 4005 |
|
| 4006 |
|
|
|
|
| 4010 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4011 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4012 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4013 |
+
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 142.911us 1157.08% 142.911us 142.911us 1
|
| 4014 |
+
torch_eager 5.43% 102.941us 99.70% 1.891ms 1.891ms 0.000us 0.00% 14.495us 14.495us 1
|
| 4015 |
+
aten::silu 2.14% 40.580us 90.39% 1.715ms 571.523us 6.399us 51.81% 8.543us 2.848us 3
|
| 4016 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.399us 51.81% 6.399us 2.133us 3
|
| 4017 |
+
aten::mul 1.41% 26.703us 2.36% 44.783us 14.928us 5.952us 48.19% 5.952us 1.984us 3
|
| 4018 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.952us 48.19% 5.952us 1.984us 3
|
| 4019 |
+
Activity Buffer Request 86.86% 1.648ms 86.86% 1.648ms 1.648ms 2.144us 17.36% 2.144us 2.144us 1
|
| 4020 |
+
aten::slice 1.25% 23.641us 1.52% 28.820us 4.803us 0.000us 0.00% 0.000us 0.000us 6
|
| 4021 |
+
aten::as_strided 0.27% 5.179us 0.27% 5.179us 0.863us 0.000us 0.00% 0.000us 0.000us 6
|
| 4022 |
+
cudaLaunchKernel 2.34% 44.460us 2.34% 44.460us 7.410us 0.000us 0.00% 0.000us 0.000us 6
|
| 4023 |
+
cudaDeviceSynchronize 0.30% 5.691us 0.30% 5.691us 5.691us 0.000us 0.00% 0.000us 0.000us 1
|
| 4024 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4025 |
+
Self CPU time total: 1.897ms
|
| 4026 |
+
Self CUDA time total: 12.351us
|
| 4027 |
|
| 4028 |
|
| 4029 |
|
|
|
|
| 4033 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4034 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4035 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4036 |
+
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 146.017us 1102.26% 146.017us 146.017us 1
|
| 4037 |
+
torch_eager 5.52% 107.884us 99.72% 1.948ms 1.948ms 0.000us 0.00% 15.519us 15.519us 1
|
| 4038 |
+
aten::silu 2.05% 40.061us 90.43% 1.767ms 588.983us 6.783us 51.20% 9.055us 3.018us 3
|
| 4039 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.783us 51.20% 6.783us 2.261us 3
|
| 4040 |
+
aten::mul 1.30% 25.470us 2.24% 43.800us 14.600us 6.464us 48.80% 6.464us 2.155us 3
|
| 4041 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.464us 48.80% 6.464us 2.155us 3
|
| 4042 |
+
Activity Buffer Request 87.02% 1.700ms 87.02% 1.700ms 1.700ms 2.272us 17.15% 2.272us 2.272us 1
|
| 4043 |
+
aten::slice 1.26% 24.689us 1.53% 29.809us 4.968us 0.000us 0.00% 0.000us 0.000us 6
|
| 4044 |
+
aten::as_strided 0.26% 5.120us 0.26% 5.120us 0.853us 0.000us 0.00% 0.000us 0.000us 6
|
| 4045 |
+
cudaLaunchKernel 2.30% 44.851us 2.30% 44.851us 7.475us 0.000us 0.00% 0.000us 0.000us 6
|
| 4046 |
+
cudaDeviceSynchronize 0.28% 5.500us 0.28% 5.500us 5.500us 0.000us 0.00% 0.000us 0.000us 1
|
| 4047 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4048 |
+
Self CPU time total: 1.954ms
|
| 4049 |
+
Self CUDA time total: 13.247us
|
| 4050 |
|
| 4051 |
|
| 4052 |
|
|
|
|
| 4056 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4057 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4058 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4059 |
+
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 147.168us 1164.30% 147.168us 147.168us 1
|
| 4060 |
+
torch_eager 6.37% 108.862us 99.70% 1.705ms 1.705ms 0.000us 0.00% 14.816us 14.816us 1
|
| 4061 |
+
aten::silu 2.27% 38.759us 89.04% 1.523ms 507.511us 6.496us 51.39% 8.672us 2.891us 3
|
| 4062 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.496us 51.39% 6.496us 2.165us 3
|
| 4063 |
+
aten::mul 1.56% 26.620us 2.60% 44.441us 14.814us 6.144us 48.61% 6.144us 2.048us 3
|
| 4064 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.144us 48.61% 6.144us 2.048us 3
|
| 4065 |
+
Activity Buffer Request 74.65% 1.277ms 74.65% 1.277ms 1.277ms 2.176us 17.22% 2.176us 2.176us 1
|
| 4066 |
+
aten::slice 1.39% 23.842us 1.70% 29.081us 4.847us 0.000us 0.00% 0.000us 0.000us 6
|
| 4067 |
+
aten::as_strided 0.31% 5.239us 0.31% 5.239us 0.873us 0.000us 0.00% 0.000us 0.000us 6
|
| 4068 |
+
cudaLaunchKernel 13.16% 225.035us 13.16% 225.035us 37.506us 0.000us 0.00% 0.000us 0.000us 6
|
| 4069 |
+
cudaDeviceSynchronize 0.30% 5.120us 0.30% 5.120us 5.120us 0.000us 0.00% 0.000us 0.000us 1
|
| 4070 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4071 |
+
Self CPU time total: 1.710ms
|
| 4072 |
+
Self CUDA time total: 12.640us
|
| 4073 |
|
| 4074 |
|
| 4075 |
|
|
|
|
| 4079 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4080 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4081 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4082 |
+
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 146.271us 1106.86% 146.271us 146.271us 1
|
| 4083 |
+
torch_eager 4.97% 106.601us 99.77% 2.139ms 2.139ms 0.000us 0.00% 15.486us 15.486us 1
|
| 4084 |
+
aten::silu 1.88% 40.251us 91.37% 1.959ms 652.944us 6.751us 51.09% 9.022us 3.007us 3
|
| 4085 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.751us 51.09% 6.751us 2.250us 3
|
| 4086 |
+
aten::mul 1.15% 24.611us 1.97% 42.221us 14.074us 6.464us 48.91% 6.464us 2.155us 3
|
| 4087 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.464us 48.91% 6.464us 2.155us 3
|
| 4088 |
+
Activity Buffer Request 80.01% 1.715ms 80.01% 1.715ms 1.715ms 2.271us 17.19% 2.271us 2.271us 1
|
| 4089 |
+
aten::slice 1.17% 25.129us 1.45% 31.071us 5.179us 0.000us 0.00% 0.000us 0.000us 6
|
| 4090 |
+
aten::as_strided 0.28% 5.942us 0.28% 5.942us 0.990us 0.000us 0.00% 0.000us 0.000us 6
|
| 4091 |
+
cudaLaunchKernel 10.31% 220.963us 10.31% 220.963us 36.827us 0.000us 0.00% 0.000us 0.000us 6
|
| 4092 |
+
cudaDeviceSynchronize 0.23% 5.031us 0.23% 5.031us 5.031us 0.000us 0.00% 0.000us 0.000us 1
|
| 4093 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4094 |
+
Self CPU time total: 2.144ms
|
| 4095 |
+
Self CUDA time total: 13.215us
|
| 4096 |
|
| 4097 |
|
| 4098 |
|
|
|
|
| 4102 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4103 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4104 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4105 |
+
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 139.706us 900.17% 139.706us 139.706us 1
|
| 4106 |
+
torch_eager 16.34% 103.162us 99.17% 626.050us 626.050us 0.000us 0.00% 18.208us 18.208us 1
|
| 4107 |
+
aten::silu 6.36% 40.131us 71.62% 452.127us 150.709us 7.968us 51.34% 10.656us 3.552us 3
|
| 4108 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.968us 51.34% 7.968us 2.656us 3
|
| 4109 |
+
aten::mul 3.68% 23.240us 6.43% 40.610us 13.537us 7.552us 48.66% 7.552us 2.517us 3
|
| 4110 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.552us 48.66% 7.552us 2.517us 3
|
| 4111 |
+
Activity Buffer Request 33.83% 213.593us 33.83% 213.593us 213.593us 2.688us 17.32% 2.688us 2.688us 1
|
| 4112 |
+
aten::slice 3.84% 24.240us 4.78% 30.151us 5.025us 0.000us 0.00% 0.000us 0.000us 6
|
| 4113 |
+
aten::as_strided 0.94% 5.911us 0.94% 5.911us 0.985us 0.000us 0.00% 0.000us 0.000us 6
|
| 4114 |
+
cudaLaunchKernel 34.18% 215.773us 34.18% 215.773us 35.962us 0.000us 0.00% 0.000us 0.000us 6
|
| 4115 |
+
cudaDeviceSynchronize 0.83% 5.229us 0.83% 5.229us 5.229us 0.000us 0.00% 0.000us 0.000us 1
|
| 4116 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4117 |
+
Self CPU time total: 631.279us
|
| 4118 |
+
Self CUDA time total: 15.520us
|
| 4119 |
|
| 4120 |
|
| 4121 |
|
|
|
|
| 4125 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4126 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4127 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4128 |
+
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 149.023us 1039.50% 149.023us 149.023us 1
|
| 4129 |
+
torch_eager 4.97% 105.151us 99.76% 2.112ms 2.112ms 0.000us 0.00% 16.800us 16.800us 1
|
| 4130 |
+
aten::silu 1.93% 40.940us 91.23% 1.932ms 643.947us 7.360us 51.34% 9.824us 3.275us 3
|
| 4131 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.360us 51.34% 7.360us 2.453us 3
|
| 4132 |
+
aten::mul 1.20% 25.341us 2.15% 45.422us 15.141us 6.976us 48.66% 6.976us 2.325us 3
|
| 4133 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.976us 48.66% 6.976us 2.325us 3
|
| 4134 |
+
Activity Buffer Request 80.00% 1.694ms 80.00% 1.694ms 1.694ms 2.464us 17.19% 2.464us 2.464us 1
|
| 4135 |
+
aten::slice 1.16% 24.531us 1.41% 29.941us 4.990us 0.000us 0.00% 0.000us 0.000us 6
|
| 4136 |
+
aten::as_strided 0.26% 5.410us 0.26% 5.410us 0.902us 0.000us 0.00% 0.000us 0.000us 6
|
| 4137 |
+
cudaLaunchKernel 10.25% 217.014us 10.25% 217.014us 36.169us 0.000us 0.00% 0.000us 0.000us 6
|
| 4138 |
+
cudaDeviceSynchronize 0.24% 5.140us 0.24% 5.140us 5.140us 0.000us 0.00% 0.000us 0.000us 1
|
| 4139 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4140 |
+
Self CPU time total: 2.117ms
|
| 4141 |
+
Self CUDA time total: 14.336us
|
| 4142 |
|
| 4143 |
|
| 4144 |
|
|
|
|
| 4148 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4149 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4150 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4151 |
+
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 229.537us 1472.90% 229.537us 229.537us 1
|
| 4152 |
+
torch_eager 26.22% 183.030us 99.31% 693.152us 693.152us 0.000us 0.00% 18.272us 18.272us 1
|
| 4153 |
+
aten::silu 5.68% 39.610us 61.61% 430.047us 143.349us 7.967us 51.12% 10.655us 3.552us 3
|
| 4154 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.967us 51.12% 7.967us 2.656us 3
|
| 4155 |
+
aten::mul 3.79% 26.431us 6.97% 48.673us 16.224us 7.617us 48.88% 7.617us 2.539us 3
|
| 4156 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.617us 48.88% 7.617us 2.539us 3
|
| 4157 |
+
Activity Buffer Request 27.95% 195.093us 27.95% 195.093us 195.093us 2.688us 17.25% 2.688us 2.688us 1
|
| 4158 |
+
aten::slice 3.65% 25.463us 4.50% 31.402us 5.234us 0.000us 0.00% 0.000us 0.000us 6
|
| 4159 |
+
aten::as_strided 0.85% 5.939us 0.85% 5.939us 0.990us 0.000us 0.00% 0.000us 0.000us 6
|
| 4160 |
+
cudaLaunchKernel 31.17% 217.586us 31.17% 217.586us 36.264us 0.000us 0.00% 0.000us 0.000us 6
|
| 4161 |
+
cudaDeviceSynchronize 0.69% 4.809us 0.69% 4.809us 4.809us 0.000us 0.00% 0.000us 0.000us 1
|
| 4162 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4163 |
+
Self CPU time total: 697.961us
|
| 4164 |
+
Self CUDA time total: 15.584us
|
| 4165 |
|
| 4166 |
|
| 4167 |
|
|
|
|
| 4171 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4172 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4173 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4174 |
+
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 162.367us 719.68% 162.367us 162.367us 1
|
| 4175 |
+
torch_eager 5.30% 112.718us 99.76% 2.123ms 2.123ms 0.000us 0.00% 26.497us 26.497us 1
|
| 4176 |
+
aten::silu 1.99% 42.361us 90.94% 1.935ms 644.944us 11.584us 51.35% 15.520us 5.173us 3
|
| 4177 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 11.584us 51.35% 11.584us 3.861us 3
|
| 4178 |
+
aten::mul 1.24% 26.291us 2.09% 44.551us 14.850us 10.977us 48.65% 10.977us 3.659us 3
|
| 4179 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.977us 48.65% 10.977us 3.659us 3
|
| 4180 |
+
Activity Buffer Request 79.75% 1.697ms 79.75% 1.697ms 1.697ms 3.936us 17.45% 3.936us 3.936us 1
|
| 4181 |
+
aten::slice 1.18% 25.032us 1.43% 30.473us 5.079us 0.000us 0.00% 0.000us 0.000us 6
|
| 4182 |
+
aten::as_strided 0.26% 5.441us 0.26% 5.441us 0.907us 0.000us 0.00% 0.000us 0.000us 6
|
| 4183 |
+
cudaLaunchKernel 10.06% 214.034us 10.06% 214.034us 35.672us 0.000us 0.00% 0.000us 0.000us 6
|
| 4184 |
+
cudaDeviceSynchronize 0.24% 5.051us 0.24% 5.051us 5.051us 0.000us 0.00% 0.000us 0.000us 1
|
| 4185 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4186 |
+
Self CPU time total: 2.128ms
|
| 4187 |
+
Self CUDA time total: 22.561us
|
| 4188 |
|
| 4189 |
|
| 4190 |
impl wl p50(ms) ok
|
|
|
|
| 4198 |
torch_eager cuda_T512_D2048 0.05 True
|
| 4199 |
torch_eager cuda_T512_D768 0.05 True
|
| 4200 |
</pre></div>
|
| 4201 |
+
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 4202 |
+
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4203 |
+
<div class="uv-logs-content" style="display: none;">
|
| 4204 |
+
Installed 37 packages in 324ms
|
| 4205 |
+
</div>
|
| 4206 |
+
</div>
|
| 4207 |
<div class="cell-artifacts">
|
| 4208 |
<h4>Artifacts:</h4>
|
| 4209 |
<a href="artifacts/benchmark/activation.jsonl" class="artifact" target="_blank">activation.jsonl</a>
|
activation/results/artifacts/combine/latency.svg
CHANGED
|
|
Git LFS Details
|
|
|
Git LFS Details
|
activation/results/combined_results.html
CHANGED
|
@@ -3874,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 3874 |
<div class="system-info">
|
| 3875 |
<div class="system-info-header">Generated on:</div>
|
| 3876 |
<div class="system-info-content">
|
| 3877 |
-
Linux x86_64 | Linux-6.12.
|
| 3878 |
</div>
|
| 3879 |
</div>
|
| 3880 |
|
|
@@ -3889,11 +3889,11 @@ body[data-tool="eraser"] .main-content {
|
|
| 3889 |
<rdf:RDF>
|
| 3890 |
<ns2:Work>
|
| 3891 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 3892 |
-
<dc:date>2025-
|
| 3893 |
<dc:format>image/svg+xml</dc:format>
|
| 3894 |
<dc:creator>
|
| 3895 |
<ns2:Agent>
|
| 3896 |
-
<dc:title>Matplotlib v3.10.
|
| 3897 |
</ns2:Agent>
|
| 3898 |
</dc:creator>
|
| 3899 |
</ns2:Work>
|
|
@@ -4038,83 +4038,83 @@ body[data-tool="eraser"] .main-content {
|
|
| 4038 |
<g id="matplotlib.axis_2">
|
| 4039 |
<g id="ytick_1">
|
| 4040 |
<g id="grid-y--2" class="grid grid-y">
|
| 4041 |
-
<path d="M 60.23
|
| 4042 |
</g>
|
| 4043 |
<g id="line2d_10">
|
| 4044 |
<defs>
|
| 4045 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4046 |
</defs>
|
| 4047 |
<g>
|
| 4048 |
-
<use ns4:href="#m0fca2865ba" x="60.23" y="
|
| 4049 |
</g>
|
| 4050 |
</g>
|
| 4051 |
<g id="text_10">
|
| 4052 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="
|
| 4053 |
</g>
|
| 4054 |
</g>
|
| 4055 |
<g id="ytick_2">
|
| 4056 |
<g id="grid-y--3" class="grid grid-y">
|
| 4057 |
-
<path d="M 60.23
|
| 4058 |
</g>
|
| 4059 |
<g id="line2d_11">
|
| 4060 |
<g>
|
| 4061 |
-
<use ns4:href="#m0fca2865ba" x="60.23" y="
|
| 4062 |
</g>
|
| 4063 |
</g>
|
| 4064 |
<g id="text_11">
|
| 4065 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="
|
| 4066 |
</g>
|
| 4067 |
</g>
|
| 4068 |
<g id="ytick_3">
|
| 4069 |
<g id="grid-y--4" class="grid grid-y">
|
| 4070 |
-
<path d="M 60.23
|
| 4071 |
</g>
|
| 4072 |
<g id="line2d_12">
|
| 4073 |
<g>
|
| 4074 |
-
<use ns4:href="#m0fca2865ba" x="60.23" y="
|
| 4075 |
</g>
|
| 4076 |
</g>
|
| 4077 |
<g id="text_12">
|
| 4078 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="
|
| 4079 |
</g>
|
| 4080 |
</g>
|
| 4081 |
<g id="ytick_4">
|
| 4082 |
<g id="grid-y--5" class="grid grid-y">
|
| 4083 |
-
<path d="M 60.23
|
| 4084 |
</g>
|
| 4085 |
<g id="line2d_13">
|
| 4086 |
<g>
|
| 4087 |
-
<use ns4:href="#m0fca2865ba" x="60.23" y="
|
| 4088 |
</g>
|
| 4089 |
</g>
|
| 4090 |
<g id="text_13">
|
| 4091 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="
|
| 4092 |
</g>
|
| 4093 |
</g>
|
| 4094 |
<g id="ytick_5">
|
| 4095 |
<g id="grid-y--6" class="grid grid-y">
|
| 4096 |
-
<path d="M 60.23
|
| 4097 |
</g>
|
| 4098 |
<g id="line2d_14">
|
| 4099 |
<g>
|
| 4100 |
-
<use ns4:href="#m0fca2865ba" x="60.23" y="
|
| 4101 |
</g>
|
| 4102 |
</g>
|
| 4103 |
<g id="text_14">
|
| 4104 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="
|
| 4105 |
</g>
|
| 4106 |
</g>
|
| 4107 |
<g id="ytick_6">
|
| 4108 |
<g id="grid-y--7" class="grid grid-y">
|
| 4109 |
-
<path d="M 60.23
|
| 4110 |
</g>
|
| 4111 |
<g id="line2d_15">
|
| 4112 |
<g>
|
| 4113 |
-
<use ns4:href="#m0fca2865ba" x="60.23" y="
|
| 4114 |
</g>
|
| 4115 |
</g>
|
| 4116 |
<g id="text_15">
|
| 4117 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="
|
| 4118 |
</g>
|
| 4119 |
</g>
|
| 4120 |
<g id="label--y" class="ylabel">
|
|
@@ -4122,37 +4122,37 @@ body[data-tool="eraser"] .main-content {
|
|
| 4122 |
</g>
|
| 4123 |
</g>
|
| 4124 |
<g id="series--hf-kernels-swiglu" class="series">
|
| 4125 |
-
<path d="M 96.005644 451.16779 L 185.444754
|
| 4126 |
<defs>
|
| 4127 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4128 |
</defs>
|
| 4129 |
<g clip-path="url(#p620c7d392f)">
|
| 4130 |
<use ns4:href="#md7efaf3aec" x="96.005644" y="451.16779" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4131 |
-
<use ns4:href="#md7efaf3aec" x="185.444754" y="
|
| 4132 |
-
<use ns4:href="#md7efaf3aec" x="274.883864" y="
|
| 4133 |
-
<use ns4:href="#md7efaf3aec" x="364.322974" y="
|
| 4134 |
-
<use ns4:href="#md7efaf3aec" x="453.762084" y="
|
| 4135 |
-
<use ns4:href="#md7efaf3aec" x="543.201194" y="
|
| 4136 |
-
<use ns4:href="#md7efaf3aec" x="632.640304" y="
|
| 4137 |
-
<use ns4:href="#md7efaf3aec" x="722.079415" y="
|
| 4138 |
-
<use ns4:href="#md7efaf3aec" x="811.518525" y="
|
| 4139 |
</g>
|
| 4140 |
</g>
|
| 4141 |
<g id="series--torch-eager" class="series">
|
| 4142 |
-
<path d="M 96.005644
|
| 4143 |
<defs>
|
| 4144 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4145 |
</defs>
|
| 4146 |
<g clip-path="url(#p620c7d392f)">
|
| 4147 |
-
<use ns4:href="#m9b8c54d372" x="96.005644" y="
|
| 4148 |
<use ns4:href="#m9b8c54d372" x="185.444754" y="47.08418" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4149 |
-
<use ns4:href="#m9b8c54d372" x="274.883864" y="
|
| 4150 |
-
<use ns4:href="#m9b8c54d372" x="364.322974" y="
|
| 4151 |
-
<use ns4:href="#m9b8c54d372" x="453.762084" y="
|
| 4152 |
-
<use ns4:href="#m9b8c54d372" x="543.201194" y="
|
| 4153 |
-
<use ns4:href="#m9b8c54d372" x="632.640304" y="
|
| 4154 |
-
<use ns4:href="#m9b8c54d372" x="722.079415" y="
|
| 4155 |
-
<use ns4:href="#m9b8c54d372" x="811.518525" y="
|
| 4156 |
</g>
|
| 4157 |
</g>
|
| 4158 |
<g id="patch_3">
|
|
@@ -4210,7 +4210,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 4210 |
<span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
|
| 4211 |
<span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4212 |
</span> |
|
| 4213 |
-
Cell: combine | 4.
|
| 4214 |
| <button class="run-btn" onclick="runCell('combine')">▶ run</button>
|
| 4215 |
<button class="copy-btn" onclick="copyCell('combine')">Copy</button>
|
| 4216 |
<a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -4336,7 +4336,7 @@ Implementations included:
|
|
| 4336 |
<div class="uv-install-logs" id="uv-logs-combine">
|
| 4337 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4338 |
<div class="uv-logs-content" style="display: none;">
|
| 4339 |
-
Installed 37 packages in
|
| 4340 |
</div>
|
| 4341 |
</div>
|
| 4342 |
<div class="cell-artifacts">
|
|
@@ -4349,11 +4349,11 @@ Installed 37 packages in 348ms
|
|
| 4349 |
<rdf:RDF>
|
| 4350 |
<ns2:Work>
|
| 4351 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 4352 |
-
<dc:date>2025-
|
| 4353 |
<dc:format>image/svg+xml</dc:format>
|
| 4354 |
<dc:creator>
|
| 4355 |
<ns2:Agent>
|
| 4356 |
-
<dc:title>Matplotlib v3.10.
|
| 4357 |
</ns2:Agent>
|
| 4358 |
</dc:creator>
|
| 4359 |
</ns2:Work>
|
|
@@ -4498,83 +4498,83 @@ Installed 37 packages in 348ms
|
|
| 4498 |
<g id="matplotlib.axis_2">
|
| 4499 |
<g id="ytick_1">
|
| 4500 |
<g id="grid-y--2" class="grid grid-y">
|
| 4501 |
-
<path d="M 60.23
|
| 4502 |
</g>
|
| 4503 |
<g id="line2d_10">
|
| 4504 |
<defs>
|
| 4505 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4506 |
</defs>
|
| 4507 |
<g>
|
| 4508 |
-
<use ns4:href="#m0fca2865ba" x="60.23" y="
|
| 4509 |
</g>
|
| 4510 |
</g>
|
| 4511 |
<g id="text_10">
|
| 4512 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="
|
| 4513 |
</g>
|
| 4514 |
</g>
|
| 4515 |
<g id="ytick_2">
|
| 4516 |
<g id="grid-y--3" class="grid grid-y">
|
| 4517 |
-
<path d="M 60.23
|
| 4518 |
</g>
|
| 4519 |
<g id="line2d_11">
|
| 4520 |
<g>
|
| 4521 |
-
<use ns4:href="#m0fca2865ba" x="60.23" y="
|
| 4522 |
</g>
|
| 4523 |
</g>
|
| 4524 |
<g id="text_11">
|
| 4525 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="
|
| 4526 |
</g>
|
| 4527 |
</g>
|
| 4528 |
<g id="ytick_3">
|
| 4529 |
<g id="grid-y--4" class="grid grid-y">
|
| 4530 |
-
<path d="M 60.23
|
| 4531 |
</g>
|
| 4532 |
<g id="line2d_12">
|
| 4533 |
<g>
|
| 4534 |
-
<use ns4:href="#m0fca2865ba" x="60.23" y="
|
| 4535 |
</g>
|
| 4536 |
</g>
|
| 4537 |
<g id="text_12">
|
| 4538 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="
|
| 4539 |
</g>
|
| 4540 |
</g>
|
| 4541 |
<g id="ytick_4">
|
| 4542 |
<g id="grid-y--5" class="grid grid-y">
|
| 4543 |
-
<path d="M 60.23
|
| 4544 |
</g>
|
| 4545 |
<g id="line2d_13">
|
| 4546 |
<g>
|
| 4547 |
-
<use ns4:href="#m0fca2865ba" x="60.23" y="
|
| 4548 |
</g>
|
| 4549 |
</g>
|
| 4550 |
<g id="text_13">
|
| 4551 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="
|
| 4552 |
</g>
|
| 4553 |
</g>
|
| 4554 |
<g id="ytick_5">
|
| 4555 |
<g id="grid-y--6" class="grid grid-y">
|
| 4556 |
-
<path d="M 60.23
|
| 4557 |
</g>
|
| 4558 |
<g id="line2d_14">
|
| 4559 |
<g>
|
| 4560 |
-
<use ns4:href="#m0fca2865ba" x="60.23" y="
|
| 4561 |
</g>
|
| 4562 |
</g>
|
| 4563 |
<g id="text_14">
|
| 4564 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="
|
| 4565 |
</g>
|
| 4566 |
</g>
|
| 4567 |
<g id="ytick_6">
|
| 4568 |
<g id="grid-y--7" class="grid grid-y">
|
| 4569 |
-
<path d="M 60.23
|
| 4570 |
</g>
|
| 4571 |
<g id="line2d_15">
|
| 4572 |
<g>
|
| 4573 |
-
<use ns4:href="#m0fca2865ba" x="60.23" y="
|
| 4574 |
</g>
|
| 4575 |
</g>
|
| 4576 |
<g id="text_15">
|
| 4577 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="
|
| 4578 |
</g>
|
| 4579 |
</g>
|
| 4580 |
<g id="label--y" class="ylabel">
|
|
@@ -4582,37 +4582,37 @@ Installed 37 packages in 348ms
|
|
| 4582 |
</g>
|
| 4583 |
</g>
|
| 4584 |
<g id="series--hf-kernels-swiglu" class="series">
|
| 4585 |
-
<path d="M 96.005644 451.16779 L 185.444754
|
| 4586 |
<defs>
|
| 4587 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4588 |
</defs>
|
| 4589 |
<g clip-path="url(#p620c7d392f)">
|
| 4590 |
<use ns4:href="#md7efaf3aec" x="96.005644" y="451.16779" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4591 |
-
<use ns4:href="#md7efaf3aec" x="185.444754" y="
|
| 4592 |
-
<use ns4:href="#md7efaf3aec" x="274.883864" y="
|
| 4593 |
-
<use ns4:href="#md7efaf3aec" x="364.322974" y="
|
| 4594 |
-
<use ns4:href="#md7efaf3aec" x="453.762084" y="
|
| 4595 |
-
<use ns4:href="#md7efaf3aec" x="543.201194" y="
|
| 4596 |
-
<use ns4:href="#md7efaf3aec" x="632.640304" y="
|
| 4597 |
-
<use ns4:href="#md7efaf3aec" x="722.079415" y="
|
| 4598 |
-
<use ns4:href="#md7efaf3aec" x="811.518525" y="
|
| 4599 |
</g>
|
| 4600 |
</g>
|
| 4601 |
<g id="series--torch-eager" class="series">
|
| 4602 |
-
<path d="M 96.005644
|
| 4603 |
<defs>
|
| 4604 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4605 |
</defs>
|
| 4606 |
<g clip-path="url(#p620c7d392f)">
|
| 4607 |
-
<use ns4:href="#m9b8c54d372" x="96.005644" y="
|
| 4608 |
<use ns4:href="#m9b8c54d372" x="185.444754" y="47.08418" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4609 |
-
<use ns4:href="#m9b8c54d372" x="274.883864" y="
|
| 4610 |
-
<use ns4:href="#m9b8c54d372" x="364.322974" y="
|
| 4611 |
-
<use ns4:href="#m9b8c54d372" x="453.762084" y="
|
| 4612 |
-
<use ns4:href="#m9b8c54d372" x="543.201194" y="
|
| 4613 |
-
<use ns4:href="#m9b8c54d372" x="632.640304" y="
|
| 4614 |
-
<use ns4:href="#m9b8c54d372" x="722.079415" y="
|
| 4615 |
-
<use ns4:href="#m9b8c54d372" x="811.518525" y="
|
| 4616 |
</g>
|
| 4617 |
</g>
|
| 4618 |
<g id="patch_3">
|
|
|
|
| 3874 |
<div class="system-info">
|
| 3875 |
<div class="system-info-header">Generated on:</div>
|
| 3876 |
<div class="system-info-content">
|
| 3877 |
+
Linux x86_64 | Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35
|
| 3878 |
</div>
|
| 3879 |
</div>
|
| 3880 |
|
|
|
|
| 3889 |
<rdf:RDF>
|
| 3890 |
<ns2:Work>
|
| 3891 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 3892 |
+
<dc:date>2025-12-19T19:10:09.156027</dc:date>
|
| 3893 |
<dc:format>image/svg+xml</dc:format>
|
| 3894 |
<dc:creator>
|
| 3895 |
<ns2:Agent>
|
| 3896 |
+
<dc:title>Matplotlib v3.10.8, https://matplotlib.org/</dc:title>
|
| 3897 |
</ns2:Agent>
|
| 3898 |
</dc:creator>
|
| 3899 |
</ns2:Work>
|
|
|
|
| 4038 |
<g id="matplotlib.axis_2">
|
| 4039 |
<g id="ytick_1">
|
| 4040 |
<g id="grid-y--2" class="grid grid-y">
|
| 4041 |
+
<path d="M 60.23 449.91292 L 847.294169 449.91292 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4042 |
</g>
|
| 4043 |
<g id="line2d_10">
|
| 4044 |
<defs>
|
| 4045 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4046 |
</defs>
|
| 4047 |
<g>
|
| 4048 |
+
<use ns4:href="#m0fca2865ba" x="60.23" y="449.91292" style="stroke: #000000; stroke-width: 0.8" />
|
| 4049 |
</g>
|
| 4050 |
</g>
|
| 4051 |
<g id="text_10">
|
| 4052 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="453.712139" transform="rotate(-0 53.23 453.712139)">0.025</text>
|
| 4053 |
</g>
|
| 4054 |
</g>
|
| 4055 |
<g id="ytick_2">
|
| 4056 |
<g id="grid-y--3" class="grid grid-y">
|
| 4057 |
+
<path d="M 60.23 371.483588 L 847.294169 371.483588 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4058 |
</g>
|
| 4059 |
<g id="line2d_11">
|
| 4060 |
<g>
|
| 4061 |
+
<use ns4:href="#m0fca2865ba" x="60.23" y="371.483588" style="stroke: #000000; stroke-width: 0.8" />
|
| 4062 |
</g>
|
| 4063 |
</g>
|
| 4064 |
<g id="text_11">
|
| 4065 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="375.282806" transform="rotate(-0 53.23 375.282806)">0.030</text>
|
| 4066 |
</g>
|
| 4067 |
</g>
|
| 4068 |
<g id="ytick_3">
|
| 4069 |
<g id="grid-y--4" class="grid grid-y">
|
| 4070 |
+
<path d="M 60.23 293.054255 L 847.294169 293.054255 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4071 |
</g>
|
| 4072 |
<g id="line2d_12">
|
| 4073 |
<g>
|
| 4074 |
+
<use ns4:href="#m0fca2865ba" x="60.23" y="293.054255" style="stroke: #000000; stroke-width: 0.8" />
|
| 4075 |
</g>
|
| 4076 |
</g>
|
| 4077 |
<g id="text_12">
|
| 4078 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="296.853473" transform="rotate(-0 53.23 296.853473)">0.035</text>
|
| 4079 |
</g>
|
| 4080 |
</g>
|
| 4081 |
<g id="ytick_4">
|
| 4082 |
<g id="grid-y--5" class="grid grid-y">
|
| 4083 |
+
<path d="M 60.23 214.624922 L 847.294169 214.624922 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4084 |
</g>
|
| 4085 |
<g id="line2d_13">
|
| 4086 |
<g>
|
| 4087 |
+
<use ns4:href="#m0fca2865ba" x="60.23" y="214.624922" style="stroke: #000000; stroke-width: 0.8" />
|
| 4088 |
</g>
|
| 4089 |
</g>
|
| 4090 |
<g id="text_13">
|
| 4091 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="218.42414" transform="rotate(-0 53.23 218.42414)">0.040</text>
|
| 4092 |
</g>
|
| 4093 |
</g>
|
| 4094 |
<g id="ytick_5">
|
| 4095 |
<g id="grid-y--6" class="grid grid-y">
|
| 4096 |
+
<path d="M 60.23 136.195589 L 847.294169 136.195589 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4097 |
</g>
|
| 4098 |
<g id="line2d_14">
|
| 4099 |
<g>
|
| 4100 |
+
<use ns4:href="#m0fca2865ba" x="60.23" y="136.195589" style="stroke: #000000; stroke-width: 0.8" />
|
| 4101 |
</g>
|
| 4102 |
</g>
|
| 4103 |
<g id="text_14">
|
| 4104 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="139.994807" transform="rotate(-0 53.23 139.994807)">0.045</text>
|
| 4105 |
</g>
|
| 4106 |
</g>
|
| 4107 |
<g id="ytick_6">
|
| 4108 |
<g id="grid-y--7" class="grid grid-y">
|
| 4109 |
+
<path d="M 60.23 57.766256 L 847.294169 57.766256 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4110 |
</g>
|
| 4111 |
<g id="line2d_15">
|
| 4112 |
<g>
|
| 4113 |
+
<use ns4:href="#m0fca2865ba" x="60.23" y="57.766256" style="stroke: #000000; stroke-width: 0.8" />
|
| 4114 |
</g>
|
| 4115 |
</g>
|
| 4116 |
<g id="text_15">
|
| 4117 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="61.565474" transform="rotate(-0 53.23 61.565474)">0.050</text>
|
| 4118 |
</g>
|
| 4119 |
</g>
|
| 4120 |
<g id="label--y" class="ylabel">
|
|
|
|
| 4122 |
</g>
|
| 4123 |
</g>
|
| 4124 |
<g id="series--hf-kernels-swiglu" class="series">
|
| 4125 |
+
<path d="M 96.005644 451.16779 L 185.444754 386.071444 L 274.883864 387.781203 L 364.322974 389.506648 L 453.762084 389.992912 L 543.201194 390.290944 L 632.640304 396.894691 L 722.079415 389.992908 L 811.518525 394.228094 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 4126 |
<defs>
|
| 4127 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4128 |
</defs>
|
| 4129 |
<g clip-path="url(#p620c7d392f)">
|
| 4130 |
<use ns4:href="#md7efaf3aec" x="96.005644" y="451.16779" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4131 |
+
<use ns4:href="#md7efaf3aec" x="185.444754" y="386.071444" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4132 |
+
<use ns4:href="#md7efaf3aec" x="274.883864" y="387.781203" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4133 |
+
<use ns4:href="#md7efaf3aec" x="364.322974" y="389.506648" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4134 |
+
<use ns4:href="#md7efaf3aec" x="453.762084" y="389.992912" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4135 |
+
<use ns4:href="#md7efaf3aec" x="543.201194" y="390.290944" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4136 |
+
<use ns4:href="#md7efaf3aec" x="632.640304" y="396.894691" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4137 |
+
<use ns4:href="#md7efaf3aec" x="722.079415" y="389.992908" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4138 |
+
<use ns4:href="#md7efaf3aec" x="811.518525" y="394.228094" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4139 |
</g>
|
| 4140 |
</g>
|
| 4141 |
<g id="series--torch-eager" class="series">
|
| 4142 |
+
<path d="M 96.005644 171.802506 L 185.444754 47.08418 L 274.883864 55.554548 L 364.322974 56.966277 L 453.762084 62.45633 L 543.201194 80.651935 L 632.640304 68.416959 L 722.079415 69.358111 L 811.518525 82.847956 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 4143 |
<defs>
|
| 4144 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4145 |
</defs>
|
| 4146 |
<g clip-path="url(#p620c7d392f)">
|
| 4147 |
+
<use ns4:href="#m9b8c54d372" x="96.005644" y="171.802506" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4148 |
<use ns4:href="#m9b8c54d372" x="185.444754" y="47.08418" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4149 |
+
<use ns4:href="#m9b8c54d372" x="274.883864" y="55.554548" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4150 |
+
<use ns4:href="#m9b8c54d372" x="364.322974" y="56.966277" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4151 |
+
<use ns4:href="#m9b8c54d372" x="453.762084" y="62.45633" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4152 |
+
<use ns4:href="#m9b8c54d372" x="543.201194" y="80.651935" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4153 |
+
<use ns4:href="#m9b8c54d372" x="632.640304" y="68.416959" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4154 |
+
<use ns4:href="#m9b8c54d372" x="722.079415" y="69.358111" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4155 |
+
<use ns4:href="#m9b8c54d372" x="811.518525" y="82.847956" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4156 |
</g>
|
| 4157 |
</g>
|
| 4158 |
<g id="patch_3">
|
|
|
|
| 4210 |
<span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
|
| 4211 |
<span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4212 |
</span> |
|
| 4213 |
+
Cell: combine | 4.43s
|
| 4214 |
| <button class="run-btn" onclick="runCell('combine')">▶ run</button>
|
| 4215 |
<button class="copy-btn" onclick="copyCell('combine')">Copy</button>
|
| 4216 |
<a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 4336 |
<div class="uv-install-logs" id="uv-logs-combine">
|
| 4337 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4338 |
<div class="uv-logs-content" style="display: none;">
|
| 4339 |
+
Installed 37 packages in 283ms
|
| 4340 |
</div>
|
| 4341 |
</div>
|
| 4342 |
<div class="cell-artifacts">
|
|
|
|
| 4349 |
<rdf:RDF>
|
| 4350 |
<ns2:Work>
|
| 4351 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 4352 |
+
<dc:date>2025-12-19T19:10:09.156027</dc:date>
|
| 4353 |
<dc:format>image/svg+xml</dc:format>
|
| 4354 |
<dc:creator>
|
| 4355 |
<ns2:Agent>
|
| 4356 |
+
<dc:title>Matplotlib v3.10.8, https://matplotlib.org/</dc:title>
|
| 4357 |
</ns2:Agent>
|
| 4358 |
</dc:creator>
|
| 4359 |
</ns2:Work>
|
|
|
|
| 4498 |
<g id="matplotlib.axis_2">
|
| 4499 |
<g id="ytick_1">
|
| 4500 |
<g id="grid-y--2" class="grid grid-y">
|
| 4501 |
+
<path d="M 60.23 449.91292 L 847.294169 449.91292 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4502 |
</g>
|
| 4503 |
<g id="line2d_10">
|
| 4504 |
<defs>
|
| 4505 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4506 |
</defs>
|
| 4507 |
<g>
|
| 4508 |
+
<use ns4:href="#m0fca2865ba" x="60.23" y="449.91292" style="stroke: #000000; stroke-width: 0.8" />
|
| 4509 |
</g>
|
| 4510 |
</g>
|
| 4511 |
<g id="text_10">
|
| 4512 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="453.712139" transform="rotate(-0 53.23 453.712139)">0.025</text>
|
| 4513 |
</g>
|
| 4514 |
</g>
|
| 4515 |
<g id="ytick_2">
|
| 4516 |
<g id="grid-y--3" class="grid grid-y">
|
| 4517 |
+
<path d="M 60.23 371.483588 L 847.294169 371.483588 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4518 |
</g>
|
| 4519 |
<g id="line2d_11">
|
| 4520 |
<g>
|
| 4521 |
+
<use ns4:href="#m0fca2865ba" x="60.23" y="371.483588" style="stroke: #000000; stroke-width: 0.8" />
|
| 4522 |
</g>
|
| 4523 |
</g>
|
| 4524 |
<g id="text_11">
|
| 4525 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="375.282806" transform="rotate(-0 53.23 375.282806)">0.030</text>
|
| 4526 |
</g>
|
| 4527 |
</g>
|
| 4528 |
<g id="ytick_3">
|
| 4529 |
<g id="grid-y--4" class="grid grid-y">
|
| 4530 |
+
<path d="M 60.23 293.054255 L 847.294169 293.054255 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4531 |
</g>
|
| 4532 |
<g id="line2d_12">
|
| 4533 |
<g>
|
| 4534 |
+
<use ns4:href="#m0fca2865ba" x="60.23" y="293.054255" style="stroke: #000000; stroke-width: 0.8" />
|
| 4535 |
</g>
|
| 4536 |
</g>
|
| 4537 |
<g id="text_12">
|
| 4538 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="296.853473" transform="rotate(-0 53.23 296.853473)">0.035</text>
|
| 4539 |
</g>
|
| 4540 |
</g>
|
| 4541 |
<g id="ytick_4">
|
| 4542 |
<g id="grid-y--5" class="grid grid-y">
|
| 4543 |
+
<path d="M 60.23 214.624922 L 847.294169 214.624922 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4544 |
</g>
|
| 4545 |
<g id="line2d_13">
|
| 4546 |
<g>
|
| 4547 |
+
<use ns4:href="#m0fca2865ba" x="60.23" y="214.624922" style="stroke: #000000; stroke-width: 0.8" />
|
| 4548 |
</g>
|
| 4549 |
</g>
|
| 4550 |
<g id="text_13">
|
| 4551 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="218.42414" transform="rotate(-0 53.23 218.42414)">0.040</text>
|
| 4552 |
</g>
|
| 4553 |
</g>
|
| 4554 |
<g id="ytick_5">
|
| 4555 |
<g id="grid-y--6" class="grid grid-y">
|
| 4556 |
+
<path d="M 60.23 136.195589 L 847.294169 136.195589 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4557 |
</g>
|
| 4558 |
<g id="line2d_14">
|
| 4559 |
<g>
|
| 4560 |
+
<use ns4:href="#m0fca2865ba" x="60.23" y="136.195589" style="stroke: #000000; stroke-width: 0.8" />
|
| 4561 |
</g>
|
| 4562 |
</g>
|
| 4563 |
<g id="text_14">
|
| 4564 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="139.994807" transform="rotate(-0 53.23 139.994807)">0.045</text>
|
| 4565 |
</g>
|
| 4566 |
</g>
|
| 4567 |
<g id="ytick_6">
|
| 4568 |
<g id="grid-y--7" class="grid grid-y">
|
| 4569 |
+
<path d="M 60.23 57.766256 L 847.294169 57.766256 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4570 |
</g>
|
| 4571 |
<g id="line2d_15">
|
| 4572 |
<g>
|
| 4573 |
+
<use ns4:href="#m0fca2865ba" x="60.23" y="57.766256" style="stroke: #000000; stroke-width: 0.8" />
|
| 4574 |
</g>
|
| 4575 |
</g>
|
| 4576 |
<g id="text_15">
|
| 4577 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="61.565474" transform="rotate(-0 53.23 61.565474)">0.050</text>
|
| 4578 |
</g>
|
| 4579 |
</g>
|
| 4580 |
<g id="label--y" class="ylabel">
|
|
|
|
| 4582 |
</g>
|
| 4583 |
</g>
|
| 4584 |
<g id="series--hf-kernels-swiglu" class="series">
|
| 4585 |
+
<path d="M 96.005644 451.16779 L 185.444754 386.071444 L 274.883864 387.781203 L 364.322974 389.506648 L 453.762084 389.992912 L 543.201194 390.290944 L 632.640304 396.894691 L 722.079415 389.992908 L 811.518525 394.228094 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 4586 |
<defs>
|
| 4587 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4588 |
</defs>
|
| 4589 |
<g clip-path="url(#p620c7d392f)">
|
| 4590 |
<use ns4:href="#md7efaf3aec" x="96.005644" y="451.16779" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4591 |
+
<use ns4:href="#md7efaf3aec" x="185.444754" y="386.071444" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4592 |
+
<use ns4:href="#md7efaf3aec" x="274.883864" y="387.781203" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4593 |
+
<use ns4:href="#md7efaf3aec" x="364.322974" y="389.506648" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4594 |
+
<use ns4:href="#md7efaf3aec" x="453.762084" y="389.992912" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4595 |
+
<use ns4:href="#md7efaf3aec" x="543.201194" y="390.290944" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4596 |
+
<use ns4:href="#md7efaf3aec" x="632.640304" y="396.894691" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4597 |
+
<use ns4:href="#md7efaf3aec" x="722.079415" y="389.992908" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4598 |
+
<use ns4:href="#md7efaf3aec" x="811.518525" y="394.228094" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4599 |
</g>
|
| 4600 |
</g>
|
| 4601 |
<g id="series--torch-eager" class="series">
|
| 4602 |
+
<path d="M 96.005644 171.802506 L 185.444754 47.08418 L 274.883864 55.554548 L 364.322974 56.966277 L 453.762084 62.45633 L 543.201194 80.651935 L 632.640304 68.416959 L 722.079415 69.358111 L 811.518525 82.847956 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 4603 |
<defs>
|
| 4604 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4605 |
</defs>
|
| 4606 |
<g clip-path="url(#p620c7d392f)">
|
| 4607 |
+
<use ns4:href="#m9b8c54d372" x="96.005644" y="171.802506" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4608 |
<use ns4:href="#m9b8c54d372" x="185.444754" y="47.08418" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4609 |
+
<use ns4:href="#m9b8c54d372" x="274.883864" y="55.554548" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4610 |
+
<use ns4:href="#m9b8c54d372" x="364.322974" y="56.966277" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4611 |
+
<use ns4:href="#m9b8c54d372" x="453.762084" y="62.45633" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4612 |
+
<use ns4:href="#m9b8c54d372" x="543.201194" y="80.651935" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4613 |
+
<use ns4:href="#m9b8c54d372" x="632.640304" y="68.416959" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4614 |
+
<use ns4:href="#m9b8c54d372" x="722.079415" y="69.358111" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4615 |
+
<use ns4:href="#m9b8c54d372" x="811.518525" y="82.847956" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4616 |
</g>
|
| 4617 |
</g>
|
| 4618 |
<g id="patch_3">
|
causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl
CHANGED
|
@@ -1,24 +1,24 @@
|
|
| 1 |
-
{"ts": "2025-
|
| 2 |
-
{"ts": "2025-
|
| 3 |
-
{"ts": "2025-
|
| 4 |
-
{"ts": "2025-
|
| 5 |
-
{"ts": "2025-
|
| 6 |
-
{"ts": "2025-
|
| 7 |
-
{"ts": "2025-
|
| 8 |
-
{"ts": "2025-
|
| 9 |
-
{"ts": "2025-
|
| 10 |
-
{"ts": "2025-
|
| 11 |
-
{"ts": "2025-
|
| 12 |
-
{"ts": "2025-
|
| 13 |
-
{"ts": "2025-
|
| 14 |
-
{"ts": "2025-
|
| 15 |
-
{"ts": "2025-
|
| 16 |
-
{"ts": "2025-
|
| 17 |
-
{"ts": "2025-
|
| 18 |
-
{"ts": "2025-
|
| 19 |
-
{"ts": "2025-
|
| 20 |
-
{"ts": "2025-
|
| 21 |
-
{"ts": "2025-
|
| 22 |
-
{"ts": "2025-
|
| 23 |
-
{"ts": "2025-
|
| 24 |
-
{"ts": "2025-
|
|
|
|
| 1 |
+
{"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S128_W2", "batch": 2, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.07121199996618088, "p50": 0.07280099998752121, "p90": 0.07319100001268453, "mean": 0.07264140000415864, "iqr": 0.0009299999987888441, "raw_times": [0.07226100001389568, 0.07280099998752121, 0.07121199996618088, 0.07319100001268453, 0.0737420000405109], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.07994200001348872, "peak_bytes": 295936, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 2 |
+
{"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S128_W4", "batch": 2, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08309200001122008, "p50": 0.08365100001128667, "p90": 0.08423100001664352, "mean": 0.08372920000283557, "iqr": 0.0009500000146545062, "raw_times": [0.08328100000198901, 0.08439099997303856, 0.08365100001128667, 0.08309200001122008, 0.08423100001664352], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08683200002224112, "peak_bytes": 296448, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 3 |
+
{"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S512_W2", "batch": 2, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08177099999784332, "p50": 0.08268100003760992, "p90": 0.08301100001517625, "mean": 0.08273520002148871, "iqr": 0.0009299999987888441, "raw_times": [0.0820810000163874, 0.08413200004042665, 0.08301100001517625, 0.08268100003760992, 0.08177099999784332], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08564099999830432, "peak_bytes": 1180672, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 4 |
+
{"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S512_W4", "batch": 2, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08037100002411535, "p50": 0.0813119999634182, "p90": 0.08183099998859689, "mean": 0.08147719998987668, "iqr": 0.0007300000106624793, "raw_times": [0.08037100002411535, 0.08183099998859689, 0.08277099999531856, 0.08110099997793441, 0.0813119999634182], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08382099997561454, "peak_bytes": 1181184, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 5 |
+
{"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S2048_W2", "batch": 2, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.07959199996321331, "p50": 0.08119099999248647, "p90": 0.08157100000971695, "mean": 0.08080360000803921, "iqr": 0.0016889999869817984, "raw_times": [0.08157100000971695, 0.08178200005204417, 0.08119099999248647, 0.07988200002273516, 0.07959199996321331], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08668100002751089, "peak_bytes": 4719616, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 6 |
+
{"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S2048_W4", "batch": 2, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0801809999870784, "p50": 0.08213100005605156, "p90": 0.08258200000454963, "mean": 0.08208560000184661, "iqr": 0.0013300000318849925, "raw_times": [0.0801809999870784, 0.08213100005605156, 0.08428199998888886, 0.08258200000454963, 0.08125199997266463], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08745099995621786, "peak_bytes": 4720128, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 7 |
+
{"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S128_W2", "batch": 2, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.07887199996048366, "p50": 0.08016100002805615, "p90": 0.08020199999236866, "mean": 0.0800293999986934, "iqr": 7.099998811099795e-05, "raw_times": [0.08078100000830091, 0.08013100000425766, 0.08016100002805615, 0.08020199999236866, 0.07887199996048366], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0863010000102804, "peak_bytes": 9461760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 8 |
+
{"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S128_W4", "batch": 2, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.07991100000026563, "p50": 0.0806710000347266, "p90": 0.08072099996070392, "mean": 0.08052299999690149, "iqr": 0.0007399999617518915, "raw_times": [0.0806710000347266, 0.07998099999895203, 0.08133099998985927, 0.08072099996070392, 0.07991100000026563], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08638100001689963, "peak_bytes": 9478144, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 9 |
+
{"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S512_W2", "batch": 2, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.07996199997251097, "p50": 0.08190199997670788, "p90": 0.08266099996490084, "mean": 0.08240359998126223, "iqr": 0.0024089999897114467, "raw_times": [0.07996199997251097, 0.08724100001700208, 0.08266099996490084, 0.08025199997518939, 0.08190199997670788], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0852109999982531, "peak_bytes": 37773312, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 10 |
+
{"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S512_W4", "batch": 2, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08023199995932373, "p50": 0.08207100000845458, "p90": 0.08239099997808808, "mean": 0.08726959998739403, "iqr": 0.0012489999789977446, "raw_times": [0.08239099997808808, 0.11051199999201344, 0.08023199995932373, 0.08114199999909033, 0.08207100000845458], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0862620000248171, "peak_bytes": 37789696, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 11 |
+
{"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S2048_W2", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.1583530000175415, "p50": 0.15909299997929338, "p90": 0.15925299999253184, "mean": 0.15925679999782005, "iqr": 0.00036099999078942346, "raw_times": [0.15909299997929338, 0.15889200000174242, 0.16069299999799114, 0.15925299999253184, 0.1583530000175415], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.1604330000191112, "peak_bytes": 151019520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 12 |
+
{"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S2048_W4", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.16316300002472417, "p50": 0.164162999965356, "p90": 0.16488300002492906, "mean": 0.16412500000342334, "iqr": 0.001400000030571391, "raw_times": [0.164162999965356, 0.16316300002472417, 0.16488300002492906, 0.1649330000077498, 0.16348299999435767], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.16465299995616078, "peak_bytes": 151035904, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 13 |
+
{"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S128_W2", "batch": 4, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.07947099999228158, "p50": 0.08210099997540965, "p90": 0.08814200003826045, "mean": 0.08720339999399584, "iqr": 0.0077410000471900275, "raw_times": [0.08814200003826045, 0.08040099999107042, 0.10590199997295713, 0.08210099997540965, 0.07947099999228158], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08537200000091616, "peak_bytes": 33727488, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 14 |
+
{"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S128_W4", "batch": 4, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.07920100000546881, "p50": 0.0799709999910192, "p90": 0.0812010000004193, "mean": 0.08044520000112243, "iqr": 0.0014700000292577897, "raw_times": [0.07973099997116151, 0.08212200003754333, 0.0812010000004193, 0.07920100000546881, 0.0799709999910192], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0854219999837369, "peak_bytes": 591360, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 15 |
+
{"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S512_W2", "batch": 4, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.07912099999884958, "p50": 0.08021200000030149, "p90": 0.08024099997783196, "mean": 0.07987539999021465, "iqr": 0.0010200000133409048, "raw_times": [0.07912099999884958, 0.07922099996449106, 0.08021200000030149, 0.08024099997783196, 0.08058200000959914], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.084330999982285, "peak_bytes": 2360320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 16 |
+
{"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S512_W4", "batch": 4, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08098199998585187, "p50": 0.08176099998991049, "p90": 0.08322100001123545, "mean": 0.08342759999777627, "iqr": 0.0014889999988554337, "raw_times": [0.08098199998585187, 0.08322100001123545, 0.08176099998991049, 0.08173200001238001, 0.08944199998950353], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08506099999294747, "peak_bytes": 2360832, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 17 |
+
{"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S2048_W2", "batch": 4, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.07974099997909434, "p50": 0.0811310000017329, "p90": 0.08221100000582737, "mean": 0.08252540000057706, "iqr": 0.001659000020026724, "raw_times": [0.07974099997909434, 0.08221100000582737, 0.0811310000017329, 0.08899200003043006, 0.08055199998580065], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0852420000114762, "peak_bytes": 9438208, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 18 |
+
{"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S2048_W4", "batch": 4, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08050200000297991, "p50": 0.08192199999257355, "p90": 0.08227199998600554, "mean": 0.08175159999836978, "iqr": 0.000530999955117295, "raw_times": [0.08174100003088824, 0.08050200000297991, 0.08192199999257355, 0.08227199998600554, 0.08232099997940168], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0854219999837369, "peak_bytes": 9438720, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 19 |
+
{"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S128_W2", "batch": 4, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.07911099999091675, "p50": 0.08030099996858553, "p90": 0.08107100001097933, "mean": 0.08047899999610308, "iqr": 0.0018000000068241206, "raw_times": [0.08030099996858553, 0.0826410000058786, 0.08107100001097933, 0.07911099999091675, 0.07927100000415521], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0837320000073305, "peak_bytes": 18931712, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 20 |
+
{"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S128_W4", "batch": 4, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0812719999885303, "p50": 0.08212100004811873, "p90": 0.08214100000714097, "mean": 0.08211120000396477, "iqr": 0.00043000000005122274, "raw_times": [0.08171100000708975, 0.08212100004811873, 0.08331099996894409, 0.08214100000714097, 0.0812719999885303], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0876720000064779, "peak_bytes": 18948096, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 21 |
+
{"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S512_W2", "batch": 4, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09240200000704135, "p50": 0.09415100004162014, "p90": 0.09504199999810226, "mean": 0.09392180000986627, "iqr": 0.002310000013494573, "raw_times": [0.09240200000704135, 0.09273199998460768, 0.09504199999810226, 0.09528200001795994, 0.09415100004162014], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09613200001012956, "peak_bytes": 75522048, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 22 |
+
{"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S512_W4", "batch": 4, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09898100000782506, "p50": 0.09988099998281541, "p90": 0.10064200000670098, "mean": 0.1000036000050386, "iqr": 0.000919999990856013, "raw_times": [0.09972200001584497, 0.09898100000782506, 0.09988099998281541, 0.10079200001200661, 0.10064200000670098], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.10166200002004189, "peak_bytes": 75538432, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 23 |
+
{"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S2048_W2", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.4828179999663007, "p50": 0.4840080000008129, "p90": 0.4856980000340627, "mean": 0.48515199999883407, "iqr": 0.0026800000227922283, "raw_times": [0.4830180000112705, 0.4828179999663007, 0.4856980000340627, 0.49021799998172355, 0.4840080000008129], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.4863379999733297, "peak_bytes": 302014464, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 24 |
+
{"ts": "2025-12-19T18:56:42Z", "run": "26c66aa2ac5c4dd48f4b912d28da3939", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S2048_W4", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.49664799996662623, "p50": 0.4980280000381754, "p90": 0.5011090000266449, "mean": 0.4989826000155517, "iqr": 0.004230000001825829, "raw_times": [0.4980280000381754, 0.502249000021493, 0.49664799996662623, 0.4968790000248191, 0.5011090000266449], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.49949900000001435, "peak_bytes": 302030848, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
causal_conv1d/impls/hf_kernels_causal_conv1d.html
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
causal_conv1d/impls/torch_causal_conv1d.html
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
causal_conv1d/results/artifacts/combine/latency.svg
CHANGED
|
|
Git LFS Details
|
|
|
Git LFS Details
|
causal_conv1d/results/combined_results.html
CHANGED
|
@@ -3874,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 3874 |
<div class="system-info">
|
| 3875 |
<div class="system-info-header">Generated on:</div>
|
| 3876 |
<div class="system-info-content">
|
| 3877 |
-
Linux x86_64 | Linux-6.12.
|
| 3878 |
</div>
|
| 3879 |
</div>
|
| 3880 |
|
|
@@ -3889,11 +3889,11 @@ body[data-tool="eraser"] .main-content {
|
|
| 3889 |
<rdf:RDF>
|
| 3890 |
<ns2:Work>
|
| 3891 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 3892 |
-
<dc:date>2025-
|
| 3893 |
<dc:format>image/svg+xml</dc:format>
|
| 3894 |
<dc:creator>
|
| 3895 |
<ns2:Agent>
|
| 3896 |
-
<dc:title>Matplotlib v3.10.
|
| 3897 |
</ns2:Agent>
|
| 3898 |
</dc:creator>
|
| 3899 |
</ns2:Work>
|
|
@@ -4233,70 +4233,70 @@ body[data-tool="eraser"] .main-content {
|
|
| 4233 |
<g id="matplotlib.axis_2">
|
| 4234 |
<g id="ytick_1">
|
| 4235 |
<g id="grid-y--2" class="grid grid-y">
|
| 4236 |
-
<path d="M 47.72
|
| 4237 |
</g>
|
| 4238 |
<g id="line2d_25">
|
| 4239 |
<defs>
|
| 4240 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4241 |
</defs>
|
| 4242 |
<g>
|
| 4243 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="
|
| 4244 |
</g>
|
| 4245 |
</g>
|
| 4246 |
<g id="text_25">
|
| 4247 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="
|
| 4248 |
</g>
|
| 4249 |
</g>
|
| 4250 |
<g id="ytick_2">
|
| 4251 |
<g id="grid-y--3" class="grid grid-y">
|
| 4252 |
-
<path d="M 47.72
|
| 4253 |
</g>
|
| 4254 |
<g id="line2d_26">
|
| 4255 |
<g>
|
| 4256 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="
|
| 4257 |
</g>
|
| 4258 |
</g>
|
| 4259 |
<g id="text_26">
|
| 4260 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="
|
| 4261 |
</g>
|
| 4262 |
</g>
|
| 4263 |
<g id="ytick_3">
|
| 4264 |
<g id="grid-y--4" class="grid grid-y">
|
| 4265 |
-
<path d="M 47.72
|
| 4266 |
</g>
|
| 4267 |
<g id="line2d_27">
|
| 4268 |
<g>
|
| 4269 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="
|
| 4270 |
</g>
|
| 4271 |
</g>
|
| 4272 |
<g id="text_27">
|
| 4273 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="213.
|
| 4274 |
</g>
|
| 4275 |
</g>
|
| 4276 |
<g id="ytick_4">
|
| 4277 |
<g id="grid-y--5" class="grid grid-y">
|
| 4278 |
-
<path d="M 47.72
|
| 4279 |
</g>
|
| 4280 |
<g id="line2d_28">
|
| 4281 |
<g>
|
| 4282 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="
|
| 4283 |
</g>
|
| 4284 |
</g>
|
| 4285 |
<g id="text_28">
|
| 4286 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="
|
| 4287 |
</g>
|
| 4288 |
</g>
|
| 4289 |
<g id="ytick_5">
|
| 4290 |
<g id="grid-y--6" class="grid grid-y">
|
| 4291 |
-
<path d="M 47.72
|
| 4292 |
</g>
|
| 4293 |
<g id="line2d_29">
|
| 4294 |
<g>
|
| 4295 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="
|
| 4296 |
</g>
|
| 4297 |
</g>
|
| 4298 |
<g id="text_29">
|
| 4299 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="
|
| 4300 |
</g>
|
| 4301 |
</g>
|
| 4302 |
<g id="label--y" class="ylabel">
|
|
@@ -4304,66 +4304,66 @@ body[data-tool="eraser"] .main-content {
|
|
| 4304 |
</g>
|
| 4305 |
</g>
|
| 4306 |
<g id="series--hf-kernels-causal-conv1d" class="series">
|
| 4307 |
-
<path d="M 83.325193 420.186871 L 114.286231
|
| 4308 |
<defs>
|
| 4309 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4310 |
</defs>
|
| 4311 |
<g clip-path="url(#pb49fc4c8d2)">
|
| 4312 |
<use ns4:href="#md7efaf3aec" x="83.325193" y="420.186871" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4313 |
-
<use ns4:href="#md7efaf3aec" x="114.286231" y="
|
| 4314 |
-
<use ns4:href="#md7efaf3aec" x="145.247268" y="
|
| 4315 |
-
<use ns4:href="#md7efaf3aec" x="176.208306" y="
|
| 4316 |
-
<use ns4:href="#md7efaf3aec" x="207.169343" y="
|
| 4317 |
-
<use ns4:href="#md7efaf3aec" x="238.130381" y="
|
| 4318 |
-
<use ns4:href="#md7efaf3aec" x="269.091418" y="
|
| 4319 |
-
<use ns4:href="#md7efaf3aec" x="300.052455" y="
|
| 4320 |
-
<use ns4:href="#md7efaf3aec" x="331.013493" y="
|
| 4321 |
-
<use ns4:href="#md7efaf3aec" x="361.97453" y="
|
| 4322 |
-
<use ns4:href="#md7efaf3aec" x="392.935568" y="
|
| 4323 |
-
<use ns4:href="#md7efaf3aec" x="423.896605" y="
|
| 4324 |
-
<use ns4:href="#md7efaf3aec" x="454.857643" y="
|
| 4325 |
-
<use ns4:href="#md7efaf3aec" x="485.81868" y="
|
| 4326 |
-
<use ns4:href="#md7efaf3aec" x="516.779718" y="
|
| 4327 |
-
<use ns4:href="#md7efaf3aec" x="547.740755" y="
|
| 4328 |
-
<use ns4:href="#md7efaf3aec" x="578.701793" y="
|
| 4329 |
-
<use ns4:href="#md7efaf3aec" x="609.66283" y="
|
| 4330 |
-
<use ns4:href="#md7efaf3aec" x="640.623868" y="
|
| 4331 |
-
<use ns4:href="#md7efaf3aec" x="671.584905" y="
|
| 4332 |
-
<use ns4:href="#md7efaf3aec" x="702.545943" y="
|
| 4333 |
-
<use ns4:href="#md7efaf3aec" x="733.50698" y="
|
| 4334 |
-
<use ns4:href="#md7efaf3aec" x="764.468018" y="
|
| 4335 |
-
<use ns4:href="#md7efaf3aec" x="795.429055" y="
|
| 4336 |
</g>
|
| 4337 |
</g>
|
| 4338 |
<g id="series--torch-eager" class="series">
|
| 4339 |
-
<path d="M 83.325193 398.
|
| 4340 |
<defs>
|
| 4341 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4342 |
</defs>
|
| 4343 |
<g clip-path="url(#pb49fc4c8d2)">
|
| 4344 |
-
<use ns4:href="#m9b8c54d372" x="83.325193" y="398.
|
| 4345 |
-
<use ns4:href="#m9b8c54d372" x="114.286231" y="
|
| 4346 |
-
<use ns4:href="#m9b8c54d372" x="145.247268" y="
|
| 4347 |
-
<use ns4:href="#m9b8c54d372" x="176.208306" y="
|
| 4348 |
-
<use ns4:href="#m9b8c54d372" x="207.169343" y="
|
| 4349 |
-
<use ns4:href="#m9b8c54d372" x="238.130381" y="
|
| 4350 |
-
<use ns4:href="#m9b8c54d372" x="269.091418" y="
|
| 4351 |
-
<use ns4:href="#m9b8c54d372" x="300.052455" y="
|
| 4352 |
-
<use ns4:href="#m9b8c54d372" x="331.013493" y="390.
|
| 4353 |
-
<use ns4:href="#m9b8c54d372" x="361.97453" y="
|
| 4354 |
-
<use ns4:href="#m9b8c54d372" x="392.935568" y="
|
| 4355 |
-
<use ns4:href="#m9b8c54d372" x="423.896605" y="
|
| 4356 |
-
<use ns4:href="#m9b8c54d372" x="454.857643" y="390.
|
| 4357 |
-
<use ns4:href="#m9b8c54d372" x="485.81868" y="
|
| 4358 |
-
<use ns4:href="#m9b8c54d372" x="516.779718" y="
|
| 4359 |
-
<use ns4:href="#m9b8c54d372" x="547.740755" y="
|
| 4360 |
-
<use ns4:href="#m9b8c54d372" x="578.701793" y="
|
| 4361 |
-
<use ns4:href="#m9b8c54d372" x="609.66283" y="
|
| 4362 |
-
<use ns4:href="#m9b8c54d372" x="640.623868" y="
|
| 4363 |
-
<use ns4:href="#m9b8c54d372" x="671.584905" y="
|
| 4364 |
-
<use ns4:href="#m9b8c54d372" x="702.545943" y="380.
|
| 4365 |
-
<use ns4:href="#m9b8c54d372" x="733.50698" y="
|
| 4366 |
-
<use ns4:href="#m9b8c54d372" x="764.468018" y="
|
| 4367 |
<use ns4:href="#m9b8c54d372" x="795.429055" y="45.608899" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4368 |
</g>
|
| 4369 |
</g>
|
|
@@ -4422,7 +4422,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 4422 |
<span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
|
| 4423 |
<span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4424 |
</span> |
|
| 4425 |
-
Cell: combine | 4.
|
| 4426 |
| <button class="run-btn" onclick="runCell('combine')">▶ run</button>
|
| 4427 |
<button class="copy-btn" onclick="copyCell('combine')">Copy</button>
|
| 4428 |
<a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -4515,7 +4515,7 @@ hf_kernels_causal_conv1d cuda_B2_D2048_S2048_W2 0.05 True
|
|
| 4515 |
hf_kernels_causal_conv1d cuda_B2_D2048_S2048_W4 0.05 True
|
| 4516 |
hf_kernels_causal_conv1d cuda_B2_D2048_S512_W2 0.05 True
|
| 4517 |
hf_kernels_causal_conv1d cuda_B2_D2048_S512_W4 0.05 True
|
| 4518 |
-
hf_kernels_causal_conv1d cuda_B2_D64_S128_W2 0.
|
| 4519 |
hf_kernels_causal_conv1d cuda_B2_D64_S128_W4 0.05 True
|
| 4520 |
hf_kernels_causal_conv1d cuda_B2_D64_S2048_W2 0.05 True
|
| 4521 |
hf_kernels_causal_conv1d cuda_B2_D64_S2048_W4 0.05 True
|
|
@@ -4576,7 +4576,7 @@ Implementations included:
|
|
| 4576 |
<div class="uv-install-logs" id="uv-logs-combine">
|
| 4577 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4578 |
<div class="uv-logs-content" style="display: none;">
|
| 4579 |
-
Installed 37 packages in
|
| 4580 |
</div>
|
| 4581 |
</div>
|
| 4582 |
<div class="cell-artifacts">
|
|
@@ -4589,11 +4589,11 @@ Installed 37 packages in 336ms
|
|
| 4589 |
<rdf:RDF>
|
| 4590 |
<ns2:Work>
|
| 4591 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 4592 |
-
<dc:date>2025-
|
| 4593 |
<dc:format>image/svg+xml</dc:format>
|
| 4594 |
<dc:creator>
|
| 4595 |
<ns2:Agent>
|
| 4596 |
-
<dc:title>Matplotlib v3.10.
|
| 4597 |
</ns2:Agent>
|
| 4598 |
</dc:creator>
|
| 4599 |
</ns2:Work>
|
|
@@ -4933,70 +4933,70 @@ Installed 37 packages in 336ms
|
|
| 4933 |
<g id="matplotlib.axis_2">
|
| 4934 |
<g id="ytick_1">
|
| 4935 |
<g id="grid-y--2" class="grid grid-y">
|
| 4936 |
-
<path d="M 47.72
|
| 4937 |
</g>
|
| 4938 |
<g id="line2d_25">
|
| 4939 |
<defs>
|
| 4940 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4941 |
</defs>
|
| 4942 |
<g>
|
| 4943 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="
|
| 4944 |
</g>
|
| 4945 |
</g>
|
| 4946 |
<g id="text_25">
|
| 4947 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="
|
| 4948 |
</g>
|
| 4949 |
</g>
|
| 4950 |
<g id="ytick_2">
|
| 4951 |
<g id="grid-y--3" class="grid grid-y">
|
| 4952 |
-
<path d="M 47.72
|
| 4953 |
</g>
|
| 4954 |
<g id="line2d_26">
|
| 4955 |
<g>
|
| 4956 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="
|
| 4957 |
</g>
|
| 4958 |
</g>
|
| 4959 |
<g id="text_26">
|
| 4960 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="
|
| 4961 |
</g>
|
| 4962 |
</g>
|
| 4963 |
<g id="ytick_3">
|
| 4964 |
<g id="grid-y--4" class="grid grid-y">
|
| 4965 |
-
<path d="M 47.72
|
| 4966 |
</g>
|
| 4967 |
<g id="line2d_27">
|
| 4968 |
<g>
|
| 4969 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="
|
| 4970 |
</g>
|
| 4971 |
</g>
|
| 4972 |
<g id="text_27">
|
| 4973 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="213.
|
| 4974 |
</g>
|
| 4975 |
</g>
|
| 4976 |
<g id="ytick_4">
|
| 4977 |
<g id="grid-y--5" class="grid grid-y">
|
| 4978 |
-
<path d="M 47.72
|
| 4979 |
</g>
|
| 4980 |
<g id="line2d_28">
|
| 4981 |
<g>
|
| 4982 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="
|
| 4983 |
</g>
|
| 4984 |
</g>
|
| 4985 |
<g id="text_28">
|
| 4986 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="
|
| 4987 |
</g>
|
| 4988 |
</g>
|
| 4989 |
<g id="ytick_5">
|
| 4990 |
<g id="grid-y--6" class="grid grid-y">
|
| 4991 |
-
<path d="M 47.72
|
| 4992 |
</g>
|
| 4993 |
<g id="line2d_29">
|
| 4994 |
<g>
|
| 4995 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="
|
| 4996 |
</g>
|
| 4997 |
</g>
|
| 4998 |
<g id="text_29">
|
| 4999 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="
|
| 5000 |
</g>
|
| 5001 |
</g>
|
| 5002 |
<g id="label--y" class="ylabel">
|
|
@@ -5004,66 +5004,66 @@ Installed 37 packages in 336ms
|
|
| 5004 |
</g>
|
| 5005 |
</g>
|
| 5006 |
<g id="series--hf-kernels-causal-conv1d" class="series">
|
| 5007 |
-
<path d="M 83.325193 420.186871 L 114.286231
|
| 5008 |
<defs>
|
| 5009 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 5010 |
</defs>
|
| 5011 |
<g clip-path="url(#pb49fc4c8d2)">
|
| 5012 |
<use ns4:href="#md7efaf3aec" x="83.325193" y="420.186871" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5013 |
-
<use ns4:href="#md7efaf3aec" x="114.286231" y="
|
| 5014 |
-
<use ns4:href="#md7efaf3aec" x="145.247268" y="
|
| 5015 |
-
<use ns4:href="#md7efaf3aec" x="176.208306" y="
|
| 5016 |
-
<use ns4:href="#md7efaf3aec" x="207.169343" y="
|
| 5017 |
-
<use ns4:href="#md7efaf3aec" x="238.130381" y="
|
| 5018 |
-
<use ns4:href="#md7efaf3aec" x="269.091418" y="
|
| 5019 |
-
<use ns4:href="#md7efaf3aec" x="300.052455" y="
|
| 5020 |
-
<use ns4:href="#md7efaf3aec" x="331.013493" y="
|
| 5021 |
-
<use ns4:href="#md7efaf3aec" x="361.97453" y="
|
| 5022 |
-
<use ns4:href="#md7efaf3aec" x="392.935568" y="
|
| 5023 |
-
<use ns4:href="#md7efaf3aec" x="423.896605" y="
|
| 5024 |
-
<use ns4:href="#md7efaf3aec" x="454.857643" y="
|
| 5025 |
-
<use ns4:href="#md7efaf3aec" x="485.81868" y="
|
| 5026 |
-
<use ns4:href="#md7efaf3aec" x="516.779718" y="
|
| 5027 |
-
<use ns4:href="#md7efaf3aec" x="547.740755" y="
|
| 5028 |
-
<use ns4:href="#md7efaf3aec" x="578.701793" y="
|
| 5029 |
-
<use ns4:href="#md7efaf3aec" x="609.66283" y="
|
| 5030 |
-
<use ns4:href="#md7efaf3aec" x="640.623868" y="
|
| 5031 |
-
<use ns4:href="#md7efaf3aec" x="671.584905" y="
|
| 5032 |
-
<use ns4:href="#md7efaf3aec" x="702.545943" y="
|
| 5033 |
-
<use ns4:href="#md7efaf3aec" x="733.50698" y="
|
| 5034 |
-
<use ns4:href="#md7efaf3aec" x="764.468018" y="
|
| 5035 |
-
<use ns4:href="#md7efaf3aec" x="795.429055" y="
|
| 5036 |
</g>
|
| 5037 |
</g>
|
| 5038 |
<g id="series--torch-eager" class="series">
|
| 5039 |
-
<path d="M 83.325193 398.
|
| 5040 |
<defs>
|
| 5041 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 5042 |
</defs>
|
| 5043 |
<g clip-path="url(#pb49fc4c8d2)">
|
| 5044 |
-
<use ns4:href="#m9b8c54d372" x="83.325193" y="398.
|
| 5045 |
-
<use ns4:href="#m9b8c54d372" x="114.286231" y="
|
| 5046 |
-
<use ns4:href="#m9b8c54d372" x="145.247268" y="
|
| 5047 |
-
<use ns4:href="#m9b8c54d372" x="176.208306" y="
|
| 5048 |
-
<use ns4:href="#m9b8c54d372" x="207.169343" y="
|
| 5049 |
-
<use ns4:href="#m9b8c54d372" x="238.130381" y="
|
| 5050 |
-
<use ns4:href="#m9b8c54d372" x="269.091418" y="
|
| 5051 |
-
<use ns4:href="#m9b8c54d372" x="300.052455" y="
|
| 5052 |
-
<use ns4:href="#m9b8c54d372" x="331.013493" y="390.
|
| 5053 |
-
<use ns4:href="#m9b8c54d372" x="361.97453" y="
|
| 5054 |
-
<use ns4:href="#m9b8c54d372" x="392.935568" y="
|
| 5055 |
-
<use ns4:href="#m9b8c54d372" x="423.896605" y="
|
| 5056 |
-
<use ns4:href="#m9b8c54d372" x="454.857643" y="390.
|
| 5057 |
-
<use ns4:href="#m9b8c54d372" x="485.81868" y="
|
| 5058 |
-
<use ns4:href="#m9b8c54d372" x="516.779718" y="
|
| 5059 |
-
<use ns4:href="#m9b8c54d372" x="547.740755" y="
|
| 5060 |
-
<use ns4:href="#m9b8c54d372" x="578.701793" y="
|
| 5061 |
-
<use ns4:href="#m9b8c54d372" x="609.66283" y="
|
| 5062 |
-
<use ns4:href="#m9b8c54d372" x="640.623868" y="
|
| 5063 |
-
<use ns4:href="#m9b8c54d372" x="671.584905" y="
|
| 5064 |
-
<use ns4:href="#m9b8c54d372" x="702.545943" y="380.
|
| 5065 |
-
<use ns4:href="#m9b8c54d372" x="733.50698" y="
|
| 5066 |
-
<use ns4:href="#m9b8c54d372" x="764.468018" y="
|
| 5067 |
<use ns4:href="#m9b8c54d372" x="795.429055" y="45.608899" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5068 |
</g>
|
| 5069 |
</g>
|
|
|
|
| 3874 |
<div class="system-info">
|
| 3875 |
<div class="system-info-header">Generated on:</div>
|
| 3876 |
<div class="system-info-content">
|
| 3877 |
+
Linux x86_64 | Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35
|
| 3878 |
</div>
|
| 3879 |
</div>
|
| 3880 |
|
|
|
|
| 3889 |
<rdf:RDF>
|
| 3890 |
<ns2:Work>
|
| 3891 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 3892 |
+
<dc:date>2025-12-19T19:09:46.065014</dc:date>
|
| 3893 |
<dc:format>image/svg+xml</dc:format>
|
| 3894 |
<dc:creator>
|
| 3895 |
<ns2:Agent>
|
| 3896 |
+
<dc:title>Matplotlib v3.10.8, https://matplotlib.org/</dc:title>
|
| 3897 |
</ns2:Agent>
|
| 3898 |
</dc:creator>
|
| 3899 |
</ns2:Work>
|
|
|
|
| 4233 |
<g id="matplotlib.axis_2">
|
| 4234 |
<g id="ytick_1">
|
| 4235 |
<g id="grid-y--2" class="grid grid-y">
|
| 4236 |
+
<path d="M 47.72 375.695489 L 831.034248 375.695489 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4237 |
</g>
|
| 4238 |
<g id="line2d_25">
|
| 4239 |
<defs>
|
| 4240 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4241 |
</defs>
|
| 4242 |
<g>
|
| 4243 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="375.695489" style="stroke: #000000; stroke-width: 0.8" />
|
| 4244 |
</g>
|
| 4245 |
</g>
|
| 4246 |
<g id="text_25">
|
| 4247 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="379.494708" transform="rotate(-0 40.72 379.494708)">0.1</text>
|
| 4248 |
</g>
|
| 4249 |
</g>
|
| 4250 |
<g id="ytick_2">
|
| 4251 |
<g id="grid-y--3" class="grid grid-y">
|
| 4252 |
+
<path d="M 47.72 292.764994 L 831.034248 292.764994 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4253 |
</g>
|
| 4254 |
<g id="line2d_26">
|
| 4255 |
<g>
|
| 4256 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="292.764994" style="stroke: #000000; stroke-width: 0.8" />
|
| 4257 |
</g>
|
| 4258 |
</g>
|
| 4259 |
<g id="text_26">
|
| 4260 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="296.564213" transform="rotate(-0 40.72 296.564213)">0.2</text>
|
| 4261 |
</g>
|
| 4262 |
</g>
|
| 4263 |
<g id="ytick_3">
|
| 4264 |
<g id="grid-y--4" class="grid grid-y">
|
| 4265 |
+
<path d="M 47.72 209.834499 L 831.034248 209.834499 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4266 |
</g>
|
| 4267 |
<g id="line2d_27">
|
| 4268 |
<g>
|
| 4269 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="209.834499" style="stroke: #000000; stroke-width: 0.8" />
|
| 4270 |
</g>
|
| 4271 |
</g>
|
| 4272 |
<g id="text_27">
|
| 4273 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="213.633718" transform="rotate(-0 40.72 213.633718)">0.3</text>
|
| 4274 |
</g>
|
| 4275 |
</g>
|
| 4276 |
<g id="ytick_4">
|
| 4277 |
<g id="grid-y--5" class="grid grid-y">
|
| 4278 |
+
<path d="M 47.72 126.904004 L 831.034248 126.904004 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4279 |
</g>
|
| 4280 |
<g id="line2d_28">
|
| 4281 |
<g>
|
| 4282 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="126.904004" style="stroke: #000000; stroke-width: 0.8" />
|
| 4283 |
</g>
|
| 4284 |
</g>
|
| 4285 |
<g id="text_28">
|
| 4286 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="130.703223" transform="rotate(-0 40.72 130.703223)">0.4</text>
|
| 4287 |
</g>
|
| 4288 |
</g>
|
| 4289 |
<g id="ytick_5">
|
| 4290 |
<g id="grid-y--6" class="grid grid-y">
|
| 4291 |
+
<path d="M 47.72 43.973509 L 831.034248 43.973509 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4292 |
</g>
|
| 4293 |
<g id="line2d_29">
|
| 4294 |
<g>
|
| 4295 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="43.973509" style="stroke: #000000; stroke-width: 0.8" />
|
| 4296 |
</g>
|
| 4297 |
</g>
|
| 4298 |
<g id="text_29">
|
| 4299 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="47.772728" transform="rotate(-0 40.72 47.772728)">0.5</text>
|
| 4300 |
</g>
|
| 4301 |
</g>
|
| 4302 |
<g id="label--y" class="ylabel">
|
|
|
|
| 4304 |
</g>
|
| 4305 |
</g>
|
| 4306 |
<g id="series--hf-kernels-causal-conv1d" class="series">
|
| 4307 |
+
<path d="M 83.325193 420.186871 L 114.286231 415.418367 L 145.247268 416.273381 L 176.208306 416.636616 L 207.169343 417.035512 L 238.130381 417.110979 L 269.091418 417.475043 L 300.052455 417.781886 L 331.013493 417.616025 L 361.97453 417.516509 L 392.935568 417.674077 L 423.896605 417.018926 L 454.857643 417.358941 L 485.81868 417.823352 L 516.779718 416.969997 L 547.740755 416.968338 L 578.701793 417.640904 L 609.66283 417.300889 L 640.623868 417.38382 L 671.584905 417.980919 L 702.545943 416.836479 L 733.50698 417.110149 L 764.468018 417.450164 L 795.429055 417.582853 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 4308 |
<defs>
|
| 4309 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4310 |
</defs>
|
| 4311 |
<g clip-path="url(#pb49fc4c8d2)">
|
| 4312 |
<use ns4:href="#md7efaf3aec" x="83.325193" y="420.186871" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4313 |
+
<use ns4:href="#md7efaf3aec" x="114.286231" y="415.418367" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4314 |
+
<use ns4:href="#md7efaf3aec" x="145.247268" y="416.273381" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4315 |
+
<use ns4:href="#md7efaf3aec" x="176.208306" y="416.636616" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4316 |
+
<use ns4:href="#md7efaf3aec" x="207.169343" y="417.035512" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4317 |
+
<use ns4:href="#md7efaf3aec" x="238.130381" y="417.110979" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4318 |
+
<use ns4:href="#md7efaf3aec" x="269.091418" y="417.475043" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4319 |
+
<use ns4:href="#md7efaf3aec" x="300.052455" y="417.781886" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4320 |
+
<use ns4:href="#md7efaf3aec" x="331.013493" y="417.616025" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4321 |
+
<use ns4:href="#md7efaf3aec" x="361.97453" y="417.516509" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4322 |
+
<use ns4:href="#md7efaf3aec" x="392.935568" y="417.674077" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4323 |
+
<use ns4:href="#md7efaf3aec" x="423.896605" y="417.018926" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4324 |
+
<use ns4:href="#md7efaf3aec" x="454.857643" y="417.358941" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4325 |
+
<use ns4:href="#md7efaf3aec" x="485.81868" y="417.823352" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4326 |
+
<use ns4:href="#md7efaf3aec" x="516.779718" y="416.969997" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4327 |
+
<use ns4:href="#md7efaf3aec" x="547.740755" y="416.968338" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4328 |
+
<use ns4:href="#md7efaf3aec" x="578.701793" y="417.640904" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4329 |
+
<use ns4:href="#md7efaf3aec" x="609.66283" y="417.300889" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4330 |
+
<use ns4:href="#md7efaf3aec" x="640.623868" y="417.38382" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4331 |
+
<use ns4:href="#md7efaf3aec" x="671.584905" y="417.980919" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4332 |
+
<use ns4:href="#md7efaf3aec" x="702.545943" y="416.836479" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4333 |
+
<use ns4:href="#md7efaf3aec" x="733.50698" y="417.110149" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4334 |
+
<use ns4:href="#md7efaf3aec" x="764.468018" y="417.450164" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4335 |
+
<use ns4:href="#md7efaf3aec" x="795.429055" y="417.582853" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4336 |
</g>
|
| 4337 |
</g>
|
| 4338 |
<g id="series--torch-eager" class="series">
|
| 4339 |
+
<path d="M 83.325193 398.251755 L 114.286231 389.253796 L 145.247268 390.058222 L 176.208306 391.19354 L 207.169343 391.293886 L 238.130381 390.514339 L 269.091418 392.14807 L 300.052455 391.725125 L 331.013493 390.70425 L 361.97453 390.564098 L 392.935568 326.689372 L 423.896605 322.484796 L 454.857643 390.539219 L 485.81868 392.305638 L 516.779718 392.105776 L 547.740755 390.821182 L 578.701793 391.343644 L 609.66283 390.687664 L 640.623868 392.031968 L 671.584905 390.522632 L 702.545943 380.546094 L 733.50698 375.794177 L 764.468018 57.235754 L 795.429055 45.608899 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 4340 |
<defs>
|
| 4341 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4342 |
</defs>
|
| 4343 |
<g clip-path="url(#pb49fc4c8d2)">
|
| 4344 |
+
<use ns4:href="#m9b8c54d372" x="83.325193" y="398.251755" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4345 |
+
<use ns4:href="#m9b8c54d372" x="114.286231" y="389.253796" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4346 |
+
<use ns4:href="#m9b8c54d372" x="145.247268" y="390.058222" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4347 |
+
<use ns4:href="#m9b8c54d372" x="176.208306" y="391.19354" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4348 |
+
<use ns4:href="#m9b8c54d372" x="207.169343" y="391.293886" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4349 |
+
<use ns4:href="#m9b8c54d372" x="238.130381" y="390.514339" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4350 |
+
<use ns4:href="#m9b8c54d372" x="269.091418" y="392.14807" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4351 |
+
<use ns4:href="#m9b8c54d372" x="300.052455" y="391.725125" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4352 |
+
<use ns4:href="#m9b8c54d372" x="331.013493" y="390.70425" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4353 |
+
<use ns4:href="#m9b8c54d372" x="361.97453" y="390.564098" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4354 |
+
<use ns4:href="#m9b8c54d372" x="392.935568" y="326.689372" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4355 |
+
<use ns4:href="#m9b8c54d372" x="423.896605" y="322.484796" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4356 |
+
<use ns4:href="#m9b8c54d372" x="454.857643" y="390.539219" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4357 |
+
<use ns4:href="#m9b8c54d372" x="485.81868" y="392.305638" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4358 |
+
<use ns4:href="#m9b8c54d372" x="516.779718" y="392.105776" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4359 |
+
<use ns4:href="#m9b8c54d372" x="547.740755" y="390.821182" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4360 |
+
<use ns4:href="#m9b8c54d372" x="578.701793" y="391.343644" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4361 |
+
<use ns4:href="#m9b8c54d372" x="609.66283" y="390.687664" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4362 |
+
<use ns4:href="#m9b8c54d372" x="640.623868" y="392.031968" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4363 |
+
<use ns4:href="#m9b8c54d372" x="671.584905" y="390.522632" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4364 |
+
<use ns4:href="#m9b8c54d372" x="702.545943" y="380.546094" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4365 |
+
<use ns4:href="#m9b8c54d372" x="733.50698" y="375.794177" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4366 |
+
<use ns4:href="#m9b8c54d372" x="764.468018" y="57.235754" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4367 |
<use ns4:href="#m9b8c54d372" x="795.429055" y="45.608899" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4368 |
</g>
|
| 4369 |
</g>
|
|
|
|
| 4422 |
<span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
|
| 4423 |
<span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4424 |
</span> |
|
| 4425 |
+
Cell: combine | 4.61s
|
| 4426 |
| <button class="run-btn" onclick="runCell('combine')">▶ run</button>
|
| 4427 |
<button class="copy-btn" onclick="copyCell('combine')">Copy</button>
|
| 4428 |
<a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 4515 |
hf_kernels_causal_conv1d cuda_B2_D2048_S2048_W4 0.05 True
|
| 4516 |
hf_kernels_causal_conv1d cuda_B2_D2048_S512_W2 0.05 True
|
| 4517 |
hf_kernels_causal_conv1d cuda_B2_D2048_S512_W4 0.05 True
|
| 4518 |
+
hf_kernels_causal_conv1d cuda_B2_D64_S128_W2 0.05 True
|
| 4519 |
hf_kernels_causal_conv1d cuda_B2_D64_S128_W4 0.05 True
|
| 4520 |
hf_kernels_causal_conv1d cuda_B2_D64_S2048_W2 0.05 True
|
| 4521 |
hf_kernels_causal_conv1d cuda_B2_D64_S2048_W4 0.05 True
|
|
|
|
| 4576 |
<div class="uv-install-logs" id="uv-logs-combine">
|
| 4577 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4578 |
<div class="uv-logs-content" style="display: none;">
|
| 4579 |
+
Installed 37 packages in 314ms
|
| 4580 |
</div>
|
| 4581 |
</div>
|
| 4582 |
<div class="cell-artifacts">
|
|
|
|
| 4589 |
<rdf:RDF>
|
| 4590 |
<ns2:Work>
|
| 4591 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 4592 |
+
<dc:date>2025-12-19T19:09:46.065014</dc:date>
|
| 4593 |
<dc:format>image/svg+xml</dc:format>
|
| 4594 |
<dc:creator>
|
| 4595 |
<ns2:Agent>
|
| 4596 |
+
<dc:title>Matplotlib v3.10.8, https://matplotlib.org/</dc:title>
|
| 4597 |
</ns2:Agent>
|
| 4598 |
</dc:creator>
|
| 4599 |
</ns2:Work>
|
|
|
|
| 4933 |
<g id="matplotlib.axis_2">
|
| 4934 |
<g id="ytick_1">
|
| 4935 |
<g id="grid-y--2" class="grid grid-y">
|
| 4936 |
+
<path d="M 47.72 375.695489 L 831.034248 375.695489 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4937 |
</g>
|
| 4938 |
<g id="line2d_25">
|
| 4939 |
<defs>
|
| 4940 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4941 |
</defs>
|
| 4942 |
<g>
|
| 4943 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="375.695489" style="stroke: #000000; stroke-width: 0.8" />
|
| 4944 |
</g>
|
| 4945 |
</g>
|
| 4946 |
<g id="text_25">
|
| 4947 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="379.494708" transform="rotate(-0 40.72 379.494708)">0.1</text>
|
| 4948 |
</g>
|
| 4949 |
</g>
|
| 4950 |
<g id="ytick_2">
|
| 4951 |
<g id="grid-y--3" class="grid grid-y">
|
| 4952 |
+
<path d="M 47.72 292.764994 L 831.034248 292.764994 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4953 |
</g>
|
| 4954 |
<g id="line2d_26">
|
| 4955 |
<g>
|
| 4956 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="292.764994" style="stroke: #000000; stroke-width: 0.8" />
|
| 4957 |
</g>
|
| 4958 |
</g>
|
| 4959 |
<g id="text_26">
|
| 4960 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="296.564213" transform="rotate(-0 40.72 296.564213)">0.2</text>
|
| 4961 |
</g>
|
| 4962 |
</g>
|
| 4963 |
<g id="ytick_3">
|
| 4964 |
<g id="grid-y--4" class="grid grid-y">
|
| 4965 |
+
<path d="M 47.72 209.834499 L 831.034248 209.834499 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4966 |
</g>
|
| 4967 |
<g id="line2d_27">
|
| 4968 |
<g>
|
| 4969 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="209.834499" style="stroke: #000000; stroke-width: 0.8" />
|
| 4970 |
</g>
|
| 4971 |
</g>
|
| 4972 |
<g id="text_27">
|
| 4973 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="213.633718" transform="rotate(-0 40.72 213.633718)">0.3</text>
|
| 4974 |
</g>
|
| 4975 |
</g>
|
| 4976 |
<g id="ytick_4">
|
| 4977 |
<g id="grid-y--5" class="grid grid-y">
|
| 4978 |
+
<path d="M 47.72 126.904004 L 831.034248 126.904004 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4979 |
</g>
|
| 4980 |
<g id="line2d_28">
|
| 4981 |
<g>
|
| 4982 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="126.904004" style="stroke: #000000; stroke-width: 0.8" />
|
| 4983 |
</g>
|
| 4984 |
</g>
|
| 4985 |
<g id="text_28">
|
| 4986 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="130.703223" transform="rotate(-0 40.72 130.703223)">0.4</text>
|
| 4987 |
</g>
|
| 4988 |
</g>
|
| 4989 |
<g id="ytick_5">
|
| 4990 |
<g id="grid-y--6" class="grid grid-y">
|
| 4991 |
+
<path d="M 47.72 43.973509 L 831.034248 43.973509 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4992 |
</g>
|
| 4993 |
<g id="line2d_29">
|
| 4994 |
<g>
|
| 4995 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="43.973509" style="stroke: #000000; stroke-width: 0.8" />
|
| 4996 |
</g>
|
| 4997 |
</g>
|
| 4998 |
<g id="text_29">
|
| 4999 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="47.772728" transform="rotate(-0 40.72 47.772728)">0.5</text>
|
| 5000 |
</g>
|
| 5001 |
</g>
|
| 5002 |
<g id="label--y" class="ylabel">
|
|
|
|
| 5004 |
</g>
|
| 5005 |
</g>
|
| 5006 |
<g id="series--hf-kernels-causal-conv1d" class="series">
|
| 5007 |
+
<path d="M 83.325193 420.186871 L 114.286231 415.418367 L 145.247268 416.273381 L 176.208306 416.636616 L 207.169343 417.035512 L 238.130381 417.110979 L 269.091418 417.475043 L 300.052455 417.781886 L 331.013493 417.616025 L 361.97453 417.516509 L 392.935568 417.674077 L 423.896605 417.018926 L 454.857643 417.358941 L 485.81868 417.823352 L 516.779718 416.969997 L 547.740755 416.968338 L 578.701793 417.640904 L 609.66283 417.300889 L 640.623868 417.38382 L 671.584905 417.980919 L 702.545943 416.836479 L 733.50698 417.110149 L 764.468018 417.450164 L 795.429055 417.582853 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 5008 |
<defs>
|
| 5009 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 5010 |
</defs>
|
| 5011 |
<g clip-path="url(#pb49fc4c8d2)">
|
| 5012 |
<use ns4:href="#md7efaf3aec" x="83.325193" y="420.186871" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5013 |
+
<use ns4:href="#md7efaf3aec" x="114.286231" y="415.418367" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5014 |
+
<use ns4:href="#md7efaf3aec" x="145.247268" y="416.273381" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5015 |
+
<use ns4:href="#md7efaf3aec" x="176.208306" y="416.636616" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5016 |
+
<use ns4:href="#md7efaf3aec" x="207.169343" y="417.035512" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5017 |
+
<use ns4:href="#md7efaf3aec" x="238.130381" y="417.110979" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5018 |
+
<use ns4:href="#md7efaf3aec" x="269.091418" y="417.475043" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5019 |
+
<use ns4:href="#md7efaf3aec" x="300.052455" y="417.781886" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5020 |
+
<use ns4:href="#md7efaf3aec" x="331.013493" y="417.616025" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5021 |
+
<use ns4:href="#md7efaf3aec" x="361.97453" y="417.516509" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5022 |
+
<use ns4:href="#md7efaf3aec" x="392.935568" y="417.674077" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5023 |
+
<use ns4:href="#md7efaf3aec" x="423.896605" y="417.018926" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5024 |
+
<use ns4:href="#md7efaf3aec" x="454.857643" y="417.358941" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5025 |
+
<use ns4:href="#md7efaf3aec" x="485.81868" y="417.823352" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5026 |
+
<use ns4:href="#md7efaf3aec" x="516.779718" y="416.969997" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5027 |
+
<use ns4:href="#md7efaf3aec" x="547.740755" y="416.968338" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5028 |
+
<use ns4:href="#md7efaf3aec" x="578.701793" y="417.640904" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5029 |
+
<use ns4:href="#md7efaf3aec" x="609.66283" y="417.300889" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5030 |
+
<use ns4:href="#md7efaf3aec" x="640.623868" y="417.38382" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5031 |
+
<use ns4:href="#md7efaf3aec" x="671.584905" y="417.980919" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5032 |
+
<use ns4:href="#md7efaf3aec" x="702.545943" y="416.836479" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5033 |
+
<use ns4:href="#md7efaf3aec" x="733.50698" y="417.110149" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5034 |
+
<use ns4:href="#md7efaf3aec" x="764.468018" y="417.450164" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5035 |
+
<use ns4:href="#md7efaf3aec" x="795.429055" y="417.582853" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5036 |
</g>
|
| 5037 |
</g>
|
| 5038 |
<g id="series--torch-eager" class="series">
|
| 5039 |
+
<path d="M 83.325193 398.251755 L 114.286231 389.253796 L 145.247268 390.058222 L 176.208306 391.19354 L 207.169343 391.293886 L 238.130381 390.514339 L 269.091418 392.14807 L 300.052455 391.725125 L 331.013493 390.70425 L 361.97453 390.564098 L 392.935568 326.689372 L 423.896605 322.484796 L 454.857643 390.539219 L 485.81868 392.305638 L 516.779718 392.105776 L 547.740755 390.821182 L 578.701793 391.343644 L 609.66283 390.687664 L 640.623868 392.031968 L 671.584905 390.522632 L 702.545943 380.546094 L 733.50698 375.794177 L 764.468018 57.235754 L 795.429055 45.608899 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 5040 |
<defs>
|
| 5041 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 5042 |
</defs>
|
| 5043 |
<g clip-path="url(#pb49fc4c8d2)">
|
| 5044 |
+
<use ns4:href="#m9b8c54d372" x="83.325193" y="398.251755" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5045 |
+
<use ns4:href="#m9b8c54d372" x="114.286231" y="389.253796" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5046 |
+
<use ns4:href="#m9b8c54d372" x="145.247268" y="390.058222" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5047 |
+
<use ns4:href="#m9b8c54d372" x="176.208306" y="391.19354" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5048 |
+
<use ns4:href="#m9b8c54d372" x="207.169343" y="391.293886" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5049 |
+
<use ns4:href="#m9b8c54d372" x="238.130381" y="390.514339" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5050 |
+
<use ns4:href="#m9b8c54d372" x="269.091418" y="392.14807" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5051 |
+
<use ns4:href="#m9b8c54d372" x="300.052455" y="391.725125" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5052 |
+
<use ns4:href="#m9b8c54d372" x="331.013493" y="390.70425" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5053 |
+
<use ns4:href="#m9b8c54d372" x="361.97453" y="390.564098" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5054 |
+
<use ns4:href="#m9b8c54d372" x="392.935568" y="326.689372" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5055 |
+
<use ns4:href="#m9b8c54d372" x="423.896605" y="322.484796" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5056 |
+
<use ns4:href="#m9b8c54d372" x="454.857643" y="390.539219" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5057 |
+
<use ns4:href="#m9b8c54d372" x="485.81868" y="392.305638" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5058 |
+
<use ns4:href="#m9b8c54d372" x="516.779718" y="392.105776" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5059 |
+
<use ns4:href="#m9b8c54d372" x="547.740755" y="390.821182" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5060 |
+
<use ns4:href="#m9b8c54d372" x="578.701793" y="391.343644" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5061 |
+
<use ns4:href="#m9b8c54d372" x="609.66283" y="390.687664" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5062 |
+
<use ns4:href="#m9b8c54d372" x="640.623868" y="392.031968" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5063 |
+
<use ns4:href="#m9b8c54d372" x="671.584905" y="390.522632" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5064 |
+
<use ns4:href="#m9b8c54d372" x="702.545943" y="380.546094" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5065 |
+
<use ns4:href="#m9b8c54d372" x="733.50698" y="375.794177" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5066 |
+
<use ns4:href="#m9b8c54d372" x="764.468018" y="57.235754" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5067 |
<use ns4:href="#m9b8c54d372" x="795.429055" y="45.608899" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5068 |
</g>
|
| 5069 |
</g>
|
deformable_detr/impls/artifacts/benchmark/deformable_detr.jsonl
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
{"ts": "2025-
|
| 2 |
-
{"ts": "2025-
|
| 3 |
-
{"ts": "2025-
|
| 4 |
-
{"ts": "2025-
|
|
|
|
| 1 |
+
{"ts": "2025-12-19T19:09:31Z", "run": "04da14902c784090beeb85878bc3f422", "impl": "hf_kernels_deformable_detr", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_Q100_H8_E256_L4_P4", "batch_size": 1, "num_queries": 100, "num_heads": 8, "embed_dim": 256, "num_levels": 4, "num_points": 4, "spatial_shapes": [[32, 32], [16, 16], [8, 8], [4, 4]], "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.03506000007291732, "p50": 0.03723999998328509, "p90": 0.03738000009434472, "mean": 0.036888000067847315, "iqr": 0.0004900000476482091, "raw_times": [0.03506000007291732, 0.03787000014199293, 0.03738000009434472, 0.036890000046696514, 0.03723999998328509], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04449099992598349, "peak_bytes": 2264064, "ok": true, "absmax": 7.152557373046875e-07, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax": 7.152557373046875e-07, "mae": 5.539113701047427e-08, "mse": 6.418638644407112e-15, "ref": "deformable_detr_torch"}, "err": null}
|
| 2 |
+
{"ts": "2025-12-19T19:09:31Z", "run": "04da14902c784090beeb85878bc3f422", "impl": "hf_kernels_deformable_detr", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_Q300_H8_E256_L4_P4", "batch_size": 1, "num_queries": 300, "num_heads": 8, "embed_dim": 256, "num_levels": 4, "num_points": 4, "spatial_shapes": [[32, 32], [16, 16], [8, 8], [4, 4]], "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04158100000495324, "p50": 0.04245099989930168, "p90": 0.04307099993638985, "mean": 0.04284299998289498, "iqr": 0.0008199999683711212, "raw_times": [0.04158100000495324, 0.04245099989930168, 0.0448610001058114, 0.04307099993638985, 0.04225099996801873], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.046680999957970926, "peak_bytes": 4004864, "ok": true, "absmax": 7.152557373046875e-07, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax": 7.152557373046875e-07, "mae": 5.559346050176828e-08, "mse": 6.4289483059246175e-15, "ref": "deformable_detr_torch"}, "err": null}
|
| 3 |
+
{"ts": "2025-12-19T19:09:31Z", "run": "04da14902c784090beeb85878bc3f422", "impl": "hf_kernels_deformable_detr", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_Q100_H8_E256_L4_P4", "batch_size": 2, "num_queries": 100, "num_heads": 8, "embed_dim": 256, "num_levels": 4, "num_points": 4, "spatial_shapes": [[32, 32], [16, 16], [8, 8], [4, 4]], "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04193000017949089, "p50": 0.04463999994186452, "p90": 0.0456009997833462, "mean": 0.044314399929135107, "iqr": 0.001990999862755416, "raw_times": [0.04193000017949089, 0.04463999994186452, 0.04579099982038315, 0.0456009997833462, 0.04360999992059078], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04799999987881165, "peak_bytes": 5459968, "ok": true, "absmax": 7.152557373046875e-07, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax": 7.152557373046875e-07, "mae": 5.555110149657594e-08, "mse": 6.418781369458724e-15, "ref": "deformable_detr_torch"}, "err": null}
|
| 4 |
+
{"ts": "2025-12-19T19:09:31Z", "run": "04da14902c784090beeb85878bc3f422", "impl": "hf_kernels_deformable_detr", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_Q300_H8_E256_L4_P4", "batch_size": 2, "num_queries": 300, "num_heads": 8, "embed_dim": 256, "num_levels": 4, "num_points": 4, "spatial_shapes": [[32, 32], [16, 16], [8, 8], [4, 4]], "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.044340000158626935, "p50": 0.0453610000477056, "p90": 0.045860999989599804, "mean": 0.04539080005088181, "iqr": 0.000529999852005858, "raw_times": [0.045860999989599804, 0.045331000137593946, 0.044340000158626935, 0.04606099992088275, 0.0453610000477056], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04745000001094013, "peak_bytes": 8008704, "ok": true, "absmax": 7.152557373046875e-07, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax": 7.152557373046875e-07, "mae": 5.5905669427147586e-08, "mse": 6.485184940875199e-15, "ref": "deformable_detr_torch"}, "err": null}
|
deformable_detr/impls/cells/benchmark.py
CHANGED
|
@@ -4,6 +4,7 @@
|
|
| 4 |
# "numpy",
|
| 5 |
# "torch==2.8.0",
|
| 6 |
# "kernels-benchmark-tools",
|
|
|
|
| 7 |
# ]
|
| 8 |
#
|
| 9 |
# [tool.uv.sources]
|
|
@@ -12,107 +13,30 @@
|
|
| 12 |
import torch
|
| 13 |
import sys
|
| 14 |
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
|
|
|
|
| 15 |
|
|
|
|
|
|
|
| 16 |
|
| 17 |
-
|
|
|
|
| 18 |
value, spatial_shapes, level_start_index, sampling_locations, attention_weights, im2col_step=64
|
| 19 |
):
|
| 20 |
-
"""
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
# Split value tensor by levels
|
| 30 |
-
value_list = value.split([int(h * w) for h, w in spatial_shapes.tolist()], dim=1)
|
| 31 |
-
|
| 32 |
-
# Iterate through each level (can't avoid this loop easily)
|
| 33 |
-
for level_idx in range(num_levels):
|
| 34 |
-
h, w = spatial_shapes[level_idx].tolist()
|
| 35 |
-
value_level = value_list[level_idx] # (bs, h*w, num_heads, channels)
|
| 36 |
-
|
| 37 |
-
# Reshape to spatial grid: (bs, num_heads, channels, h, w)
|
| 38 |
-
value_spatial = value_level.reshape(bs, h, w, num_heads, channels).permute(0, 3, 4, 1, 2)
|
| 39 |
-
|
| 40 |
-
# Get sampling locations and weights for this level
|
| 41 |
-
# loc: (bs, num_queries, num_heads, num_points, 2)
|
| 42 |
-
loc = sampling_locations[:, :, :, level_idx, :, :]
|
| 43 |
-
# weight: (bs, num_queries, num_heads, num_points)
|
| 44 |
-
weight = attention_weights[:, :, :, level_idx, :]
|
| 45 |
-
|
| 46 |
-
# Convert normalized coordinates to pixel coordinates
|
| 47 |
-
# loc[..., 0] is x (width), loc[..., 1] is y (height)
|
| 48 |
-
x = loc[..., 0] * w - 0.5 # (bs, num_queries, num_heads, num_points)
|
| 49 |
-
y = loc[..., 1] * h - 0.5
|
| 50 |
-
|
| 51 |
-
# Get integer coordinates for bilinear interpolation
|
| 52 |
-
x0 = torch.floor(x).long()
|
| 53 |
-
y0 = torch.floor(y).long()
|
| 54 |
-
x1 = x0 + 1
|
| 55 |
-
y1 = y0 + 1
|
| 56 |
-
|
| 57 |
-
# Compute interpolation weights BEFORE clamping (important!)
|
| 58 |
-
lw = x - x0.float() # weight for x direction
|
| 59 |
-
lh = y - y0.float() # weight for y direction
|
| 60 |
-
hw = 1 - lw
|
| 61 |
-
hh = 1 - lh
|
| 62 |
-
|
| 63 |
-
# Create mask for valid sample locations
|
| 64 |
-
valid = (y > -1) & (x > -1) & (y < h) & (x < w)
|
| 65 |
-
|
| 66 |
-
# Create masks for each corner being in bounds
|
| 67 |
-
mask_tl = ((y0 >= 0) & (x0 >= 0)).unsqueeze(-1).float()
|
| 68 |
-
mask_tr = ((y0 >= 0) & (x1 <= w - 1)).unsqueeze(-1).float()
|
| 69 |
-
mask_bl = ((y1 <= h - 1) & (x0 >= 0)).unsqueeze(-1).float()
|
| 70 |
-
mask_br = ((y1 <= h - 1) & (x1 <= w - 1)).unsqueeze(-1).float()
|
| 71 |
-
|
| 72 |
-
# Clamp coordinates for safe indexing
|
| 73 |
-
x0_clamped = torch.clamp(x0, 0, w - 1)
|
| 74 |
-
x1_clamped = torch.clamp(x1, 0, w - 1)
|
| 75 |
-
y0_clamped = torch.clamp(y0, 0, h - 1)
|
| 76 |
-
y1_clamped = torch.clamp(y1, 0, h - 1)
|
| 77 |
-
|
| 78 |
-
# Bilinear interpolation weights for all 4 corners
|
| 79 |
-
w_tl = (hh * hw).unsqueeze(-1) # top-left: (bs, num_queries, num_heads, num_points, 1)
|
| 80 |
-
w_tr = (hh * lw).unsqueeze(-1) # top-right
|
| 81 |
-
w_bl = (lh * hw).unsqueeze(-1) # bottom-left
|
| 82 |
-
w_br = (lh * lw).unsqueeze(-1) # bottom-right
|
| 83 |
-
|
| 84 |
-
# Gather values from the 4 corners using advanced indexing
|
| 85 |
-
batch_idx = torch.arange(bs, device=value.device).view(bs, 1, 1, 1).expand(bs, num_queries, num_heads, num_points)
|
| 86 |
-
head_idx = torch.arange(num_heads, device=value.device).view(1, 1, num_heads, 1).expand(bs, num_queries, num_heads, num_points)
|
| 87 |
-
|
| 88 |
-
# Gather corner values with clamped indices, then apply corner masks
|
| 89 |
-
v_tl = value_spatial[batch_idx, head_idx, :, y0_clamped, x0_clamped] * mask_tl
|
| 90 |
-
v_tr = value_spatial[batch_idx, head_idx, :, y0_clamped, x1_clamped] * mask_tr
|
| 91 |
-
v_bl = value_spatial[batch_idx, head_idx, :, y1_clamped, x0_clamped] * mask_bl
|
| 92 |
-
v_br = value_spatial[batch_idx, head_idx, :, y1_clamped, x1_clamped] * mask_br
|
| 93 |
-
|
| 94 |
-
# Bilinear interpolation
|
| 95 |
-
sampled = w_tl * v_tl + w_tr * v_tr + w_bl * v_bl + w_br * v_br
|
| 96 |
-
|
| 97 |
-
# Apply valid mask (only accumulate if entire sample location is valid)
|
| 98 |
-
sampled = sampled * valid.unsqueeze(-1).float()
|
| 99 |
-
|
| 100 |
-
# Apply attention weights and sum over points
|
| 101 |
-
# weight: (bs, num_queries, num_heads, num_points)
|
| 102 |
-
# Expand weight: (bs, num_queries, num_heads, num_points, 1)
|
| 103 |
-
weighted_sampled = sampled * weight.unsqueeze(-1)
|
| 104 |
-
|
| 105 |
-
# Sum over points: (bs, num_queries, num_heads, channels)
|
| 106 |
-
output += weighted_sampled.sum(dim=3)
|
| 107 |
-
|
| 108 |
-
# Flatten last two dimensions to match kernel output
|
| 109 |
-
return output.reshape(bs, num_queries, num_heads * channels)
|
| 110 |
|
| 111 |
|
| 112 |
run_benchmark(
|
| 113 |
kernel_type=KernelTypeEnum.DEFORMABLE_DETR,
|
| 114 |
-
impl_name="
|
| 115 |
-
impl_tags={"family": "
|
| 116 |
-
impl_func=
|
| 117 |
dtype="float32",
|
| 118 |
)
|
|
|
|
| 4 |
# "numpy",
|
| 5 |
# "torch==2.8.0",
|
| 6 |
# "kernels-benchmark-tools",
|
| 7 |
+
# "kernels",
|
| 8 |
# ]
|
| 9 |
#
|
| 10 |
# [tool.uv.sources]
|
|
|
|
| 13 |
import torch
|
| 14 |
import sys
|
| 15 |
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
|
| 16 |
+
from kernels import get_kernel
|
| 17 |
|
| 18 |
+
# Load the deformable DETR kernel
|
| 19 |
+
deformable_detr = get_kernel("kernels-community/deformable-detr")
|
| 20 |
|
| 21 |
+
|
| 22 |
+
def hf_kernels_deformable_detr(
|
| 23 |
value, spatial_shapes, level_start_index, sampling_locations, attention_weights, im2col_step=64
|
| 24 |
):
|
| 25 |
+
"""HuggingFace Kernels Deformable DETR Multi-Scale Deformable Attention"""
|
| 26 |
+
return deformable_detr.ms_deform_attn_forward(
|
| 27 |
+
value=value,
|
| 28 |
+
spatial_shapes=spatial_shapes,
|
| 29 |
+
level_start_index=level_start_index,
|
| 30 |
+
sampling_loc=sampling_locations,
|
| 31 |
+
attn_weight=attention_weights,
|
| 32 |
+
im2col_step=im2col_step
|
| 33 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
|
| 35 |
|
| 36 |
run_benchmark(
|
| 37 |
kernel_type=KernelTypeEnum.DEFORMABLE_DETR,
|
| 38 |
+
impl_name="hf_kernels_deformable_detr",
|
| 39 |
+
impl_tags={"family": "hf-kernels", "backend": "cuda"},
|
| 40 |
+
impl_func=hf_kernels_deformable_detr,
|
| 41 |
dtype="float32",
|
| 42 |
)
|
deformable_detr/impls/hf_kernels_deformable_detr.html
CHANGED
|
@@ -3874,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 3874 |
<div class="system-info">
|
| 3875 |
<div class="system-info-header">Generated on:</div>
|
| 3876 |
<div class="system-info-content">
|
| 3877 |
-
Linux x86_64 | Linux-6.12.
|
| 3878 |
</div>
|
| 3879 |
</div>
|
| 3880 |
|
|
@@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 3888 |
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3889 |
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3890 |
</span> |
|
| 3891 |
-
Cell: nv | 0.
|
| 3892 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3893 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3894 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3905,16 +3905,16 @@ Cell: nv | 0.22s
|
|
| 3905 |
</div>
|
| 3906 |
</div>
|
| 3907 |
<div id="output-nv" class="cell-output">
|
| 3908 |
-
<div class="cell-stdout"><pre class="stdout-text">
|
| 3909 |
+-----------------------------------------------------------------------------------------+
|
| 3910 |
-
| NVIDIA-SMI 580.
|
| 3911 |
+-----------------------------------------+------------------------+----------------------+
|
| 3912 |
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
|
| 3913 |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
|
| 3914 |
| | | MIG M. |
|
| 3915 |
|=========================================+========================+======================|
|
| 3916 |
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
|
| 3917 |
-
| N/A
|
| 3918 |
| | | N/A |
|
| 3919 |
+-----------------------------------------+------------------------+----------------------+
|
| 3920 |
|
|
@@ -3938,7 +3938,7 @@ Cell: nv | 0.22s
|
|
| 3938 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3939 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3940 |
</span> |
|
| 3941 |
-
Cell: benchmark |
|
| 3942 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3943 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3944 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -4003,24 +4003,24 @@ PROFILE TRACE: hf_kernels_deformable_detr | cuda_B1_Q100_H8_E256_L4_P4
|
|
| 4003 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4004 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4005 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4006 |
-
hf_kernels_deformable_detr 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4007 |
-
hf_kernels_deformable_detr
|
| 4008 |
-
_deformable_detr_57c3d32::ms_deform_attn_forward 3.04%
|
| 4009 |
-
void ms_deformable_im2col_gpu_kernel<float>(int, flo... 0.00% 0.000us 0.00% 0.000us 0.000us 22.
|
| 4010 |
-
aten::zeros 0.
|
| 4011 |
-
aten::zero_ 0.
|
| 4012 |
-
aten::fill_ 1.
|
| 4013 |
-
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 4014 |
-
Activity Buffer Request
|
| 4015 |
-
aten::empty 1.
|
| 4016 |
-
cudaLaunchKernel 2.
|
| 4017 |
-
aten::view 0.
|
| 4018 |
-
aten::select
|
| 4019 |
-
aten::as_strided 0.
|
| 4020 |
-
cudaDeviceSynchronize 0.
|
| 4021 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4022 |
-
Self CPU time total: 2.
|
| 4023 |
-
Self CUDA time total: 25.
|
| 4024 |
|
| 4025 |
|
| 4026 |
|
|
@@ -4030,24 +4030,24 @@ PROFILE TRACE: hf_kernels_deformable_detr | cuda_B1_Q300_H8_E256_L4_P4
|
|
| 4030 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4031 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4032 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4033 |
-
hf_kernels_deformable_detr 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4034 |
-
hf_kernels_deformable_detr
|
| 4035 |
-
_deformable_detr_57c3d32::ms_deform_attn_forward 1.
|
| 4036 |
-
void ms_deformable_im2col_gpu_kernel<float>(int, flo... 0.00% 0.000us 0.00% 0.000us 0.000us 23.
|
| 4037 |
-
aten::zeros 0.
|
| 4038 |
-
aten::zero_ 0.
|
| 4039 |
-
aten::fill_ 1.
|
| 4040 |
-
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 4041 |
-
Activity Buffer Request
|
| 4042 |
-
aten::empty 0.
|
| 4043 |
-
cudaLaunchKernel
|
| 4044 |
-
aten::view 0.
|
| 4045 |
-
aten::select 0.
|
| 4046 |
-
aten::as_strided 0.
|
| 4047 |
-
cudaDeviceSynchronize 0.28% 5.
|
| 4048 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4049 |
-
Self CPU time total:
|
| 4050 |
-
Self CUDA time total: 26.
|
| 4051 |
|
| 4052 |
|
| 4053 |
|
|
@@ -4057,24 +4057,24 @@ PROFILE TRACE: hf_kernels_deformable_detr | cuda_B2_Q100_H8_E256_L4_P4
|
|
| 4057 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4058 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4059 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4060 |
-
hf_kernels_deformable_detr 0.00% 0.000us 0.00% 0.000us 0.000us 138.
|
| 4061 |
-
hf_kernels_deformable_detr
|
| 4062 |
-
_deformable_detr_57c3d32::ms_deform_attn_forward 1.
|
| 4063 |
-
void ms_deformable_im2col_gpu_kernel<float>(int, flo... 0.00% 0.000us 0.00% 0.000us 0.000us 22.
|
| 4064 |
-
aten::zeros 0.
|
| 4065 |
-
aten::zero_ 0.
|
| 4066 |
-
aten::fill_ 1.
|
| 4067 |
-
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.784us 10.
|
| 4068 |
-
Activity Buffer Request
|
| 4069 |
-
aten::empty 0.
|
| 4070 |
-
cudaLaunchKernel
|
| 4071 |
-
aten::view 0.
|
| 4072 |
-
aten::select 0.
|
| 4073 |
-
aten::as_strided 0.
|
| 4074 |
-
cudaDeviceSynchronize 0.
|
| 4075 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4076 |
-
Self CPU time total: 1.
|
| 4077 |
-
Self CUDA time total: 25.
|
| 4078 |
|
| 4079 |
|
| 4080 |
|
|
@@ -4084,28 +4084,28 @@ PROFILE TRACE: hf_kernels_deformable_detr | cuda_B2_Q300_H8_E256_L4_P4
|
|
| 4084 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4085 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4086 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4087 |
-
hf_kernels_deformable_detr 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4088 |
-
hf_kernels_deformable_detr
|
| 4089 |
-
_deformable_detr_57c3d32::ms_deform_attn_forward 1.
|
| 4090 |
-
void ms_deformable_im2col_gpu_kernel<float>(int, flo... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4091 |
-
aten::zeros 0.
|
| 4092 |
-
aten::zero_ 0.
|
| 4093 |
-
aten::fill_ 1.
|
| 4094 |
-
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4095 |
-
Activity Buffer Request
|
| 4096 |
-
aten::empty 0.
|
| 4097 |
-
cudaLaunchKernel
|
| 4098 |
-
aten::view 0.41% 9.
|
| 4099 |
-
aten::select 0.48% 10.
|
| 4100 |
-
aten::as_strided 0.
|
| 4101 |
-
cudaDeviceSynchronize 0.22% 4.
|
| 4102 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4103 |
-
Self CPU time total: 2.
|
| 4104 |
-
Self CUDA time total:
|
| 4105 |
|
| 4106 |
|
| 4107 |
impl wl p50(ms) ok
|
| 4108 |
-
hf_kernels_deformable_detr cuda_B1_Q100_H8_E256_L4_P4 0.
|
| 4109 |
hf_kernels_deformable_detr cuda_B1_Q300_H8_E256_L4_P4 0.04 True
|
| 4110 |
hf_kernels_deformable_detr cuda_B2_Q100_H8_E256_L4_P4 0.04 True
|
| 4111 |
hf_kernels_deformable_detr cuda_B2_Q300_H8_E256_L4_P4 0.05 True
|
|
@@ -4113,12 +4113,14 @@ hf_kernels_deformable_detr cuda_B2_Q300_H8_E256_L4_P4 0.05 True
|
|
| 4113 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 4114 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4115 |
<div class="uv-logs-content" style="display: none;">
|
| 4116 |
-
Installed
|
| 4117 |
</div>
|
| 4118 |
</div>
|
| 4119 |
-
<div class="cell-stderr">Fetching 7 files: 0%| | 0/7 [00:00<?, ?it/s]
|
| 4120 |
-
|
| 4121 |
-
Fetching 7 files:
|
|
|
|
|
|
|
| 4122 |
<div class="cell-artifacts">
|
| 4123 |
<h4>Artifacts:</h4>
|
| 4124 |
<a href="artifacts/benchmark/deformable_detr.jsonl" class="artifact" target="_blank">deformable_detr.jsonl</a>
|
|
|
|
| 3874 |
<div class="system-info">
|
| 3875 |
<div class="system-info-header">Generated on:</div>
|
| 3876 |
<div class="system-info-content">
|
| 3877 |
+
Linux x86_64 | Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35
|
| 3878 |
</div>
|
| 3879 |
</div>
|
| 3880 |
|
|
|
|
| 3888 |
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3889 |
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3890 |
</span> |
|
| 3891 |
+
Cell: nv | 0.25s
|
| 3892 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3893 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3894 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3905 |
</div>
|
| 3906 |
</div>
|
| 3907 |
<div id="output-nv" class="cell-output">
|
| 3908 |
+
<div class="cell-stdout"><pre class="stdout-text">Fri Dec 19 18:55:49 2025
|
| 3909 |
+-----------------------------------------------------------------------------------------+
|
| 3910 |
+
| NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 |
|
| 3911 |
+-----------------------------------------+------------------------+----------------------+
|
| 3912 |
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
|
| 3913 |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
|
| 3914 |
| | | MIG M. |
|
| 3915 |
|=========================================+========================+======================|
|
| 3916 |
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
|
| 3917 |
+
| N/A 30C P0 77W / 350W | 0MiB / 46068MiB | 11% Default |
|
| 3918 |
| | | N/A |
|
| 3919 |
+-----------------------------------------+------------------------+----------------------+
|
| 3920 |
|
|
|
|
| 3938 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3939 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3940 |
</span> |
|
| 3941 |
+
Cell: benchmark | 4.73s
|
| 3942 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3943 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3944 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 4003 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4004 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4005 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4006 |
+
hf_kernels_deformable_detr 0.00% 0.000us 0.00% 0.000us 0.000us 199.744us 791.13% 199.744us 199.744us 1
|
| 4007 |
+
hf_kernels_deformable_detr 5.99% 129.162us 99.60% 2.148ms 2.148ms 0.000us 0.00% 26.304us 26.304us 1
|
| 4008 |
+
_deformable_detr_57c3d32::ms_deform_attn_forward 3.04% 65.452us 93.61% 2.019ms 672.874us 22.336us 88.47% 26.304us 8.768us 3
|
| 4009 |
+
void ms_deformable_im2col_gpu_kernel<float>(int, flo... 0.00% 0.000us 0.00% 0.000us 0.000us 22.336us 88.47% 22.336us 7.445us 3
|
| 4010 |
+
aten::zeros 0.91% 19.609us 87.96% 1.897ms 632.230us 0.000us 0.00% 3.968us 1.323us 3
|
| 4011 |
+
aten::zero_ 0.66% 14.208us 85.42% 1.842ms 614.026us 0.000us 0.00% 3.968us 1.323us 3
|
| 4012 |
+
aten::fill_ 1.51% 32.653us 84.76% 1.828ms 609.290us 2.912us 11.53% 3.968us 1.323us 3
|
| 4013 |
+
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.912us 11.53% 2.912us 0.971us 3
|
| 4014 |
+
Activity Buffer Request 81.38% 1.755ms 81.38% 1.755ms 1.755ms 1.056us 4.18% 1.056us 1.056us 1
|
| 4015 |
+
aten::empty 1.62% 35.003us 1.62% 35.003us 11.668us 0.000us 0.00% 0.000us 0.000us 3
|
| 4016 |
+
cudaLaunchKernel 2.65% 57.140us 2.65% 57.140us 9.523us 0.000us 0.00% 0.000us 0.000us 6
|
| 4017 |
+
aten::view 0.79% 17.140us 0.79% 17.140us 2.857us 0.000us 0.00% 0.000us 0.000us 6
|
| 4018 |
+
aten::select 0.89% 19.100us 1.05% 22.620us 7.540us 0.000us 0.00% 0.000us 0.000us 3
|
| 4019 |
+
aten::as_strided 0.16% 3.520us 0.16% 3.520us 1.173us 0.000us 0.00% 0.000us 0.000us 3
|
| 4020 |
+
cudaDeviceSynchronize 0.40% 8.641us 0.40% 8.641us 8.641us 0.000us 0.00% 0.000us 0.000us 1
|
| 4021 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4022 |
+
Self CPU time total: 2.156ms
|
| 4023 |
+
Self CUDA time total: 25.248us
|
| 4024 |
|
| 4025 |
|
| 4026 |
|
|
|
|
| 4030 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4031 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4032 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4033 |
+
hf_kernels_deformable_detr 0.00% 0.000us 0.00% 0.000us 0.000us 136.418us 517.99% 136.418us 136.418us 1
|
| 4034 |
+
hf_kernels_deformable_detr 5.06% 104.032us 99.72% 2.049ms 2.049ms 0.000us 0.00% 27.296us 27.296us 1
|
| 4035 |
+
_deformable_detr_57c3d32::ms_deform_attn_forward 1.59% 32.619us 94.66% 1.945ms 648.480us 23.488us 89.19% 27.296us 9.099us 3
|
| 4036 |
+
void ms_deformable_im2col_gpu_kernel<float>(int, flo... 0.00% 0.000us 0.00% 0.000us 0.000us 23.488us 89.19% 23.488us 7.829us 3
|
| 4037 |
+
aten::zeros 0.44% 8.979us 91.25% 1.875ms 625.117us 0.000us 0.00% 3.808us 1.269us 3
|
| 4038 |
+
aten::zero_ 0.41% 8.351us 89.97% 1.849ms 616.327us 0.000us 0.00% 3.808us 1.269us 3
|
| 4039 |
+
aten::fill_ 1.21% 24.960us 89.56% 1.841ms 613.543us 2.848us 10.81% 3.808us 1.269us 3
|
| 4040 |
+
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.848us 10.81% 2.848us 0.949us 3
|
| 4041 |
+
Activity Buffer Request 87.10% 1.790ms 87.10% 1.790ms 1.790ms 0.960us 3.65% 0.960us 0.960us 1
|
| 4042 |
+
aten::empty 0.85% 17.391us 0.85% 17.391us 5.797us 0.000us 0.00% 0.000us 0.000us 3
|
| 4043 |
+
cudaLaunchKernel 1.95% 40.151us 1.95% 40.151us 6.692us 0.000us 0.00% 0.000us 0.000us 6
|
| 4044 |
+
aten::view 0.44% 9.121us 0.44% 9.121us 1.520us 0.000us 0.00% 0.000us 0.000us 6
|
| 4045 |
+
aten::select 0.58% 11.950us 0.68% 13.920us 4.640us 0.000us 0.00% 0.000us 0.000us 3
|
| 4046 |
+
aten::as_strided 0.10% 1.970us 0.10% 1.970us 0.657us 0.000us 0.00% 0.000us 0.000us 3
|
| 4047 |
+
cudaDeviceSynchronize 0.28% 5.670us 0.28% 5.670us 5.670us 0.000us 0.00% 0.000us 0.000us 1
|
| 4048 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4049 |
+
Self CPU time total: 2.055ms
|
| 4050 |
+
Self CUDA time total: 26.336us
|
| 4051 |
|
| 4052 |
|
| 4053 |
|
|
|
|
| 4057 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4058 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4059 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4060 |
+
hf_kernels_deformable_detr 0.00% 0.000us 0.00% 0.000us 0.000us 138.431us 541.44% 138.431us 138.431us 1
|
| 4061 |
+
hf_kernels_deformable_detr 4.88% 96.691us 99.73% 1.977ms 1.977ms 0.000us 0.00% 26.495us 26.495us 1
|
| 4062 |
+
_deformable_detr_57c3d32::ms_deform_attn_forward 1.70% 33.709us 94.86% 1.881ms 626.893us 22.783us 89.11% 26.495us 8.832us 3
|
| 4063 |
+
void ms_deformable_im2col_gpu_kernel<float>(int, flo... 0.00% 0.000us 0.00% 0.000us 0.000us 22.783us 89.11% 22.783us 7.594us 3
|
| 4064 |
+
aten::zeros 0.43% 8.511us 91.28% 1.810ms 603.293us 0.000us 0.00% 3.712us 1.237us 3
|
| 4065 |
+
aten::zero_ 0.42% 8.319us 90.02% 1.785ms 594.946us 0.000us 0.00% 3.712us 1.237us 3
|
| 4066 |
+
aten::fill_ 1.36% 26.920us 89.60% 1.777ms 592.173us 2.784us 10.89% 3.712us 1.237us 3
|
| 4067 |
+
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.784us 10.89% 2.784us 0.928us 3
|
| 4068 |
+
Activity Buffer Request 86.99% 1.725ms 86.99% 1.725ms 1.725ms 0.928us 3.63% 0.928us 0.928us 1
|
| 4069 |
+
aten::empty 0.83% 16.530us 0.83% 16.530us 5.510us 0.000us 0.00% 0.000us 0.000us 3
|
| 4070 |
+
cudaLaunchKernel 1.99% 39.553us 1.99% 39.553us 6.592us 0.000us 0.00% 0.000us 0.000us 6
|
| 4071 |
+
aten::view 0.47% 9.270us 0.47% 9.270us 1.545us 0.000us 0.00% 0.000us 0.000us 6
|
| 4072 |
+
aten::select 0.56% 11.070us 0.66% 13.141us 4.380us 0.000us 0.00% 0.000us 0.000us 3
|
| 4073 |
+
aten::as_strided 0.10% 2.071us 0.10% 2.071us 0.690us 0.000us 0.00% 0.000us 0.000us 3
|
| 4074 |
+
cudaDeviceSynchronize 0.27% 5.300us 0.27% 5.300us 5.300us 0.000us 0.00% 0.000us 0.000us 1
|
| 4075 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4076 |
+
Self CPU time total: 1.983ms
|
| 4077 |
+
Self CUDA time total: 25.567us
|
| 4078 |
|
| 4079 |
|
| 4080 |
|
|
|
|
| 4084 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4085 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4086 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4087 |
+
hf_kernels_deformable_detr 0.00% 0.000us 0.00% 0.000us 0.000us 142.528us 304.46% 142.528us 142.528us 1
|
| 4088 |
+
hf_kernels_deformable_detr 4.36% 98.391us 99.78% 2.253ms 2.253ms 0.000us 0.00% 47.838us 47.838us 1
|
| 4089 |
+
_deformable_detr_57c3d32::ms_deform_attn_forward 1.43% 32.311us 95.42% 2.155ms 718.335us 43.743us 93.44% 47.838us 15.946us 3
|
| 4090 |
+
void ms_deformable_im2col_gpu_kernel<float>(int, flo... 0.00% 0.000us 0.00% 0.000us 0.000us 43.743us 93.44% 43.743us 14.581us 3
|
| 4091 |
+
aten::zeros 0.35% 7.869us 92.42% 2.087ms 695.715us 0.000us 0.00% 4.095us 1.365us 3
|
| 4092 |
+
aten::zero_ 0.37% 8.381us 91.32% 2.062ms 687.455us 0.000us 0.00% 4.095us 1.365us 3
|
| 4093 |
+
aten::fill_ 1.13% 25.460us 90.95% 2.054ms 684.661us 3.071us 6.56% 4.095us 1.365us 3
|
| 4094 |
+
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 3.071us 6.56% 3.071us 1.024us 3
|
| 4095 |
+
Activity Buffer Request 79.30% 1.791ms 79.30% 1.791ms 1.791ms 1.024us 2.19% 1.024us 1.024us 1
|
| 4096 |
+
aten::empty 0.75% 16.910us 0.75% 16.910us 5.637us 0.000us 0.00% 0.000us 0.000us 3
|
| 4097 |
+
cudaLaunchKernel 11.13% 251.265us 11.13% 251.265us 41.878us 0.000us 0.00% 0.000us 0.000us 6
|
| 4098 |
+
aten::view 0.41% 9.300us 0.41% 9.300us 1.550us 0.000us 0.00% 0.000us 0.000us 6
|
| 4099 |
+
aten::select 0.48% 10.740us 0.56% 12.720us 4.240us 0.000us 0.00% 0.000us 0.000us 3
|
| 4100 |
+
aten::as_strided 0.09% 1.980us 0.09% 1.980us 0.660us 0.000us 0.00% 0.000us 0.000us 3
|
| 4101 |
+
cudaDeviceSynchronize 0.22% 4.929us 0.22% 4.929us 4.929us 0.000us 0.00% 0.000us 0.000us 1
|
| 4102 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4103 |
+
Self CPU time total: 2.258ms
|
| 4104 |
+
Self CUDA time total: 46.814us
|
| 4105 |
|
| 4106 |
|
| 4107 |
impl wl p50(ms) ok
|
| 4108 |
+
hf_kernels_deformable_detr cuda_B1_Q100_H8_E256_L4_P4 0.04 True
|
| 4109 |
hf_kernels_deformable_detr cuda_B1_Q300_H8_E256_L4_P4 0.04 True
|
| 4110 |
hf_kernels_deformable_detr cuda_B2_Q100_H8_E256_L4_P4 0.04 True
|
| 4111 |
hf_kernels_deformable_detr cuda_B2_Q300_H8_E256_L4_P4 0.05 True
|
|
|
|
| 4113 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 4114 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4115 |
<div class="uv-logs-content" style="display: none;">
|
| 4116 |
+
Installed 14 packages in 11ms
|
| 4117 |
</div>
|
| 4118 |
</div>
|
| 4119 |
+
<div class="cell-stderr">Fetching 7 files: 0%| | 0/7 [00:00<?, ?it/s]Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.
|
| 4120 |
+
|
| 4121 |
+
Fetching 7 files: 14%|█▍ | 1/7 [00:00<00:02, 2.99it/s]
|
| 4122 |
+
Fetching 7 files: 71%|███████▏ | 5/7 [00:00<00:00, 9.51it/s]
|
| 4123 |
+
Fetching 7 files: 100%|██████████| 7/7 [00:00<00:00, 11.76it/s]</div>
|
| 4124 |
<div class="cell-artifacts">
|
| 4125 |
<h4>Artifacts:</h4>
|
| 4126 |
<a href="artifacts/benchmark/deformable_detr.jsonl" class="artifact" target="_blank">deformable_detr.jsonl</a>
|
deformable_detr/impls/torch_deformable_detr.html
CHANGED
|
@@ -3874,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 3874 |
<div class="system-info">
|
| 3875 |
<div class="system-info-header">Generated on:</div>
|
| 3876 |
<div class="system-info-content">
|
| 3877 |
-
Linux x86_64 | Linux-6.12.
|
| 3878 |
</div>
|
| 3879 |
</div>
|
| 3880 |
|
|
@@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 3888 |
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3889 |
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3890 |
</span> |
|
| 3891 |
-
Cell: nv | 0.
|
| 3892 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3893 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3894 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3904,16 +3904,16 @@ Cell: nv | 0.22s
|
|
| 3904 |
</div>
|
| 3905 |
</div>
|
| 3906 |
<div id="output-nv" class="cell-output">
|
| 3907 |
-
<div class="cell-stdout"><pre class="stdout-text">
|
| 3908 |
+-----------------------------------------------------------------------------------------+
|
| 3909 |
-
| NVIDIA-SMI 580.
|
| 3910 |
+-----------------------------------------+------------------------+----------------------+
|
| 3911 |
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
|
| 3912 |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
|
| 3913 |
| | | MIG M. |
|
| 3914 |
|=========================================+========================+======================|
|
| 3915 |
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
|
| 3916 |
-
| N/A
|
| 3917 |
| | | N/A |
|
| 3918 |
+-----------------------------------------+------------------------+----------------------+
|
| 3919 |
|
|
@@ -3935,9 +3935,9 @@ Cell: nv | 0.22s
|
|
| 3935 |
<span class="collapse-indicators">
|
| 3936 |
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
|
| 3937 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3938 |
-
<span id="uv-indicator-benchmark" style="cursor:
|
| 3939 |
</span> |
|
| 3940 |
-
Cell: benchmark |
|
| 3941 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3942 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3943 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -4077,29 +4077,29 @@ PROFILE TRACE: torch_eager | cuda_B1_Q100_H8_E256_L4_P4
|
|
| 4077 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4078 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4079 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4080 |
-
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4081 |
-
torch_eager 20.
|
| 4082 |
-
aten::index 4.
|
| 4083 |
-
aten::copy_ 4.
|
| 4084 |
-
aten::mul 5.
|
| 4085 |
-
void at::native::index_elementwise_kernel<128, 4, at... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4086 |
-
aten::to 0.
|
| 4087 |
-
aten::_to_copy
|
| 4088 |
-
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 202.
|
| 4089 |
-
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 167.
|
| 4090 |
-
aten::contiguous 0.
|
| 4091 |
-
aten::clone 0.
|
| 4092 |
-
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 133.
|
| 4093 |
-
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 115.
|
| 4094 |
-
aten::__and__
|
| 4095 |
-
aten::bitwise_and 2.
|
| 4096 |
-
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4097 |
-
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 86.
|
| 4098 |
-
aten::sub 2.
|
| 4099 |
-
aten::add 1.
|
| 4100 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4101 |
-
Self CPU time total:
|
| 4102 |
-
Self CUDA time total: 1.
|
| 4103 |
|
| 4104 |
|
| 4105 |
|
|
@@ -4109,29 +4109,29 @@ PROFILE TRACE: torch_eager | cuda_B1_Q300_H8_E256_L4_P4
|
|
| 4109 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4110 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4111 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4112 |
-
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 18.
|
| 4113 |
-
torch_eager 19.
|
| 4114 |
-
aten::index 4.47%
|
| 4115 |
-
aten::copy_ 4.
|
| 4116 |
-
aten::mul
|
| 4117 |
-
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 266.
|
| 4118 |
-
void at::native::index_elementwise_kernel<128, 4, at... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4119 |
-
aten::to 0.
|
| 4120 |
-
aten::_to_copy 1.
|
| 4121 |
-
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 169.
|
| 4122 |
-
aten::contiguous 0.
|
| 4123 |
-
aten::clone 0.
|
| 4124 |
-
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4125 |
-
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 117.
|
| 4126 |
-
aten::__and__ 0.
|
| 4127 |
-
aten::bitwise_and 2.
|
| 4128 |
-
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4129 |
-
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 104.
|
| 4130 |
-
aten::add 1.
|
| 4131 |
-
aten::sub 2.
|
| 4132 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4133 |
-
Self CPU time total: 20.
|
| 4134 |
-
Self CUDA time total: 1.
|
| 4135 |
|
| 4136 |
|
| 4137 |
|
|
@@ -4141,29 +4141,29 @@ PROFILE TRACE: torch_eager | cuda_B2_Q100_H8_E256_L4_P4
|
|
| 4141 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4142 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4143 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4144 |
-
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 19.
|
| 4145 |
-
torch_eager 19.
|
| 4146 |
-
aten::index 4.
|
| 4147 |
-
aten::copy_ 4.
|
| 4148 |
-
aten::mul
|
| 4149 |
-
void at::native::index_elementwise_kernel<128, 4, at... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4150 |
-
|
| 4151 |
-
|
| 4152 |
-
|
| 4153 |
-
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 168.
|
| 4154 |
-
aten::contiguous 0.
|
| 4155 |
-
aten::clone 0.
|
| 4156 |
-
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 134.
|
| 4157 |
-
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 116.
|
| 4158 |
-
aten::__and__ 0.
|
| 4159 |
-
aten::bitwise_and 2.
|
| 4160 |
-
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 104.
|
| 4161 |
-
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4162 |
-
aten::add 1.
|
| 4163 |
-
aten::sub 2.
|
| 4164 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4165 |
-
Self CPU time total: 21.
|
| 4166 |
-
Self CUDA time total: 1.
|
| 4167 |
|
| 4168 |
|
| 4169 |
|
|
@@ -4173,37 +4173,43 @@ PROFILE TRACE: torch_eager | cuda_B2_Q300_H8_E256_L4_P4
|
|
| 4173 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4174 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4175 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4176 |
-
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4177 |
-
torch_eager
|
| 4178 |
-
aten::mul
|
| 4179 |
-
aten::index
|
| 4180 |
-
aten::copy_ 4.
|
| 4181 |
-
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4182 |
-
void at::native::index_elementwise_kernel<128, 4, at... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4183 |
-
aten::to 0.
|
| 4184 |
-
aten::_to_copy
|
| 4185 |
-
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 168.
|
| 4186 |
-
aten::contiguous 0.
|
| 4187 |
-
aten::clone 0.
|
| 4188 |
-
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 137.
|
| 4189 |
-
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4190 |
-
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 120.
|
| 4191 |
-
aten::add 1.
|
| 4192 |
-
aten::__and__ 0.
|
| 4193 |
-
aten::bitwise_and 2.
|
| 4194 |
-
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4195 |
-
aten::sub 2.
|
| 4196 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4197 |
-
Self CPU time total:
|
| 4198 |
-
Self CUDA time total: 1.
|
| 4199 |
|
| 4200 |
|
| 4201 |
impl wl p50(ms) ok
|
| 4202 |
-
torch_eager cuda_B1_Q100_H8_E256_L4_P4 3.
|
| 4203 |
-
torch_eager cuda_B1_Q300_H8_E256_L4_P4 4.
|
| 4204 |
-
torch_eager cuda_B2_Q100_H8_E256_L4_P4 4.
|
| 4205 |
-
torch_eager cuda_B2_Q300_H8_E256_L4_P4 4.
|
| 4206 |
</pre></div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4207 |
<div class="cell-artifacts">
|
| 4208 |
<h4>Artifacts:</h4>
|
| 4209 |
<a href="artifacts/benchmark/deformable_detr.jsonl" class="artifact" target="_blank">deformable_detr.jsonl</a>
|
|
|
|
| 3874 |
<div class="system-info">
|
| 3875 |
<div class="system-info-header">Generated on:</div>
|
| 3876 |
<div class="system-info-content">
|
| 3877 |
+
Linux x86_64 | Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35
|
| 3878 |
</div>
|
| 3879 |
</div>
|
| 3880 |
|
|
|
|
| 3888 |
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3889 |
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3890 |
</span> |
|
| 3891 |
+
Cell: nv | 0.25s
|
| 3892 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3893 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3894 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3904 |
</div>
|
| 3905 |
</div>
|
| 3906 |
<div id="output-nv" class="cell-output">
|
| 3907 |
+
<div class="cell-stdout"><pre class="stdout-text">Fri Dec 19 18:55:49 2025
|
| 3908 |
+-----------------------------------------------------------------------------------------+
|
| 3909 |
+
| NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 |
|
| 3910 |
+-----------------------------------------+------------------------+----------------------+
|
| 3911 |
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
|
| 3912 |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
|
| 3913 |
| | | MIG M. |
|
| 3914 |
|=========================================+========================+======================|
|
| 3915 |
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
|
| 3916 |
+
| N/A 30C P0 77W / 350W | 0MiB / 46068MiB | 11% Default |
|
| 3917 |
| | | N/A |
|
| 3918 |
+-----------------------------------------+------------------------+----------------------+
|
| 3919 |
|
|
|
|
| 3935 |
<span class="collapse-indicators">
|
| 3936 |
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
|
| 3937 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3938 |
+
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3939 |
</span> |
|
| 3940 |
+
Cell: benchmark | 9.12s
|
| 3941 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3942 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3943 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 4077 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4078 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4079 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4080 |
+
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 19.928ms 1345.17% 19.928ms 19.928ms 1
|
| 4081 |
+
torch_eager 20.03% 4.383ms 99.97% 21.877ms 21.877ms 0.000us 0.00% 1.483ms 1.483ms 1
|
| 4082 |
+
aten::index 4.57% 999.946us 16.87% 3.693ms 76.930us 235.999us 15.93% 369.535us 7.699us 48
|
| 4083 |
+
aten::copy_ 4.70% 1.029ms 11.50% 2.517ms 11.491us 366.142us 24.72% 366.142us 1.672us 219
|
| 4084 |
+
aten::mul 5.86% 1.283ms 10.10% 2.209ms 11.507us 293.927us 19.84% 293.927us 1.531us 192
|
| 4085 |
+
void at::native::index_elementwise_kernel<128, 4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 235.999us 15.93% 235.999us 4.917us 48
|
| 4086 |
+
aten::to 0.58% 126.416us 10.88% 2.380ms 13.921us 0.000us 0.00% 232.606us 1.360us 171
|
| 4087 |
+
aten::_to_copy 1.91% 417.236us 10.30% 2.254ms 18.325us 0.000us 0.00% 232.606us 1.891us 123
|
| 4088 |
+
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 202.308us 13.66% 202.308us 1.686us 120
|
| 4089 |
+
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 167.963us 11.34% 167.963us 2.000us 84
|
| 4090 |
+
aten::contiguous 0.37% 80.417us 8.79% 1.925ms 20.049us 0.000us 0.00% 133.536us 1.391us 96
|
| 4091 |
+
aten::clone 0.80% 175.766us 8.43% 1.844ms 19.211us 0.000us 0.00% 133.536us 1.391us 96
|
| 4092 |
+
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 133.536us 9.01% 133.536us 1.391us 96
|
| 4093 |
+
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 115.352us 7.79% 115.352us 1.202us 96
|
| 4094 |
+
aten::__and__ 0.45% 97.450us 4.50% 984.021us 11.715us 0.000us 0.00% 98.725us 1.175us 84
|
| 4095 |
+
aten::bitwise_and 2.51% 548.975us 4.05% 886.571us 10.554us 98.725us 6.66% 98.725us 1.175us 84
|
| 4096 |
+
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 98.725us 6.66% 98.725us 1.175us 84
|
| 4097 |
+
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 86.111us 5.81% 86.111us 1.196us 72
|
| 4098 |
+
aten::sub 2.21% 483.704us 3.73% 817.012us 11.347us 79.134us 5.34% 79.134us 1.099us 72
|
| 4099 |
+
aten::add 1.64% 359.872us 2.73% 597.608us 9.960us 74.367us 5.02% 74.367us 1.239us 60
|
| 4100 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4101 |
+
Self CPU time total: 21.884ms
|
| 4102 |
+
Self CUDA time total: 1.481ms
|
| 4103 |
|
| 4104 |
|
| 4105 |
|
|
|
|
| 4109 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4110 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4111 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4112 |
+
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 18.701ms 1173.82% 18.701ms 18.701ms 1
|
| 4113 |
+
torch_eager 19.86% 4.084ms 99.94% 20.549ms 20.549ms 0.000us 0.00% 1.594ms 1.594ms 1
|
| 4114 |
+
aten::index 4.47% 919.982us 16.50% 3.393ms 70.681us 250.075us 15.70% 381.947us 7.957us 48
|
| 4115 |
+
aten::copy_ 4.90% 1.007ms 11.73% 2.411ms 11.009us 365.571us 22.95% 365.571us 1.669us 219
|
| 4116 |
+
aten::mul 5.89% 1.211ms 10.29% 2.116ms 11.019us 357.953us 22.47% 357.953us 1.864us 192
|
| 4117 |
+
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 266.175us 16.71% 266.175us 2.218us 120
|
| 4118 |
+
void at::native::index_elementwise_kernel<128, 4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 250.075us 15.70% 250.075us 5.210us 48
|
| 4119 |
+
aten::to 0.56% 115.808us 10.96% 2.254ms 13.183us 0.000us 0.00% 233.699us 1.367us 171
|
| 4120 |
+
aten::_to_copy 1.83% 375.992us 10.40% 2.138ms 17.386us 0.000us 0.00% 233.699us 1.900us 123
|
| 4121 |
+
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 169.346us 10.63% 169.346us 2.016us 84
|
| 4122 |
+
aten::contiguous 0.37% 76.815us 8.72% 1.793ms 18.680us 0.000us 0.00% 131.872us 1.374us 96
|
| 4123 |
+
aten::clone 0.79% 162.290us 8.35% 1.716ms 17.880us 0.000us 0.00% 131.872us 1.374us 96
|
| 4124 |
+
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 131.872us 8.28% 131.872us 1.374us 96
|
| 4125 |
+
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 117.600us 7.38% 117.600us 1.225us 96
|
| 4126 |
+
aten::__and__ 0.42% 86.722us 4.57% 939.170us 11.181us 0.000us 0.00% 105.348us 1.254us 84
|
| 4127 |
+
aten::bitwise_and 2.53% 520.363us 4.15% 852.448us 10.148us 105.348us 6.61% 105.348us 1.254us 84
|
| 4128 |
+
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 105.348us 6.61% 105.348us 1.254us 84
|
| 4129 |
+
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 104.196us 6.54% 104.196us 1.447us 72
|
| 4130 |
+
aten::add 1.65% 339.069us 2.79% 573.170us 9.553us 91.619us 5.75% 91.619us 1.527us 60
|
| 4131 |
+
aten::sub 2.16% 443.591us 3.72% 765.420us 10.631us 80.447us 5.05% 80.447us 1.117us 72
|
| 4132 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4133 |
+
Self CPU time total: 20.561ms
|
| 4134 |
+
Self CUDA time total: 1.593ms
|
| 4135 |
|
| 4136 |
|
| 4137 |
|
|
|
|
| 4141 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4142 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4143 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4144 |
+
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 19.345ms 1257.82% 19.345ms 19.345ms 1
|
| 4145 |
+
torch_eager 19.37% 4.137ms 99.97% 21.351ms 21.351ms 0.000us 0.00% 1.539ms 1.539ms 1
|
| 4146 |
+
aten::index 4.47% 955.266us 16.53% 3.530ms 73.551us 242.625us 15.78% 377.060us 7.855us 48
|
| 4147 |
+
aten::copy_ 4.74% 1.012ms 11.59% 2.476ms 11.307us 367.943us 23.92% 367.943us 1.680us 219
|
| 4148 |
+
aten::mul 5.81% 1.241ms 10.15% 2.167ms 11.287us 324.158us 21.08% 324.158us 1.688us 192
|
| 4149 |
+
void at::native::index_elementwise_kernel<128, 4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 242.625us 15.78% 242.625us 5.055us 48
|
| 4150 |
+
aten::to 0.53% 113.722us 11.14% 2.380ms 13.916us 0.000us 0.00% 233.508us 1.366us 171
|
| 4151 |
+
aten::_to_copy 2.07% 441.682us 10.61% 2.266ms 18.422us 0.000us 0.00% 233.508us 1.898us 123
|
| 4152 |
+
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 233.472us 15.18% 233.472us 1.946us 120
|
| 4153 |
+
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 168.769us 10.97% 168.769us 2.009us 84
|
| 4154 |
+
aten::contiguous 0.38% 81.343us 8.57% 1.831ms 19.072us 0.000us 0.00% 134.435us 1.400us 96
|
| 4155 |
+
aten::clone 0.71% 151.394us 8.19% 1.750ms 18.225us 0.000us 0.00% 134.435us 1.400us 96
|
| 4156 |
+
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 134.435us 8.74% 134.435us 1.400us 96
|
| 4157 |
+
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 116.161us 7.55% 116.161us 1.210us 96
|
| 4158 |
+
aten::__and__ 0.37% 78.366us 4.26% 910.569us 10.840us 0.000us 0.00% 104.128us 1.240us 84
|
| 4159 |
+
aten::bitwise_and 2.32% 495.587us 3.90% 832.203us 9.907us 104.128us 6.77% 104.128us 1.240us 84
|
| 4160 |
+
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 104.128us 6.77% 104.128us 1.240us 84
|
| 4161 |
+
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 95.391us 6.20% 95.391us 1.325us 72
|
| 4162 |
+
aten::add 1.65% 352.101us 2.82% 602.659us 10.044us 83.522us 5.43% 83.522us 1.392us 60
|
| 4163 |
+
aten::sub 2.19% 467.179us 3.78% 806.853us 11.206us 79.169us 5.15% 79.169us 1.100us 72
|
| 4164 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4165 |
+
Self CPU time total: 21.357ms
|
| 4166 |
+
Self CUDA time total: 1.538ms
|
| 4167 |
|
| 4168 |
|
| 4169 |
|
|
|
|
| 4173 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4174 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4175 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4176 |
+
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 18.928ms 1070.52% 18.928ms 18.928ms 1
|
| 4177 |
+
torch_eager 19.00% 4.018ms 99.97% 21.144ms 21.144ms 0.000us 0.00% 1.769ms 1.769ms 1
|
| 4178 |
+
aten::mul 5.84% 1.234ms 10.44% 2.209ms 11.503us 449.959us 25.45% 449.959us 2.344us 192
|
| 4179 |
+
aten::index 4.43% 937.219us 16.19% 3.424ms 71.339us 281.246us 15.91% 418.466us 8.718us 48
|
| 4180 |
+
aten::copy_ 4.75% 1.005ms 11.71% 2.477ms 11.312us 370.923us 20.98% 370.923us 1.694us 219
|
| 4181 |
+
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 355.583us 20.11% 355.583us 2.963us 120
|
| 4182 |
+
void at::native::index_elementwise_kernel<128, 4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 281.246us 15.91% 281.246us 5.859us 48
|
| 4183 |
+
aten::to 0.52% 110.789us 10.89% 2.302ms 13.465us 0.000us 0.00% 233.703us 1.367us 171
|
| 4184 |
+
aten::_to_copy 1.88% 398.545us 10.36% 2.192ms 17.819us 0.000us 0.00% 233.703us 1.900us 123
|
| 4185 |
+
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 168.070us 9.51% 168.070us 2.001us 84
|
| 4186 |
+
aten::contiguous 0.38% 80.073us 8.57% 1.813ms 18.880us 0.000us 0.00% 137.220us 1.429us 96
|
| 4187 |
+
aten::clone 0.71% 149.477us 8.19% 1.732ms 18.046us 0.000us 0.00% 137.220us 1.429us 96
|
| 4188 |
+
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 137.220us 7.76% 137.220us 1.429us 96
|
| 4189 |
+
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 128.960us 7.29% 128.960us 1.791us 72
|
| 4190 |
+
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 120.326us 6.81% 120.326us 1.253us 96
|
| 4191 |
+
aten::add 1.60% 338.443us 2.84% 599.957us 9.999us 113.407us 6.41% 113.407us 1.890us 60
|
| 4192 |
+
aten::__and__ 0.34% 72.039us 4.35% 919.096us 10.942us 0.000us 0.00% 109.028us 1.298us 84
|
| 4193 |
+
aten::bitwise_and 2.36% 498.512us 4.00% 847.057us 10.084us 109.028us 6.17% 109.028us 1.298us 84
|
| 4194 |
+
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 109.028us 6.17% 109.028us 1.298us 84
|
| 4195 |
+
aten::sub 2.14% 452.695us 3.86% 815.589us 11.328us 84.674us 4.79% 84.674us 1.176us 72
|
| 4196 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4197 |
+
Self CPU time total: 21.151ms
|
| 4198 |
+
Self CUDA time total: 1.768ms
|
| 4199 |
|
| 4200 |
|
| 4201 |
impl wl p50(ms) ok
|
| 4202 |
+
torch_eager cuda_B1_Q100_H8_E256_L4_P4 3.29 True
|
| 4203 |
+
torch_eager cuda_B1_Q300_H8_E256_L4_P4 4.07 True
|
| 4204 |
+
torch_eager cuda_B2_Q100_H8_E256_L4_P4 4.07 True
|
| 4205 |
+
torch_eager cuda_B2_Q300_H8_E256_L4_P4 4.11 True
|
| 4206 |
</pre></div>
|
| 4207 |
+
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 4208 |
+
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4209 |
+
<div class="uv-logs-content" style="display: none;">
|
| 4210 |
+
Installed 37 packages in 286ms
|
| 4211 |
+
</div>
|
| 4212 |
+
</div>
|
| 4213 |
<div class="cell-artifacts">
|
| 4214 |
<h4>Artifacts:</h4>
|
| 4215 |
<a href="artifacts/benchmark/deformable_detr.jsonl" class="artifact" target="_blank">deformable_detr.jsonl</a>
|
deformable_detr/results/artifacts/combine/latency.svg
CHANGED
|
|
Git LFS Details
|
|
|
Git LFS Details
|
deformable_detr/results/combined_results.html
CHANGED
|
@@ -3874,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 3874 |
<div class="system-info">
|
| 3875 |
<div class="system-info-header">Generated on:</div>
|
| 3876 |
<div class="system-info-content">
|
| 3877 |
-
Linux x86_64 | Linux-6.12.
|
| 3878 |
</div>
|
| 3879 |
</div>
|
| 3880 |
|
|
@@ -3889,11 +3889,11 @@ body[data-tool="eraser"] .main-content {
|
|
| 3889 |
<rdf:RDF>
|
| 3890 |
<ns2:Work>
|
| 3891 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 3892 |
-
<dc:date>2025-
|
| 3893 |
<dc:format>image/svg+xml</dc:format>
|
| 3894 |
<dc:creator>
|
| 3895 |
<ns2:Agent>
|
| 3896 |
-
<dc:title>Matplotlib v3.10.
|
| 3897 |
</ns2:Agent>
|
| 3898 |
</dc:creator>
|
| 3899 |
</ns2:Work>
|
|
@@ -3908,208 +3908,260 @@ body[data-tool="eraser"] .main-content {
|
|
| 3908 |
</g>
|
| 3909 |
<g id="axes--1" class="axes">
|
| 3910 |
<g id="patch_2">
|
| 3911 |
-
<path d="M
|
| 3912 |
</g>
|
| 3913 |
<g id="matplotlib.axis_1">
|
| 3914 |
<g id="xtick_1">
|
| 3915 |
<g id="grid-x--1" class="grid grid-x">
|
| 3916 |
-
<path d="M
|
| 3917 |
</g>
|
| 3918 |
<g id="line2d_1">
|
| 3919 |
<defs>
|
| 3920 |
<path id="mafb3703e5b" d="M 0 0 L 0 3.5 " style="stroke: #000000; stroke-width: 0.8" />
|
| 3921 |
</defs>
|
| 3922 |
<g>
|
| 3923 |
-
<use ns4:href="#mafb3703e5b" x="
|
| 3924 |
</g>
|
| 3925 |
</g>
|
| 3926 |
<g id="text_1">
|
| 3927 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(
|
| 3928 |
</g>
|
| 3929 |
</g>
|
| 3930 |
<g id="xtick_2">
|
| 3931 |
<g id="grid-x--2" class="grid grid-x">
|
| 3932 |
-
<path d="M
|
| 3933 |
</g>
|
| 3934 |
<g id="line2d_2">
|
| 3935 |
<g>
|
| 3936 |
-
<use ns4:href="#mafb3703e5b" x="
|
| 3937 |
</g>
|
| 3938 |
</g>
|
| 3939 |
<g id="text_2">
|
| 3940 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(
|
| 3941 |
</g>
|
| 3942 |
</g>
|
| 3943 |
<g id="xtick_3">
|
| 3944 |
<g id="grid-x--3" class="grid grid-x">
|
| 3945 |
-
<path d="M
|
| 3946 |
</g>
|
| 3947 |
<g id="line2d_3">
|
| 3948 |
<g>
|
| 3949 |
-
<use ns4:href="#mafb3703e5b" x="
|
| 3950 |
</g>
|
| 3951 |
</g>
|
| 3952 |
<g id="text_3">
|
| 3953 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(
|
| 3954 |
</g>
|
| 3955 |
</g>
|
| 3956 |
<g id="xtick_4">
|
| 3957 |
<g id="grid-x--4" class="grid grid-x">
|
| 3958 |
-
<path d="M 788.
|
| 3959 |
</g>
|
| 3960 |
<g id="line2d_4">
|
| 3961 |
<g>
|
| 3962 |
-
<use ns4:href="#mafb3703e5b" x="788.
|
| 3963 |
</g>
|
| 3964 |
</g>
|
| 3965 |
<g id="text_4">
|
| 3966 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(734.
|
| 3967 |
</g>
|
| 3968 |
</g>
|
| 3969 |
<g id="label--x" class="xlabel">
|
| 3970 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="
|
| 3971 |
</g>
|
| 3972 |
</g>
|
| 3973 |
<g id="matplotlib.axis_2">
|
| 3974 |
<g id="ytick_1">
|
| 3975 |
<g id="grid-y--2" class="grid grid-y">
|
| 3976 |
-
<path d="M
|
| 3977 |
</g>
|
| 3978 |
<g id="line2d_5">
|
| 3979 |
<defs>
|
| 3980 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 3981 |
</defs>
|
| 3982 |
<g>
|
| 3983 |
-
<use ns4:href="#m0fca2865ba" x="
|
| 3984 |
</g>
|
| 3985 |
</g>
|
| 3986 |
<g id="text_5">
|
| 3987 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="
|
| 3988 |
</g>
|
| 3989 |
</g>
|
| 3990 |
<g id="ytick_2">
|
| 3991 |
<g id="grid-y--3" class="grid grid-y">
|
| 3992 |
-
<path d="M
|
| 3993 |
</g>
|
| 3994 |
<g id="line2d_6">
|
| 3995 |
<g>
|
| 3996 |
-
<use ns4:href="#m0fca2865ba" x="
|
| 3997 |
</g>
|
| 3998 |
</g>
|
| 3999 |
<g id="text_6">
|
| 4000 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="
|
| 4001 |
</g>
|
| 4002 |
</g>
|
| 4003 |
<g id="ytick_3">
|
| 4004 |
<g id="grid-y--4" class="grid grid-y">
|
| 4005 |
-
<path d="M
|
| 4006 |
</g>
|
| 4007 |
<g id="line2d_7">
|
| 4008 |
<g>
|
| 4009 |
-
<use ns4:href="#m0fca2865ba" x="
|
| 4010 |
</g>
|
| 4011 |
</g>
|
| 4012 |
<g id="text_7">
|
| 4013 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="
|
| 4014 |
</g>
|
| 4015 |
</g>
|
| 4016 |
<g id="ytick_4">
|
| 4017 |
<g id="grid-y--5" class="grid grid-y">
|
| 4018 |
-
<path d="M
|
| 4019 |
</g>
|
| 4020 |
<g id="line2d_8">
|
| 4021 |
<g>
|
| 4022 |
-
<use ns4:href="#m0fca2865ba" x="
|
| 4023 |
</g>
|
| 4024 |
</g>
|
| 4025 |
<g id="text_8">
|
| 4026 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="
|
| 4027 |
</g>
|
| 4028 |
</g>
|
| 4029 |
<g id="ytick_5">
|
| 4030 |
<g id="grid-y--6" class="grid grid-y">
|
| 4031 |
-
<path d="M
|
| 4032 |
</g>
|
| 4033 |
<g id="line2d_9">
|
| 4034 |
<g>
|
| 4035 |
-
<use ns4:href="#m0fca2865ba" x="
|
| 4036 |
</g>
|
| 4037 |
</g>
|
| 4038 |
<g id="text_9">
|
| 4039 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4040 |
</g>
|
| 4041 |
</g>
|
| 4042 |
<g id="label--y" class="ylabel">
|
| 4043 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="
|
| 4044 |
</g>
|
| 4045 |
</g>
|
| 4046 |
<g id="series--hf-kernels-deformable-detr" class="series">
|
| 4047 |
-
<path d="M
|
| 4048 |
<defs>
|
| 4049 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4050 |
</defs>
|
| 4051 |
-
<g clip-path="url(#
|
| 4052 |
-
<use ns4:href="#md7efaf3aec" x="
|
| 4053 |
-
<use ns4:href="#md7efaf3aec" x="
|
| 4054 |
-
<use ns4:href="#md7efaf3aec" x="
|
| 4055 |
-
<use ns4:href="#md7efaf3aec" x="788.
|
| 4056 |
</g>
|
| 4057 |
</g>
|
| 4058 |
<g id="series--torch-eager" class="series">
|
| 4059 |
-
<path d="M
|
| 4060 |
<defs>
|
| 4061 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4062 |
</defs>
|
| 4063 |
-
<g clip-path="url(#
|
| 4064 |
-
<use ns4:href="#m9b8c54d372" x="
|
| 4065 |
-
<use ns4:href="#m9b8c54d372" x="
|
| 4066 |
-
<use ns4:href="#m9b8c54d372" x="
|
| 4067 |
-
<use ns4:href="#m9b8c54d372" x="788.
|
| 4068 |
</g>
|
| 4069 |
</g>
|
| 4070 |
<g id="patch_3">
|
| 4071 |
-
<path d="M
|
| 4072 |
</g>
|
| 4073 |
<g id="patch_4">
|
| 4074 |
<path d="M 824.19299 425.105974 L 824.19299 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
|
| 4075 |
</g>
|
| 4076 |
<g id="patch_5">
|
| 4077 |
-
<path d="M
|
| 4078 |
</g>
|
| 4079 |
<g id="patch_6">
|
| 4080 |
-
<path d="M
|
| 4081 |
</g>
|
| 4082 |
-
<g id="
|
| 4083 |
-
<text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="
|
| 4084 |
</g>
|
| 4085 |
<g id="legend" class="legend">
|
| 4086 |
<g id="patch_7">
|
| 4087 |
-
<path d="M
|
| 4088 |
</g>
|
| 4089 |
-
<g id="
|
| 4090 |
-
<path d="M
|
| 4091 |
<g>
|
| 4092 |
-
<use ns4:href="#md7efaf3aec" x="
|
| 4093 |
</g>
|
| 4094 |
</g>
|
| 4095 |
<g id="legend-label--hf-kernels-deformable-detr" class="legend">
|
| 4096 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="
|
| 4097 |
</g>
|
| 4098 |
-
<g id="
|
| 4099 |
-
<path d="M
|
| 4100 |
<g>
|
| 4101 |
-
<use ns4:href="#m9b8c54d372" x="
|
| 4102 |
</g>
|
| 4103 |
</g>
|
| 4104 |
<g id="legend-label--torch-eager" class="legend">
|
| 4105 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="
|
| 4106 |
</g>
|
| 4107 |
</g>
|
| 4108 |
</g>
|
| 4109 |
</g>
|
| 4110 |
<defs>
|
| 4111 |
-
<clipPath id="
|
| 4112 |
-
<rect x="
|
| 4113 |
</clipPath>
|
| 4114 |
</defs>
|
| 4115 |
</svg>
|
|
@@ -4122,7 +4174,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 4122 |
<span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
|
| 4123 |
<span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4124 |
</span> |
|
| 4125 |
-
Cell: combine | 4.
|
| 4126 |
| <button class="run-btn" onclick="runCell('combine')">▶ run</button>
|
| 4127 |
<button class="copy-btn" onclick="copyCell('combine')">Copy</button>
|
| 4128 |
<a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -4209,14 +4261,14 @@ Summary: 2 found, 0 skipped, 0 missing
|
|
| 4209 |
COMBINED BENCHMARK SUMMARY
|
| 4210 |
|
| 4211 |
impl wl p50(ms) ok
|
| 4212 |
-
hf_kernels_deformable_detr cuda_B1_Q100_H8_E256_L4_P4 0.
|
| 4213 |
hf_kernels_deformable_detr cuda_B1_Q300_H8_E256_L4_P4 0.04 True
|
| 4214 |
hf_kernels_deformable_detr cuda_B2_Q100_H8_E256_L4_P4 0.04 True
|
| 4215 |
hf_kernels_deformable_detr cuda_B2_Q300_H8_E256_L4_P4 0.05 True
|
| 4216 |
-
torch_eager cuda_B1_Q100_H8_E256_L4_P4 3.
|
| 4217 |
-
torch_eager cuda_B1_Q300_H8_E256_L4_P4 4.
|
| 4218 |
-
torch_eager cuda_B2_Q100_H8_E256_L4_P4 4.
|
| 4219 |
-
torch_eager cuda_B2_Q300_H8_E256_L4_P4 4.
|
| 4220 |
|
| 4221 |
GENERATING COMBINED VISUALIZATION
|
| 4222 |
|
|
@@ -4236,7 +4288,7 @@ Implementations included:
|
|
| 4236 |
<div class="uv-install-logs" id="uv-logs-combine">
|
| 4237 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4238 |
<div class="uv-logs-content" style="display: none;">
|
| 4239 |
-
Installed 37 packages in
|
| 4240 |
</div>
|
| 4241 |
</div>
|
| 4242 |
<div class="cell-artifacts">
|
|
@@ -4249,11 +4301,11 @@ Installed 37 packages in 288ms
|
|
| 4249 |
<rdf:RDF>
|
| 4250 |
<ns2:Work>
|
| 4251 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 4252 |
-
<dc:date>2025-
|
| 4253 |
<dc:format>image/svg+xml</dc:format>
|
| 4254 |
<dc:creator>
|
| 4255 |
<ns2:Agent>
|
| 4256 |
-
<dc:title>Matplotlib v3.10.
|
| 4257 |
</ns2:Agent>
|
| 4258 |
</dc:creator>
|
| 4259 |
</ns2:Work>
|
|
@@ -4268,208 +4320,260 @@ Installed 37 packages in 288ms
|
|
| 4268 |
</g>
|
| 4269 |
<g id="axes--1" class="axes">
|
| 4270 |
<g id="patch_2">
|
| 4271 |
-
<path d="M
|
| 4272 |
</g>
|
| 4273 |
<g id="matplotlib.axis_1">
|
| 4274 |
<g id="xtick_1">
|
| 4275 |
<g id="grid-x--1" class="grid grid-x">
|
| 4276 |
-
<path d="M
|
| 4277 |
</g>
|
| 4278 |
<g id="line2d_1">
|
| 4279 |
<defs>
|
| 4280 |
<path id="mafb3703e5b" d="M 0 0 L 0 3.5 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4281 |
</defs>
|
| 4282 |
<g>
|
| 4283 |
-
<use ns4:href="#mafb3703e5b" x="
|
| 4284 |
</g>
|
| 4285 |
</g>
|
| 4286 |
<g id="text_1">
|
| 4287 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(
|
| 4288 |
</g>
|
| 4289 |
</g>
|
| 4290 |
<g id="xtick_2">
|
| 4291 |
<g id="grid-x--2" class="grid grid-x">
|
| 4292 |
-
<path d="M
|
| 4293 |
</g>
|
| 4294 |
<g id="line2d_2">
|
| 4295 |
<g>
|
| 4296 |
-
<use ns4:href="#mafb3703e5b" x="
|
| 4297 |
</g>
|
| 4298 |
</g>
|
| 4299 |
<g id="text_2">
|
| 4300 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(
|
| 4301 |
</g>
|
| 4302 |
</g>
|
| 4303 |
<g id="xtick_3">
|
| 4304 |
<g id="grid-x--3" class="grid grid-x">
|
| 4305 |
-
<path d="M
|
| 4306 |
</g>
|
| 4307 |
<g id="line2d_3">
|
| 4308 |
<g>
|
| 4309 |
-
<use ns4:href="#mafb3703e5b" x="
|
| 4310 |
</g>
|
| 4311 |
</g>
|
| 4312 |
<g id="text_3">
|
| 4313 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(
|
| 4314 |
</g>
|
| 4315 |
</g>
|
| 4316 |
<g id="xtick_4">
|
| 4317 |
<g id="grid-x--4" class="grid grid-x">
|
| 4318 |
-
<path d="M 788.
|
| 4319 |
</g>
|
| 4320 |
<g id="line2d_4">
|
| 4321 |
<g>
|
| 4322 |
-
<use ns4:href="#mafb3703e5b" x="788.
|
| 4323 |
</g>
|
| 4324 |
</g>
|
| 4325 |
<g id="text_4">
|
| 4326 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(734.
|
| 4327 |
</g>
|
| 4328 |
</g>
|
| 4329 |
<g id="label--x" class="xlabel">
|
| 4330 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="
|
| 4331 |
</g>
|
| 4332 |
</g>
|
| 4333 |
<g id="matplotlib.axis_2">
|
| 4334 |
<g id="ytick_1">
|
| 4335 |
<g id="grid-y--2" class="grid grid-y">
|
| 4336 |
-
<path d="M
|
| 4337 |
</g>
|
| 4338 |
<g id="line2d_5">
|
| 4339 |
<defs>
|
| 4340 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4341 |
</defs>
|
| 4342 |
<g>
|
| 4343 |
-
<use ns4:href="#m0fca2865ba" x="
|
| 4344 |
</g>
|
| 4345 |
</g>
|
| 4346 |
<g id="text_5">
|
| 4347 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="
|
| 4348 |
</g>
|
| 4349 |
</g>
|
| 4350 |
<g id="ytick_2">
|
| 4351 |
<g id="grid-y--3" class="grid grid-y">
|
| 4352 |
-
<path d="M
|
| 4353 |
</g>
|
| 4354 |
<g id="line2d_6">
|
| 4355 |
<g>
|
| 4356 |
-
<use ns4:href="#m0fca2865ba" x="
|
| 4357 |
</g>
|
| 4358 |
</g>
|
| 4359 |
<g id="text_6">
|
| 4360 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="
|
| 4361 |
</g>
|
| 4362 |
</g>
|
| 4363 |
<g id="ytick_3">
|
| 4364 |
<g id="grid-y--4" class="grid grid-y">
|
| 4365 |
-
<path d="M
|
| 4366 |
</g>
|
| 4367 |
<g id="line2d_7">
|
| 4368 |
<g>
|
| 4369 |
-
<use ns4:href="#m0fca2865ba" x="
|
| 4370 |
</g>
|
| 4371 |
</g>
|
| 4372 |
<g id="text_7">
|
| 4373 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="
|
| 4374 |
</g>
|
| 4375 |
</g>
|
| 4376 |
<g id="ytick_4">
|
| 4377 |
<g id="grid-y--5" class="grid grid-y">
|
| 4378 |
-
<path d="M
|
| 4379 |
</g>
|
| 4380 |
<g id="line2d_8">
|
| 4381 |
<g>
|
| 4382 |
-
<use ns4:href="#m0fca2865ba" x="
|
| 4383 |
</g>
|
| 4384 |
</g>
|
| 4385 |
<g id="text_8">
|
| 4386 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="
|
| 4387 |
</g>
|
| 4388 |
</g>
|
| 4389 |
<g id="ytick_5">
|
| 4390 |
<g id="grid-y--6" class="grid grid-y">
|
| 4391 |
-
<path d="M
|
| 4392 |
</g>
|
| 4393 |
<g id="line2d_9">
|
| 4394 |
<g>
|
| 4395 |
-
<use ns4:href="#m0fca2865ba" x="
|
| 4396 |
</g>
|
| 4397 |
</g>
|
| 4398 |
<g id="text_9">
|
| 4399 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4400 |
</g>
|
| 4401 |
</g>
|
| 4402 |
<g id="label--y" class="ylabel">
|
| 4403 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="
|
| 4404 |
</g>
|
| 4405 |
</g>
|
| 4406 |
<g id="series--hf-kernels-deformable-detr" class="series">
|
| 4407 |
-
<path d="M
|
| 4408 |
<defs>
|
| 4409 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4410 |
</defs>
|
| 4411 |
-
<g clip-path="url(#
|
| 4412 |
-
<use ns4:href="#md7efaf3aec" x="
|
| 4413 |
-
<use ns4:href="#md7efaf3aec" x="
|
| 4414 |
-
<use ns4:href="#md7efaf3aec" x="
|
| 4415 |
-
<use ns4:href="#md7efaf3aec" x="788.
|
| 4416 |
</g>
|
| 4417 |
</g>
|
| 4418 |
<g id="series--torch-eager" class="series">
|
| 4419 |
-
<path d="M
|
| 4420 |
<defs>
|
| 4421 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4422 |
</defs>
|
| 4423 |
-
<g clip-path="url(#
|
| 4424 |
-
<use ns4:href="#m9b8c54d372" x="
|
| 4425 |
-
<use ns4:href="#m9b8c54d372" x="
|
| 4426 |
-
<use ns4:href="#m9b8c54d372" x="
|
| 4427 |
-
<use ns4:href="#m9b8c54d372" x="788.
|
| 4428 |
</g>
|
| 4429 |
</g>
|
| 4430 |
<g id="patch_3">
|
| 4431 |
-
<path d="M
|
| 4432 |
</g>
|
| 4433 |
<g id="patch_4">
|
| 4434 |
<path d="M 824.19299 425.105974 L 824.19299 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
|
| 4435 |
</g>
|
| 4436 |
<g id="patch_5">
|
| 4437 |
-
<path d="M
|
| 4438 |
</g>
|
| 4439 |
<g id="patch_6">
|
| 4440 |
-
<path d="M
|
| 4441 |
</g>
|
| 4442 |
-
<g id="
|
| 4443 |
-
<text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="
|
| 4444 |
</g>
|
| 4445 |
<g id="legend" class="legend">
|
| 4446 |
<g id="patch_7">
|
| 4447 |
-
<path d="M
|
| 4448 |
</g>
|
| 4449 |
-
<g id="
|
| 4450 |
-
<path d="M
|
| 4451 |
<g>
|
| 4452 |
-
<use ns4:href="#md7efaf3aec" x="
|
| 4453 |
</g>
|
| 4454 |
</g>
|
| 4455 |
<g id="legend-label--hf-kernels-deformable-detr" class="legend">
|
| 4456 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="
|
| 4457 |
</g>
|
| 4458 |
-
<g id="
|
| 4459 |
-
<path d="M
|
| 4460 |
<g>
|
| 4461 |
-
<use ns4:href="#m9b8c54d372" x="
|
| 4462 |
</g>
|
| 4463 |
</g>
|
| 4464 |
<g id="legend-label--torch-eager" class="legend">
|
| 4465 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="
|
| 4466 |
</g>
|
| 4467 |
</g>
|
| 4468 |
</g>
|
| 4469 |
</g>
|
| 4470 |
<defs>
|
| 4471 |
-
<clipPath id="
|
| 4472 |
-
<rect x="
|
| 4473 |
</clipPath>
|
| 4474 |
</defs>
|
| 4475 |
</svg>
|
|
|
|
| 3874 |
<div class="system-info">
|
| 3875 |
<div class="system-info-header">Generated on:</div>
|
| 3876 |
<div class="system-info-content">
|
| 3877 |
+
Linux x86_64 | Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35
|
| 3878 |
</div>
|
| 3879 |
</div>
|
| 3880 |
|
|
|
|
| 3889 |
<rdf:RDF>
|
| 3890 |
<ns2:Work>
|
| 3891 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 3892 |
+
<dc:date>2025-12-19T19:10:04.668129</dc:date>
|
| 3893 |
<dc:format>image/svg+xml</dc:format>
|
| 3894 |
<dc:creator>
|
| 3895 |
<ns2:Agent>
|
| 3896 |
+
<dc:title>Matplotlib v3.10.8, https://matplotlib.org/</dc:title>
|
| 3897 |
</ns2:Agent>
|
| 3898 |
</dc:creator>
|
| 3899 |
</ns2:Work>
|
|
|
|
| 3908 |
</g>
|
| 3909 |
<g id="axes--1" class="axes">
|
| 3910 |
<g id="patch_2">
|
| 3911 |
+
<path d="M 47.72 425.105974 L 824.19299 425.105974 L 824.19299 26.88 L 47.72 26.88 L 47.72 425.105974 z " style="fill: none" />
|
| 3912 |
</g>
|
| 3913 |
<g id="matplotlib.axis_1">
|
| 3914 |
<g id="xtick_1">
|
| 3915 |
<g id="grid-x--1" class="grid grid-x">
|
| 3916 |
+
<path d="M 83.014227 425.105974 L 83.014227 26.88 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 3917 |
</g>
|
| 3918 |
<g id="line2d_1">
|
| 3919 |
<defs>
|
| 3920 |
<path id="mafb3703e5b" d="M 0 0 L 0 3.5 " style="stroke: #000000; stroke-width: 0.8" />
|
| 3921 |
</defs>
|
| 3922 |
<g>
|
| 3923 |
+
<use ns4:href="#mafb3703e5b" x="83.014227" y="425.105974" style="stroke: #000000; stroke-width: 0.8" />
|
| 3924 |
</g>
|
| 3925 |
</g>
|
| 3926 |
<g id="text_1">
|
| 3927 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(28.96641 549.280197) rotate(-45)">cuda_B1_Q100_H8_E256_L4_P4</text>
|
| 3928 |
</g>
|
| 3929 |
</g>
|
| 3930 |
<g id="xtick_2">
|
| 3931 |
<g id="grid-x--2" class="grid grid-x">
|
| 3932 |
+
<path d="M 318.309072 425.105974 L 318.309072 26.88 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 3933 |
</g>
|
| 3934 |
<g id="line2d_2">
|
| 3935 |
<g>
|
| 3936 |
+
<use ns4:href="#mafb3703e5b" x="318.309072" y="425.105974" style="stroke: #000000; stroke-width: 0.8" />
|
| 3937 |
</g>
|
| 3938 |
</g>
|
| 3939 |
<g id="text_2">
|
| 3940 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(264.261255 549.280197) rotate(-45)">cuda_B1_Q300_H8_E256_L4_P4</text>
|
| 3941 |
</g>
|
| 3942 |
</g>
|
| 3943 |
<g id="xtick_3">
|
| 3944 |
<g id="grid-x--3" class="grid grid-x">
|
| 3945 |
+
<path d="M 553.603918 425.105974 L 553.603918 26.88 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 3946 |
</g>
|
| 3947 |
<g id="line2d_3">
|
| 3948 |
<g>
|
| 3949 |
+
<use ns4:href="#mafb3703e5b" x="553.603918" y="425.105974" style="stroke: #000000; stroke-width: 0.8" />
|
| 3950 |
</g>
|
| 3951 |
</g>
|
| 3952 |
<g id="text_3">
|
| 3953 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(499.556101 549.280197) rotate(-45)">cuda_B2_Q100_H8_E256_L4_P4</text>
|
| 3954 |
</g>
|
| 3955 |
</g>
|
| 3956 |
<g id="xtick_4">
|
| 3957 |
<g id="grid-x--4" class="grid grid-x">
|
| 3958 |
+
<path d="M 788.898763 425.105974 L 788.898763 26.88 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 3959 |
</g>
|
| 3960 |
<g id="line2d_4">
|
| 3961 |
<g>
|
| 3962 |
+
<use ns4:href="#mafb3703e5b" x="788.898763" y="425.105974" style="stroke: #000000; stroke-width: 0.8" />
|
| 3963 |
</g>
|
| 3964 |
</g>
|
| 3965 |
<g id="text_4">
|
| 3966 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(734.850946 549.280197) rotate(-45)">cuda_B2_Q300_H8_E256_L4_P4</text>
|
| 3967 |
</g>
|
| 3968 |
</g>
|
| 3969 |
<g id="label--x" class="xlabel">
|
| 3970 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="435.956495" y="562.545859" transform="rotate(-0 435.956495 562.545859)">Workload</text>
|
| 3971 |
</g>
|
| 3972 |
</g>
|
| 3973 |
<g id="matplotlib.axis_2">
|
| 3974 |
<g id="ytick_1">
|
| 3975 |
<g id="grid-y--2" class="grid grid-y">
|
| 3976 |
+
<path d="M 47.72 410.313695 L 824.19299 410.313695 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 3977 |
</g>
|
| 3978 |
<g id="line2d_5">
|
| 3979 |
<defs>
|
| 3980 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 3981 |
</defs>
|
| 3982 |
<g>
|
| 3983 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="410.313695" style="stroke: #000000; stroke-width: 0.8" />
|
| 3984 |
</g>
|
| 3985 |
</g>
|
| 3986 |
<g id="text_5">
|
| 3987 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="414.112914" transform="rotate(-0 40.72 414.112914)">0.0</text>
|
| 3988 |
</g>
|
| 3989 |
</g>
|
| 3990 |
<g id="ytick_2">
|
| 3991 |
<g id="grid-y--3" class="grid grid-y">
|
| 3992 |
+
<path d="M 47.72 365.88698 L 824.19299 365.88698 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 3993 |
</g>
|
| 3994 |
<g id="line2d_6">
|
| 3995 |
<g>
|
| 3996 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="365.88698" style="stroke: #000000; stroke-width: 0.8" />
|
| 3997 |
</g>
|
| 3998 |
</g>
|
| 3999 |
<g id="text_6">
|
| 4000 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="369.686199" transform="rotate(-0 40.72 369.686199)">0.5</text>
|
| 4001 |
</g>
|
| 4002 |
</g>
|
| 4003 |
<g id="ytick_3">
|
| 4004 |
<g id="grid-y--4" class="grid grid-y">
|
| 4005 |
+
<path d="M 47.72 321.460266 L 824.19299 321.460266 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4006 |
</g>
|
| 4007 |
<g id="line2d_7">
|
| 4008 |
<g>
|
| 4009 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="321.460266" style="stroke: #000000; stroke-width: 0.8" />
|
| 4010 |
</g>
|
| 4011 |
</g>
|
| 4012 |
<g id="text_7">
|
| 4013 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="325.259484" transform="rotate(-0 40.72 325.259484)">1.0</text>
|
| 4014 |
</g>
|
| 4015 |
</g>
|
| 4016 |
<g id="ytick_4">
|
| 4017 |
<g id="grid-y--5" class="grid grid-y">
|
| 4018 |
+
<path d="M 47.72 277.033551 L 824.19299 277.033551 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4019 |
</g>
|
| 4020 |
<g id="line2d_8">
|
| 4021 |
<g>
|
| 4022 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="277.033551" style="stroke: #000000; stroke-width: 0.8" />
|
| 4023 |
</g>
|
| 4024 |
</g>
|
| 4025 |
<g id="text_8">
|
| 4026 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="280.83277" transform="rotate(-0 40.72 280.83277)">1.5</text>
|
| 4027 |
</g>
|
| 4028 |
</g>
|
| 4029 |
<g id="ytick_5">
|
| 4030 |
<g id="grid-y--6" class="grid grid-y">
|
| 4031 |
+
<path d="M 47.72 232.606836 L 824.19299 232.606836 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4032 |
</g>
|
| 4033 |
<g id="line2d_9">
|
| 4034 |
<g>
|
| 4035 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="232.606836" style="stroke: #000000; stroke-width: 0.8" />
|
| 4036 |
</g>
|
| 4037 |
</g>
|
| 4038 |
<g id="text_9">
|
| 4039 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="236.406055" transform="rotate(-0 40.72 236.406055)">2.0</text>
|
| 4040 |
+
</g>
|
| 4041 |
+
</g>
|
| 4042 |
+
<g id="ytick_6">
|
| 4043 |
+
<g id="grid-y--7" class="grid grid-y">
|
| 4044 |
+
<path d="M 47.72 188.180122 L 824.19299 188.180122 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4045 |
+
</g>
|
| 4046 |
+
<g id="line2d_10">
|
| 4047 |
+
<g>
|
| 4048 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="188.180122" style="stroke: #000000; stroke-width: 0.8" />
|
| 4049 |
+
</g>
|
| 4050 |
+
</g>
|
| 4051 |
+
<g id="text_10">
|
| 4052 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="191.97934" transform="rotate(-0 40.72 191.97934)">2.5</text>
|
| 4053 |
+
</g>
|
| 4054 |
+
</g>
|
| 4055 |
+
<g id="ytick_7">
|
| 4056 |
+
<g id="grid-y--8" class="grid grid-y">
|
| 4057 |
+
<path d="M 47.72 143.753407 L 824.19299 143.753407 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4058 |
+
</g>
|
| 4059 |
+
<g id="line2d_11">
|
| 4060 |
+
<g>
|
| 4061 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="143.753407" style="stroke: #000000; stroke-width: 0.8" />
|
| 4062 |
+
</g>
|
| 4063 |
+
</g>
|
| 4064 |
+
<g id="text_11">
|
| 4065 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="147.552626" transform="rotate(-0 40.72 147.552626)">3.0</text>
|
| 4066 |
+
</g>
|
| 4067 |
+
</g>
|
| 4068 |
+
<g id="ytick_8">
|
| 4069 |
+
<g id="grid-y--9" class="grid grid-y">
|
| 4070 |
+
<path d="M 47.72 99.326692 L 824.19299 99.326692 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4071 |
+
</g>
|
| 4072 |
+
<g id="line2d_12">
|
| 4073 |
+
<g>
|
| 4074 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="99.326692" style="stroke: #000000; stroke-width: 0.8" />
|
| 4075 |
+
</g>
|
| 4076 |
+
</g>
|
| 4077 |
+
<g id="text_12">
|
| 4078 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="103.125911" transform="rotate(-0 40.72 103.125911)">3.5</text>
|
| 4079 |
+
</g>
|
| 4080 |
+
</g>
|
| 4081 |
+
<g id="ytick_9">
|
| 4082 |
+
<g id="grid-y--10" class="grid grid-y">
|
| 4083 |
+
<path d="M 47.72 54.899978 L 824.19299 54.899978 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4084 |
+
</g>
|
| 4085 |
+
<g id="line2d_13">
|
| 4086 |
+
<g>
|
| 4087 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="54.899978" style="stroke: #000000; stroke-width: 0.8" />
|
| 4088 |
+
</g>
|
| 4089 |
+
</g>
|
| 4090 |
+
<g id="text_13">
|
| 4091 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="58.699197" transform="rotate(-0 40.72 58.699197)">4.0</text>
|
| 4092 |
</g>
|
| 4093 |
</g>
|
| 4094 |
<g id="label--y" class="ylabel">
|
| 4095 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="18.737188" y="225.992987" transform="rotate(-90 18.737188 225.992987)">Latency P50 (ms)</text>
|
| 4096 |
</g>
|
| 4097 |
</g>
|
| 4098 |
<g id="series--hf-kernels-deformable-detr" class="series">
|
| 4099 |
+
<path d="M 83.014227 407.004793 L 318.309072 406.541778 L 553.603918 406.347278 L 788.898763 406.283214 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 4100 |
<defs>
|
| 4101 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4102 |
</defs>
|
| 4103 |
+
<g clip-path="url(#pb5c8282ea4)">
|
| 4104 |
+
<use ns4:href="#md7efaf3aec" x="83.014227" y="407.004793" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4105 |
+
<use ns4:href="#md7efaf3aec" x="318.309072" y="406.541778" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4106 |
+
<use ns4:href="#md7efaf3aec" x="553.603918" y="406.347278" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4107 |
+
<use ns4:href="#md7efaf3aec" x="788.898763" y="406.283214" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4108 |
</g>
|
| 4109 |
</g>
|
| 4110 |
<g id="series--torch-eager" class="series">
|
| 4111 |
+
<path d="M 83.014227 118.130211 L 318.309072 48.708671 L 553.603918 48.49098 L 788.898763 44.981181 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 4112 |
<defs>
|
| 4113 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4114 |
</defs>
|
| 4115 |
+
<g clip-path="url(#pb5c8282ea4)">
|
| 4116 |
+
<use ns4:href="#m9b8c54d372" x="83.014227" y="118.130211" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4117 |
+
<use ns4:href="#m9b8c54d372" x="318.309072" y="48.708671" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4118 |
+
<use ns4:href="#m9b8c54d372" x="553.603918" y="48.49098" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4119 |
+
<use ns4:href="#m9b8c54d372" x="788.898763" y="44.981181" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4120 |
</g>
|
| 4121 |
</g>
|
| 4122 |
<g id="patch_3">
|
| 4123 |
+
<path d="M 47.72 425.105974 L 47.72 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
|
| 4124 |
</g>
|
| 4125 |
<g id="patch_4">
|
| 4126 |
<path d="M 824.19299 425.105974 L 824.19299 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
|
| 4127 |
</g>
|
| 4128 |
<g id="patch_5">
|
| 4129 |
+
<path d="M 47.72 425.105974 L 824.19299 425.105974 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
|
| 4130 |
</g>
|
| 4131 |
<g id="patch_6">
|
| 4132 |
+
<path d="M 47.72 26.88 L 824.19299 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
|
| 4133 |
</g>
|
| 4134 |
+
<g id="text_14">
|
| 4135 |
+
<text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="435.956495" y="20.88" transform="rotate(-0 435.956495 20.88)">Attention Implementation Latency</text>
|
| 4136 |
</g>
|
| 4137 |
<g id="legend" class="legend">
|
| 4138 |
<g id="patch_7">
|
| 4139 |
+
<path d="M 54.72 64.7925 L 225.330938 64.7925 Q 227.330938 64.7925 227.330938 62.7925 L 227.330938 33.88 Q 227.330938 31.88 225.330938 31.88 L 54.72 31.88 Q 52.72 31.88 52.72 33.88 L 52.72 62.7925 Q 52.72 64.7925 54.72 64.7925 L 54.72 64.7925 z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
|
| 4140 |
</g>
|
| 4141 |
+
<g id="line2d_14">
|
| 4142 |
+
<path d="M 56.72 39.978438 L 66.72 39.978438 L 76.72 39.978438 " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 4143 |
<g>
|
| 4144 |
+
<use ns4:href="#md7efaf3aec" x="66.72" y="39.978438" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4145 |
</g>
|
| 4146 |
</g>
|
| 4147 |
<g id="legend-label--hf-kernels-deformable-detr" class="legend">
|
| 4148 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="84.72" y="43.478438" transform="rotate(-0 84.72 43.478438)">hf_kernels_deformable_detr</text>
|
| 4149 |
</g>
|
| 4150 |
+
<g id="line2d_15">
|
| 4151 |
+
<path d="M 56.72 54.934687 L 66.72 54.934687 L 76.72 54.934687 " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 4152 |
<g>
|
| 4153 |
+
<use ns4:href="#m9b8c54d372" x="66.72" y="54.934687" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4154 |
</g>
|
| 4155 |
</g>
|
| 4156 |
<g id="legend-label--torch-eager" class="legend">
|
| 4157 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="84.72" y="58.434687" transform="rotate(-0 84.72 58.434687)">torch_eager</text>
|
| 4158 |
</g>
|
| 4159 |
</g>
|
| 4160 |
</g>
|
| 4161 |
</g>
|
| 4162 |
<defs>
|
| 4163 |
+
<clipPath id="pb5c8282ea4">
|
| 4164 |
+
<rect x="47.72" y="26.88" width="776.47299" height="398.225974" />
|
| 4165 |
</clipPath>
|
| 4166 |
</defs>
|
| 4167 |
</svg>
|
|
|
|
| 4174 |
<span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
|
| 4175 |
<span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4176 |
</span> |
|
| 4177 |
+
Cell: combine | 4.43s
|
| 4178 |
| <button class="run-btn" onclick="runCell('combine')">▶ run</button>
|
| 4179 |
<button class="copy-btn" onclick="copyCell('combine')">Copy</button>
|
| 4180 |
<a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 4261 |
COMBINED BENCHMARK SUMMARY
|
| 4262 |
|
| 4263 |
impl wl p50(ms) ok
|
| 4264 |
+
hf_kernels_deformable_detr cuda_B1_Q100_H8_E256_L4_P4 0.04 True
|
| 4265 |
hf_kernels_deformable_detr cuda_B1_Q300_H8_E256_L4_P4 0.04 True
|
| 4266 |
hf_kernels_deformable_detr cuda_B2_Q100_H8_E256_L4_P4 0.04 True
|
| 4267 |
hf_kernels_deformable_detr cuda_B2_Q300_H8_E256_L4_P4 0.05 True
|
| 4268 |
+
torch_eager cuda_B1_Q100_H8_E256_L4_P4 3.29 True
|
| 4269 |
+
torch_eager cuda_B1_Q300_H8_E256_L4_P4 4.07 True
|
| 4270 |
+
torch_eager cuda_B2_Q100_H8_E256_L4_P4 4.07 True
|
| 4271 |
+
torch_eager cuda_B2_Q300_H8_E256_L4_P4 4.11 True
|
| 4272 |
|
| 4273 |
GENERATING COMBINED VISUALIZATION
|
| 4274 |
|
|
|
|
| 4288 |
<div class="uv-install-logs" id="uv-logs-combine">
|
| 4289 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4290 |
<div class="uv-logs-content" style="display: none;">
|
| 4291 |
+
Installed 37 packages in 282ms
|
| 4292 |
</div>
|
| 4293 |
</div>
|
| 4294 |
<div class="cell-artifacts">
|
|
|
|
| 4301 |
<rdf:RDF>
|
| 4302 |
<ns2:Work>
|
| 4303 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 4304 |
+
<dc:date>2025-12-19T19:10:04.668129</dc:date>
|
| 4305 |
<dc:format>image/svg+xml</dc:format>
|
| 4306 |
<dc:creator>
|
| 4307 |
<ns2:Agent>
|
| 4308 |
+
<dc:title>Matplotlib v3.10.8, https://matplotlib.org/</dc:title>
|
| 4309 |
</ns2:Agent>
|
| 4310 |
</dc:creator>
|
| 4311 |
</ns2:Work>
|
|
|
|
| 4320 |
</g>
|
| 4321 |
<g id="axes--1" class="axes">
|
| 4322 |
<g id="patch_2">
|
| 4323 |
+
<path d="M 47.72 425.105974 L 824.19299 425.105974 L 824.19299 26.88 L 47.72 26.88 L 47.72 425.105974 z " style="fill: none" />
|
| 4324 |
</g>
|
| 4325 |
<g id="matplotlib.axis_1">
|
| 4326 |
<g id="xtick_1">
|
| 4327 |
<g id="grid-x--1" class="grid grid-x">
|
| 4328 |
+
<path d="M 83.014227 425.105974 L 83.014227 26.88 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4329 |
</g>
|
| 4330 |
<g id="line2d_1">
|
| 4331 |
<defs>
|
| 4332 |
<path id="mafb3703e5b" d="M 0 0 L 0 3.5 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4333 |
</defs>
|
| 4334 |
<g>
|
| 4335 |
+
<use ns4:href="#mafb3703e5b" x="83.014227" y="425.105974" style="stroke: #000000; stroke-width: 0.8" />
|
| 4336 |
</g>
|
| 4337 |
</g>
|
| 4338 |
<g id="text_1">
|
| 4339 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(28.96641 549.280197) rotate(-45)">cuda_B1_Q100_H8_E256_L4_P4</text>
|
| 4340 |
</g>
|
| 4341 |
</g>
|
| 4342 |
<g id="xtick_2">
|
| 4343 |
<g id="grid-x--2" class="grid grid-x">
|
| 4344 |
+
<path d="M 318.309072 425.105974 L 318.309072 26.88 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4345 |
</g>
|
| 4346 |
<g id="line2d_2">
|
| 4347 |
<g>
|
| 4348 |
+
<use ns4:href="#mafb3703e5b" x="318.309072" y="425.105974" style="stroke: #000000; stroke-width: 0.8" />
|
| 4349 |
</g>
|
| 4350 |
</g>
|
| 4351 |
<g id="text_2">
|
| 4352 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(264.261255 549.280197) rotate(-45)">cuda_B1_Q300_H8_E256_L4_P4</text>
|
| 4353 |
</g>
|
| 4354 |
</g>
|
| 4355 |
<g id="xtick_3">
|
| 4356 |
<g id="grid-x--3" class="grid grid-x">
|
| 4357 |
+
<path d="M 553.603918 425.105974 L 553.603918 26.88 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4358 |
</g>
|
| 4359 |
<g id="line2d_3">
|
| 4360 |
<g>
|
| 4361 |
+
<use ns4:href="#mafb3703e5b" x="553.603918" y="425.105974" style="stroke: #000000; stroke-width: 0.8" />
|
| 4362 |
</g>
|
| 4363 |
</g>
|
| 4364 |
<g id="text_3">
|
| 4365 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(499.556101 549.280197) rotate(-45)">cuda_B2_Q100_H8_E256_L4_P4</text>
|
| 4366 |
</g>
|
| 4367 |
</g>
|
| 4368 |
<g id="xtick_4">
|
| 4369 |
<g id="grid-x--4" class="grid grid-x">
|
| 4370 |
+
<path d="M 788.898763 425.105974 L 788.898763 26.88 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4371 |
</g>
|
| 4372 |
<g id="line2d_4">
|
| 4373 |
<g>
|
| 4374 |
+
<use ns4:href="#mafb3703e5b" x="788.898763" y="425.105974" style="stroke: #000000; stroke-width: 0.8" />
|
| 4375 |
</g>
|
| 4376 |
</g>
|
| 4377 |
<g id="text_4">
|
| 4378 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(734.850946 549.280197) rotate(-45)">cuda_B2_Q300_H8_E256_L4_P4</text>
|
| 4379 |
</g>
|
| 4380 |
</g>
|
| 4381 |
<g id="label--x" class="xlabel">
|
| 4382 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="435.956495" y="562.545859" transform="rotate(-0 435.956495 562.545859)">Workload</text>
|
| 4383 |
</g>
|
| 4384 |
</g>
|
| 4385 |
<g id="matplotlib.axis_2">
|
| 4386 |
<g id="ytick_1">
|
| 4387 |
<g id="grid-y--2" class="grid grid-y">
|
| 4388 |
+
<path d="M 47.72 410.313695 L 824.19299 410.313695 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4389 |
</g>
|
| 4390 |
<g id="line2d_5">
|
| 4391 |
<defs>
|
| 4392 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4393 |
</defs>
|
| 4394 |
<g>
|
| 4395 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="410.313695" style="stroke: #000000; stroke-width: 0.8" />
|
| 4396 |
</g>
|
| 4397 |
</g>
|
| 4398 |
<g id="text_5">
|
| 4399 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="414.112914" transform="rotate(-0 40.72 414.112914)">0.0</text>
|
| 4400 |
</g>
|
| 4401 |
</g>
|
| 4402 |
<g id="ytick_2">
|
| 4403 |
<g id="grid-y--3" class="grid grid-y">
|
| 4404 |
+
<path d="M 47.72 365.88698 L 824.19299 365.88698 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4405 |
</g>
|
| 4406 |
<g id="line2d_6">
|
| 4407 |
<g>
|
| 4408 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="365.88698" style="stroke: #000000; stroke-width: 0.8" />
|
| 4409 |
</g>
|
| 4410 |
</g>
|
| 4411 |
<g id="text_6">
|
| 4412 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="369.686199" transform="rotate(-0 40.72 369.686199)">0.5</text>
|
| 4413 |
</g>
|
| 4414 |
</g>
|
| 4415 |
<g id="ytick_3">
|
| 4416 |
<g id="grid-y--4" class="grid grid-y">
|
| 4417 |
+
<path d="M 47.72 321.460266 L 824.19299 321.460266 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4418 |
</g>
|
| 4419 |
<g id="line2d_7">
|
| 4420 |
<g>
|
| 4421 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="321.460266" style="stroke: #000000; stroke-width: 0.8" />
|
| 4422 |
</g>
|
| 4423 |
</g>
|
| 4424 |
<g id="text_7">
|
| 4425 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="325.259484" transform="rotate(-0 40.72 325.259484)">1.0</text>
|
| 4426 |
</g>
|
| 4427 |
</g>
|
| 4428 |
<g id="ytick_4">
|
| 4429 |
<g id="grid-y--5" class="grid grid-y">
|
| 4430 |
+
<path d="M 47.72 277.033551 L 824.19299 277.033551 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4431 |
</g>
|
| 4432 |
<g id="line2d_8">
|
| 4433 |
<g>
|
| 4434 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="277.033551" style="stroke: #000000; stroke-width: 0.8" />
|
| 4435 |
</g>
|
| 4436 |
</g>
|
| 4437 |
<g id="text_8">
|
| 4438 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="280.83277" transform="rotate(-0 40.72 280.83277)">1.5</text>
|
| 4439 |
</g>
|
| 4440 |
</g>
|
| 4441 |
<g id="ytick_5">
|
| 4442 |
<g id="grid-y--6" class="grid grid-y">
|
| 4443 |
+
<path d="M 47.72 232.606836 L 824.19299 232.606836 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4444 |
</g>
|
| 4445 |
<g id="line2d_9">
|
| 4446 |
<g>
|
| 4447 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="232.606836" style="stroke: #000000; stroke-width: 0.8" />
|
| 4448 |
</g>
|
| 4449 |
</g>
|
| 4450 |
<g id="text_9">
|
| 4451 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="236.406055" transform="rotate(-0 40.72 236.406055)">2.0</text>
|
| 4452 |
+
</g>
|
| 4453 |
+
</g>
|
| 4454 |
+
<g id="ytick_6">
|
| 4455 |
+
<g id="grid-y--7" class="grid grid-y">
|
| 4456 |
+
<path d="M 47.72 188.180122 L 824.19299 188.180122 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4457 |
+
</g>
|
| 4458 |
+
<g id="line2d_10">
|
| 4459 |
+
<g>
|
| 4460 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="188.180122" style="stroke: #000000; stroke-width: 0.8" />
|
| 4461 |
+
</g>
|
| 4462 |
+
</g>
|
| 4463 |
+
<g id="text_10">
|
| 4464 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="191.97934" transform="rotate(-0 40.72 191.97934)">2.5</text>
|
| 4465 |
+
</g>
|
| 4466 |
+
</g>
|
| 4467 |
+
<g id="ytick_7">
|
| 4468 |
+
<g id="grid-y--8" class="grid grid-y">
|
| 4469 |
+
<path d="M 47.72 143.753407 L 824.19299 143.753407 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4470 |
+
</g>
|
| 4471 |
+
<g id="line2d_11">
|
| 4472 |
+
<g>
|
| 4473 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="143.753407" style="stroke: #000000; stroke-width: 0.8" />
|
| 4474 |
+
</g>
|
| 4475 |
+
</g>
|
| 4476 |
+
<g id="text_11">
|
| 4477 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="147.552626" transform="rotate(-0 40.72 147.552626)">3.0</text>
|
| 4478 |
+
</g>
|
| 4479 |
+
</g>
|
| 4480 |
+
<g id="ytick_8">
|
| 4481 |
+
<g id="grid-y--9" class="grid grid-y">
|
| 4482 |
+
<path d="M 47.72 99.326692 L 824.19299 99.326692 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4483 |
+
</g>
|
| 4484 |
+
<g id="line2d_12">
|
| 4485 |
+
<g>
|
| 4486 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="99.326692" style="stroke: #000000; stroke-width: 0.8" />
|
| 4487 |
+
</g>
|
| 4488 |
+
</g>
|
| 4489 |
+
<g id="text_12">
|
| 4490 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="103.125911" transform="rotate(-0 40.72 103.125911)">3.5</text>
|
| 4491 |
+
</g>
|
| 4492 |
+
</g>
|
| 4493 |
+
<g id="ytick_9">
|
| 4494 |
+
<g id="grid-y--10" class="grid grid-y">
|
| 4495 |
+
<path d="M 47.72 54.899978 L 824.19299 54.899978 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4496 |
+
</g>
|
| 4497 |
+
<g id="line2d_13">
|
| 4498 |
+
<g>
|
| 4499 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="54.899978" style="stroke: #000000; stroke-width: 0.8" />
|
| 4500 |
+
</g>
|
| 4501 |
+
</g>
|
| 4502 |
+
<g id="text_13">
|
| 4503 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="58.699197" transform="rotate(-0 40.72 58.699197)">4.0</text>
|
| 4504 |
</g>
|
| 4505 |
</g>
|
| 4506 |
<g id="label--y" class="ylabel">
|
| 4507 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="18.737188" y="225.992987" transform="rotate(-90 18.737188 225.992987)">Latency P50 (ms)</text>
|
| 4508 |
</g>
|
| 4509 |
</g>
|
| 4510 |
<g id="series--hf-kernels-deformable-detr" class="series">
|
| 4511 |
+
<path d="M 83.014227 407.004793 L 318.309072 406.541778 L 553.603918 406.347278 L 788.898763 406.283214 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 4512 |
<defs>
|
| 4513 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4514 |
</defs>
|
| 4515 |
+
<g clip-path="url(#pb5c8282ea4)">
|
| 4516 |
+
<use ns4:href="#md7efaf3aec" x="83.014227" y="407.004793" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4517 |
+
<use ns4:href="#md7efaf3aec" x="318.309072" y="406.541778" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4518 |
+
<use ns4:href="#md7efaf3aec" x="553.603918" y="406.347278" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4519 |
+
<use ns4:href="#md7efaf3aec" x="788.898763" y="406.283214" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4520 |
</g>
|
| 4521 |
</g>
|
| 4522 |
<g id="series--torch-eager" class="series">
|
| 4523 |
+
<path d="M 83.014227 118.130211 L 318.309072 48.708671 L 553.603918 48.49098 L 788.898763 44.981181 " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 4524 |
<defs>
|
| 4525 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4526 |
</defs>
|
| 4527 |
+
<g clip-path="url(#pb5c8282ea4)">
|
| 4528 |
+
<use ns4:href="#m9b8c54d372" x="83.014227" y="118.130211" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4529 |
+
<use ns4:href="#m9b8c54d372" x="318.309072" y="48.708671" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4530 |
+
<use ns4:href="#m9b8c54d372" x="553.603918" y="48.49098" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4531 |
+
<use ns4:href="#m9b8c54d372" x="788.898763" y="44.981181" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4532 |
</g>
|
| 4533 |
</g>
|
| 4534 |
<g id="patch_3">
|
| 4535 |
+
<path d="M 47.72 425.105974 L 47.72 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
|
| 4536 |
</g>
|
| 4537 |
<g id="patch_4">
|
| 4538 |
<path d="M 824.19299 425.105974 L 824.19299 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
|
| 4539 |
</g>
|
| 4540 |
<g id="patch_5">
|
| 4541 |
+
<path d="M 47.72 425.105974 L 824.19299 425.105974 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
|
| 4542 |
</g>
|
| 4543 |
<g id="patch_6">
|
| 4544 |
+
<path d="M 47.72 26.88 L 824.19299 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
|
| 4545 |
</g>
|
| 4546 |
+
<g id="text_14">
|
| 4547 |
+
<text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="435.956495" y="20.88" transform="rotate(-0 435.956495 20.88)">Attention Implementation Latency</text>
|
| 4548 |
</g>
|
| 4549 |
<g id="legend" class="legend">
|
| 4550 |
<g id="patch_7">
|
| 4551 |
+
<path d="M 54.72 64.7925 L 225.330938 64.7925 Q 227.330938 64.7925 227.330938 62.7925 L 227.330938 33.88 Q 227.330938 31.88 225.330938 31.88 L 54.72 31.88 Q 52.72 31.88 52.72 33.88 L 52.72 62.7925 Q 52.72 64.7925 54.72 64.7925 L 54.72 64.7925 z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
|
| 4552 |
</g>
|
| 4553 |
+
<g id="line2d_14">
|
| 4554 |
+
<path d="M 56.72 39.978438 L 66.72 39.978438 L 76.72 39.978438 " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 4555 |
<g>
|
| 4556 |
+
<use ns4:href="#md7efaf3aec" x="66.72" y="39.978438" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4557 |
</g>
|
| 4558 |
</g>
|
| 4559 |
<g id="legend-label--hf-kernels-deformable-detr" class="legend">
|
| 4560 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="84.72" y="43.478438" transform="rotate(-0 84.72 43.478438)">hf_kernels_deformable_detr</text>
|
| 4561 |
</g>
|
| 4562 |
+
<g id="line2d_15">
|
| 4563 |
+
<path d="M 56.72 54.934687 L 66.72 54.934687 L 76.72 54.934687 " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 4564 |
<g>
|
| 4565 |
+
<use ns4:href="#m9b8c54d372" x="66.72" y="54.934687" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4566 |
</g>
|
| 4567 |
</g>
|
| 4568 |
<g id="legend-label--torch-eager" class="legend">
|
| 4569 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="84.72" y="58.434687" transform="rotate(-0 84.72 58.434687)">torch_eager</text>
|
| 4570 |
</g>
|
| 4571 |
</g>
|
| 4572 |
</g>
|
| 4573 |
</g>
|
| 4574 |
<defs>
|
| 4575 |
+
<clipPath id="pb5c8282ea4">
|
| 4576 |
+
<rect x="47.72" y="26.88" width="776.47299" height="398.225974" />
|
| 4577 |
</clipPath>
|
| 4578 |
</defs>
|
| 4579 |
</svg>
|
flash_attn/impls/artifacts/benchmark/attention.jsonl
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
-
{"ts": "2025-
|
| 2 |
-
{"ts": "2025-
|
| 3 |
-
{"ts": "2025-
|
| 4 |
-
{"ts": "2025-
|
| 5 |
-
{"ts": "2025-
|
| 6 |
-
{"ts": "2025-
|
|
|
|
| 1 |
+
{"ts": "2025-12-19T18:57:16Z", "run": "acbd7f3686fd441a96acd6946b221ed9", "impl": "sage_int8_fp16", "tags": {"family": "sageattention", "backend": "int8_fp16_cuda", "compile": "none"}, "wl": {"name": "cuda_attn_L128_bfloat16", "batch": 1, "seq_len": 4224, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": null, "compile_ms": null, "peak_bytes": null, "ok": false, "absmax": null, "corr": {}, "err": {"type": "AttributeError", "msg": "module 'sage_attention_e8dcde4226fe38e6' has no attribute 'fwd'"}}
|
| 2 |
+
{"ts": "2025-12-19T18:57:16Z", "run": "acbd7f3686fd441a96acd6946b221ed9", "impl": "sage_int8_fp16", "tags": {"family": "sageattention", "backend": "int8_fp16_cuda", "compile": "none"}, "wl": {"name": "cuda_attn_L256_bfloat16", "batch": 1, "seq_len": 4352, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": null, "compile_ms": null, "peak_bytes": null, "ok": false, "absmax": null, "corr": {}, "err": {"type": "AttributeError", "msg": "module 'sage_attention_e8dcde4226fe38e6' has no attribute 'fwd'"}}
|
| 3 |
+
{"ts": "2025-12-19T18:57:16Z", "run": "acbd7f3686fd441a96acd6946b221ed9", "impl": "sage_int8_fp16", "tags": {"family": "sageattention", "backend": "int8_fp16_cuda", "compile": "none"}, "wl": {"name": "cuda_attn_L320_bfloat16", "batch": 1, "seq_len": 4416, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": null, "compile_ms": null, "peak_bytes": null, "ok": false, "absmax": null, "corr": {}, "err": {"type": "AttributeError", "msg": "module 'sage_attention_e8dcde4226fe38e6' has no attribute 'fwd'"}}
|
| 4 |
+
{"ts": "2025-12-19T18:57:16Z", "run": "acbd7f3686fd441a96acd6946b221ed9", "impl": "sage_int8_fp16", "tags": {"family": "sageattention", "backend": "int8_fp16_cuda", "compile": "none"}, "wl": {"name": "cuda_attn_L384_bfloat16", "batch": 1, "seq_len": 4480, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": null, "compile_ms": null, "peak_bytes": null, "ok": false, "absmax": null, "corr": {}, "err": {"type": "AttributeError", "msg": "module 'sage_attention_e8dcde4226fe38e6' has no attribute 'fwd'"}}
|
| 5 |
+
{"ts": "2025-12-19T18:57:16Z", "run": "acbd7f3686fd441a96acd6946b221ed9", "impl": "sage_int8_fp16", "tags": {"family": "sageattention", "backend": "int8_fp16_cuda", "compile": "none"}, "wl": {"name": "cuda_attn_L448_bfloat16", "batch": 1, "seq_len": 4544, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": null, "compile_ms": null, "peak_bytes": null, "ok": false, "absmax": null, "corr": {}, "err": {"type": "AttributeError", "msg": "module 'sage_attention_e8dcde4226fe38e6' has no attribute 'fwd'"}}
|
| 6 |
+
{"ts": "2025-12-19T18:57:16Z", "run": "acbd7f3686fd441a96acd6946b221ed9", "impl": "sage_int8_fp16", "tags": {"family": "sageattention", "backend": "int8_fp16_cuda", "compile": "none"}, "wl": {"name": "cuda_attn_L512_bfloat16", "batch": 1, "seq_len": 4608, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": null, "compile_ms": null, "peak_bytes": null, "ok": false, "absmax": null, "corr": {}, "err": {"type": "AttributeError", "msg": "module 'sage_attention_e8dcde4226fe38e6' has no attribute 'fwd'"}}
|
flash_attn/impls/cells/benchmark.py
CHANGED
|
@@ -3,8 +3,8 @@
|
|
| 3 |
# dependencies = [
|
| 4 |
# "numpy",
|
| 5 |
# "torch==2.8.0",
|
| 6 |
-
# "kernels-benchmark-tools",
|
| 7 |
# "kernels",
|
|
|
|
| 8 |
# ]
|
| 9 |
#
|
| 10 |
# [tool.uv.sources]
|
|
@@ -15,17 +15,18 @@ import sys
|
|
| 15 |
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
|
| 16 |
from kernels import get_kernel
|
| 17 |
|
| 18 |
-
# Load the
|
| 19 |
-
|
| 20 |
|
| 21 |
|
| 22 |
-
def
|
| 23 |
-
|
|
|
|
| 24 |
|
| 25 |
|
| 26 |
run_benchmark(
|
| 27 |
kernel_type=KernelTypeEnum.ATTENTION,
|
| 28 |
-
impl_name="
|
| 29 |
-
impl_tags={"family": "
|
| 30 |
-
impl_func=
|
| 31 |
)
|
|
|
|
| 3 |
# dependencies = [
|
| 4 |
# "numpy",
|
| 5 |
# "torch==2.8.0",
|
|
|
|
| 6 |
# "kernels",
|
| 7 |
+
# "kernels-benchmark-tools",
|
| 8 |
# ]
|
| 9 |
#
|
| 10 |
# [tool.uv.sources]
|
|
|
|
| 15 |
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
|
| 16 |
from kernels import get_kernel
|
| 17 |
|
| 18 |
+
# Load the sage attention kernel
|
| 19 |
+
hf_kernels_sage_attn = get_kernel("kernels-community/sage_attention")
|
| 20 |
|
| 21 |
|
| 22 |
+
def sage_attention(query, key, value):
|
| 23 |
+
"""SageAttention with INT8 Q/K quantization and FP16 P/V"""
|
| 24 |
+
return hf_kernels_sage_attn.fwd(query, key, value, is_causal=False)[0]
|
| 25 |
|
| 26 |
|
| 27 |
run_benchmark(
|
| 28 |
kernel_type=KernelTypeEnum.ATTENTION,
|
| 29 |
+
impl_name="sage_int8_fp16",
|
| 30 |
+
impl_tags={"family": "sageattention", "backend": "int8_fp16_cuda", "compile": "none"},
|
| 31 |
+
impl_func=sage_attention,
|
| 32 |
)
|
flash_attn/impls/flash_attention.html
CHANGED
|
@@ -3874,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 3874 |
<div class="system-info">
|
| 3875 |
<div class="system-info-header">Generated on:</div>
|
| 3876 |
<div class="system-info-content">
|
| 3877 |
-
Linux x86_64 | Linux-6.12.
|
| 3878 |
</div>
|
| 3879 |
</div>
|
| 3880 |
|
|
@@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 3888 |
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3889 |
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3890 |
</span> |
|
| 3891 |
-
Cell: nv | 0.
|
| 3892 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3893 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3894 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3905,16 +3905,16 @@ Cell: nv | 0.26s
|
|
| 3905 |
</div>
|
| 3906 |
</div>
|
| 3907 |
<div id="output-nv" class="cell-output">
|
| 3908 |
-
<div class="cell-stdout"><pre class="stdout-text">
|
| 3909 |
+-----------------------------------------------------------------------------------------+
|
| 3910 |
-
| NVIDIA-SMI 580.
|
| 3911 |
+-----------------------------------------+------------------------+----------------------+
|
| 3912 |
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
|
| 3913 |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
|
| 3914 |
| | | MIG M. |
|
| 3915 |
|=========================================+========================+======================|
|
| 3916 |
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
|
| 3917 |
-
| N/A
|
| 3918 |
| | | N/A |
|
| 3919 |
+-----------------------------------------+------------------------+----------------------+
|
| 3920 |
|
|
@@ -3938,7 +3938,7 @@ Cell: nv | 0.26s
|
|
| 3938 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3939 |
<span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3940 |
</span> |
|
| 3941 |
-
Cell: benchmark | 4.
|
| 3942 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3943 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3944 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3989,29 +3989,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L128_bfloat16
|
|
| 3989 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3990 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3991 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3992 |
-
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 3993 |
-
torch_flash_ma 5.
|
| 3994 |
-
aten::scaled_dot_product_attention 0.
|
| 3995 |
-
aten::_scaled_dot_product_flash_attention 0.
|
| 3996 |
-
aten::_flash_attention_forward 0.70%
|
| 3997 |
-
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3998 |
-
aten::contiguous 0.
|
| 3999 |
-
aten::clone 0.
|
| 4000 |
-
aten::copy_ 1.
|
| 4001 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4002 |
-
Activity Buffer Request 32.
|
| 4003 |
-
aten::transpose 1.
|
| 4004 |
-
aten::as_strided 0.
|
| 4005 |
-
aten::empty_like 0.
|
| 4006 |
-
aten::empty 1.
|
| 4007 |
-
cudaLaunchKernel 2.
|
| 4008 |
-
aten::empty_strided 0.
|
| 4009 |
-
cudaDeviceGetAttribute 0.04% 2.
|
| 4010 |
-
cudaFuncSetAttribute 0.
|
| 4011 |
-
cudaDeviceSynchronize
|
| 4012 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4013 |
-
Self CPU time total: 5.
|
| 4014 |
-
Self CUDA time total: 3.
|
| 4015 |
|
| 4016 |
|
| 4017 |
|
|
@@ -4021,29 +4021,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L256_bfloat16
|
|
| 4021 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4022 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4023 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4024 |
-
torch_flash_ma 4.
|
| 4025 |
-
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4026 |
-
aten::scaled_dot_product_attention 0.
|
| 4027 |
-
aten::_scaled_dot_product_flash_attention 0.
|
| 4028 |
-
aten::_flash_attention_forward 0.
|
| 4029 |
-
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 4030 |
-
aten::contiguous 0.19% 10.
|
| 4031 |
-
aten::clone 0.
|
| 4032 |
-
aten::copy_ 1.
|
| 4033 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4034 |
-
Activity Buffer Request
|
| 4035 |
-
aten::transpose 0.
|
| 4036 |
-
aten::as_strided 0.
|
| 4037 |
-
aten::empty_like 0.
|
| 4038 |
-
aten::empty 1.
|
| 4039 |
-
cudaLaunchKernel
|
| 4040 |
-
aten::empty_strided 0.
|
| 4041 |
-
cudaDeviceGetAttribute 0.
|
| 4042 |
-
cudaFuncSetAttribute 0.
|
| 4043 |
-
cudaDeviceSynchronize
|
| 4044 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4045 |
-
Self CPU time total: 5.
|
| 4046 |
-
Self CUDA time total: 3.
|
| 4047 |
|
| 4048 |
|
| 4049 |
|
|
@@ -4053,29 +4053,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L320_bfloat16
|
|
| 4053 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4054 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4055 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4056 |
-
torch_flash_ma 4.
|
| 4057 |
-
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4058 |
-
aten::scaled_dot_product_attention 0.
|
| 4059 |
-
aten::_scaled_dot_product_flash_attention 0.
|
| 4060 |
-
aten::_flash_attention_forward 0.
|
| 4061 |
-
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4062 |
-
aten::contiguous 0.
|
| 4063 |
-
aten::clone 0.
|
| 4064 |
-
aten::copy_ 1.
|
| 4065 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4066 |
-
Activity Buffer Request 30.
|
| 4067 |
-
aten::transpose 0.
|
| 4068 |
-
aten::as_strided 0.
|
| 4069 |
-
aten::empty_like 0.
|
| 4070 |
-
aten::empty 1.
|
| 4071 |
-
cudaLaunchKernel 1.
|
| 4072 |
-
aten::empty_strided 0.
|
| 4073 |
-
cudaDeviceGetAttribute 0.
|
| 4074 |
-
cudaFuncSetAttribute 0.
|
| 4075 |
-
cudaDeviceSynchronize 55.
|
| 4076 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4077 |
-
Self CPU time total: 5.
|
| 4078 |
-
Self CUDA time total: 3.
|
| 4079 |
|
| 4080 |
|
| 4081 |
|
|
@@ -4085,29 +4085,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L384_bfloat16
|
|
| 4085 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4086 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4087 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4088 |
-
torch_flash_ma 4.
|
| 4089 |
-
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4090 |
-
aten::scaled_dot_product_attention 0.
|
| 4091 |
-
aten::_scaled_dot_product_flash_attention 0.
|
| 4092 |
-
aten::_flash_attention_forward 0.
|
| 4093 |
-
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4094 |
-
aten::contiguous 0.17%
|
| 4095 |
-
aten::clone 0.
|
| 4096 |
-
aten::copy_ 1.
|
| 4097 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4098 |
-
Activity Buffer Request
|
| 4099 |
-
aten::transpose 0.
|
| 4100 |
-
aten::as_strided 0.33% 19.
|
| 4101 |
-
aten::empty_like 0.
|
| 4102 |
-
aten::empty 1.
|
| 4103 |
-
cudaLaunchKernel 5.
|
| 4104 |
-
aten::empty_strided 0.
|
| 4105 |
-
cudaDeviceGetAttribute 0.
|
| 4106 |
-
cudaFuncSetAttribute 0.
|
| 4107 |
-
cudaDeviceSynchronize
|
| 4108 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4109 |
-
Self CPU time total: 5.
|
| 4110 |
-
Self CUDA time total: 3.
|
| 4111 |
|
| 4112 |
|
| 4113 |
|
|
@@ -4117,29 +4117,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L448_bfloat16
|
|
| 4117 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4118 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4119 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4120 |
-
torch_flash_ma 4.
|
| 4121 |
-
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.
|
| 4122 |
-
aten::scaled_dot_product_attention 0.
|
| 4123 |
-
aten::_scaled_dot_product_flash_attention 0.
|
| 4124 |
-
aten::_flash_attention_forward 0.
|
| 4125 |
-
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4126 |
-
aten::contiguous 0.
|
| 4127 |
-
aten::clone 0.
|
| 4128 |
-
aten::copy_ 1.
|
| 4129 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 803.
|
| 4130 |
-
Activity Buffer Request
|
| 4131 |
-
aten::transpose 0.
|
| 4132 |
-
aten::as_strided 0.
|
| 4133 |
-
aten::empty_like 0.
|
| 4134 |
-
aten::empty 1.
|
| 4135 |
-
cudaLaunchKernel
|
| 4136 |
-
aten::empty_strided 0.23% 14.
|
| 4137 |
-
cudaDeviceGetAttribute 0.
|
| 4138 |
-
cudaFuncSetAttribute 0.
|
| 4139 |
-
cudaDeviceSynchronize
|
| 4140 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4141 |
-
Self CPU time total: 6.
|
| 4142 |
-
Self CUDA time total: 4.
|
| 4143 |
|
| 4144 |
|
| 4145 |
|
|
@@ -4149,38 +4149,38 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L512_bfloat16
|
|
| 4149 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4150 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4151 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4152 |
-
torch_flash_ma 3.
|
| 4153 |
-
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.
|
| 4154 |
-
aten::scaled_dot_product_attention 0.
|
| 4155 |
-
aten::_scaled_dot_product_flash_attention 0.28%
|
| 4156 |
-
aten::_flash_attention_forward 0.
|
| 4157 |
-
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4158 |
-
aten::contiguous 0.14%
|
| 4159 |
-
aten::clone 0.41% 26.
|
| 4160 |
-
aten::copy_ 1.
|
| 4161 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4162 |
-
Activity Buffer Request
|
| 4163 |
-
aten::transpose 0.
|
| 4164 |
-
aten::as_strided 0.
|
| 4165 |
-
aten::empty_like 0.
|
| 4166 |
-
aten::empty 1.
|
| 4167 |
-
cudaLaunchKernel
|
| 4168 |
-
aten::empty_strided 0.
|
| 4169 |
-
cudaDeviceGetAttribute 0.
|
| 4170 |
-
cudaFuncSetAttribute 0.
|
| 4171 |
-
cudaDeviceSynchronize 58.
|
| 4172 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4173 |
-
Self CPU time total: 6.
|
| 4174 |
-
Self CUDA time total: 4.
|
| 4175 |
|
| 4176 |
|
| 4177 |
impl wl p50(ms) ok
|
| 4178 |
-
torch_flash_ma cuda_attn_L128_bfloat16 1.
|
| 4179 |
-
torch_flash_ma cuda_attn_L256_bfloat16 1.
|
| 4180 |
-
torch_flash_ma cuda_attn_L320_bfloat16 1.
|
| 4181 |
-
torch_flash_ma cuda_attn_L384_bfloat16 1.
|
| 4182 |
-
torch_flash_ma cuda_attn_L448_bfloat16 1.
|
| 4183 |
-
torch_flash_ma cuda_attn_L512_bfloat16 1.
|
| 4184 |
</pre></div>
|
| 4185 |
<div class="cell-artifacts">
|
| 4186 |
<h4>Artifacts:</h4>
|
|
|
|
| 3874 |
<div class="system-info">
|
| 3875 |
<div class="system-info-header">Generated on:</div>
|
| 3876 |
<div class="system-info-content">
|
| 3877 |
+
Linux x86_64 | Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35
|
| 3878 |
</div>
|
| 3879 |
</div>
|
| 3880 |
|
|
|
|
| 3888 |
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3889 |
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3890 |
</span> |
|
| 3891 |
+
Cell: nv | 0.28s
|
| 3892 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3893 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3894 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3905 |
</div>
|
| 3906 |
</div>
|
| 3907 |
<div id="output-nv" class="cell-output">
|
| 3908 |
+
<div class="cell-stdout"><pre class="stdout-text">Fri Dec 19 18:57:02 2025
|
| 3909 |
+-----------------------------------------------------------------------------------------+
|
| 3910 |
+
| NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 |
|
| 3911 |
+-----------------------------------------+------------------------+----------------------+
|
| 3912 |
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
|
| 3913 |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
|
| 3914 |
| | | MIG M. |
|
| 3915 |
|=========================================+========================+======================|
|
| 3916 |
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
|
| 3917 |
+
| N/A 34C P0 103W / 350W | 0MiB / 46068MiB | 31% Default |
|
| 3918 |
| | | N/A |
|
| 3919 |
+-----------------------------------------+------------------------+----------------------+
|
| 3920 |
|
|
|
|
| 3938 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3939 |
<span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3940 |
</span> |
|
| 3941 |
+
Cell: benchmark | 4.28s
|
| 3942 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3943 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3944 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3989 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3990 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3991 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3992 |
+
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.564ms 102.03% 3.564ms 3.564ms 1
|
| 3993 |
+
torch_flash_ma 5.92% 322.864us 49.31% 2.690ms 2.690ms 0.000us 0.00% 3.533ms 3.533ms 1
|
| 3994 |
+
aten::scaled_dot_product_attention 0.71% 38.601us 3.97% 216.634us 72.211us 0.000us 0.00% 2.778ms 926.157us 3
|
| 3995 |
+
aten::_scaled_dot_product_flash_attention 0.48% 26.049us 3.26% 178.033us 59.344us 0.000us 0.00% 2.778ms 926.157us 3
|
| 3996 |
+
aten::_flash_attention_forward 0.70% 38.244us 2.37% 129.043us 43.014us 2.778ms 79.53% 2.778ms 926.157us 3
|
| 3997 |
+
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.778ms 79.53% 2.778ms 926.157us 3
|
| 3998 |
+
aten::contiguous 0.25% 13.590us 38.20% 2.084ms 173.652us 0.000us 0.00% 754.825us 62.902us 12
|
| 3999 |
+
aten::clone 0.64% 35.000us 37.95% 2.070ms 172.519us 0.000us 0.00% 754.825us 62.902us 12
|
| 4000 |
+
aten::copy_ 1.68% 91.923us 35.78% 1.952ms 162.645us 715.017us 20.47% 754.825us 62.902us 12
|
| 4001 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 715.017us 20.47% 715.017us 59.585us 12
|
| 4002 |
+
Activity Buffer Request 32.25% 1.760ms 32.25% 1.760ms 1.760ms 39.808us 1.14% 39.808us 39.808us 1
|
| 4003 |
+
aten::transpose 1.21% 66.005us 1.65% 89.944us 3.748us 0.000us 0.00% 0.000us 0.000us 24
|
| 4004 |
+
aten::as_strided 0.44% 23.939us 0.44% 23.939us 0.997us 0.000us 0.00% 0.000us 0.000us 24
|
| 4005 |
+
aten::empty_like 0.46% 24.998us 1.93% 105.512us 7.034us 0.000us 0.00% 0.000us 0.000us 15
|
| 4006 |
+
aten::empty 1.74% 94.901us 1.74% 94.901us 3.954us 0.000us 0.00% 0.000us 0.000us 24
|
| 4007 |
+
cudaLaunchKernel 2.30% 125.662us 2.30% 125.662us 8.377us 0.000us 0.00% 0.000us 0.000us 15
|
| 4008 |
+
aten::empty_strided 0.30% 16.192us 0.30% 16.192us 5.397us 0.000us 0.00% 0.000us 0.000us 3
|
| 4009 |
+
cudaDeviceGetAttribute 0.04% 2.360us 0.04% 2.360us 0.393us 0.000us 0.00% 0.000us 0.000us 6
|
| 4010 |
+
cudaFuncSetAttribute 0.19% 10.450us 0.19% 10.450us 3.483us 0.000us 0.00% 0.000us 0.000us 3
|
| 4011 |
+
cudaDeviceSynchronize 50.69% 2.765ms 50.69% 2.765ms 2.765ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4012 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4013 |
+
Self CPU time total: 5.456ms
|
| 4014 |
+
Self CUDA time total: 3.493ms
|
| 4015 |
|
| 4016 |
|
| 4017 |
|
|
|
|
| 4021 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4022 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4023 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4024 |
+
torch_flash_ma 4.71% 256.956us 44.29% 2.416ms 2.416ms 0.000us 0.00% 3.774ms 3.774ms 1
|
| 4025 |
+
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.728ms 100.28% 3.728ms 3.728ms 1
|
| 4026 |
+
aten::scaled_dot_product_attention 0.47% 25.660us 3.51% 191.364us 63.788us 0.000us 0.00% 2.953ms 984.270us 3
|
| 4027 |
+
aten::_scaled_dot_product_flash_attention 0.35% 18.860us 3.04% 165.704us 55.235us 0.000us 0.00% 2.953ms 984.270us 3
|
| 4028 |
+
aten::_flash_attention_forward 0.82% 44.462us 2.27% 123.662us 41.221us 2.953ms 79.43% 2.953ms 984.270us 3
|
| 4029 |
+
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.953ms 79.43% 2.953ms 984.270us 3
|
| 4030 |
+
aten::contiguous 0.19% 10.628us 35.19% 1.920ms 159.985us 0.000us 0.00% 820.970us 68.414us 12
|
| 4031 |
+
aten::clone 0.57% 30.960us 35.00% 1.909ms 159.100us 0.000us 0.00% 820.970us 68.414us 12
|
| 4032 |
+
aten::copy_ 1.50% 81.693us 33.25% 1.814ms 151.145us 764.809us 20.57% 820.970us 68.414us 12
|
| 4033 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 764.809us 20.57% 764.809us 63.734us 12
|
| 4034 |
+
Activity Buffer Request 30.19% 1.647ms 30.19% 1.647ms 1.647ms 56.161us 1.51% 56.161us 56.161us 1
|
| 4035 |
+
aten::transpose 0.93% 50.867us 1.30% 70.984us 2.958us 0.000us 0.00% 0.000us 0.000us 24
|
| 4036 |
+
aten::as_strided 0.37% 20.117us 0.37% 20.117us 0.838us 0.000us 0.00% 0.000us 0.000us 24
|
| 4037 |
+
aten::empty_like 0.39% 21.390us 1.52% 82.920us 5.528us 0.000us 0.00% 0.000us 0.000us 15
|
| 4038 |
+
aten::empty 1.43% 78.110us 1.43% 78.110us 3.255us 0.000us 0.00% 0.000us 0.000us 24
|
| 4039 |
+
cudaLaunchKernel 2.02% 110.102us 2.02% 110.102us 7.340us 0.000us 0.00% 0.000us 0.000us 15
|
| 4040 |
+
aten::empty_strided 0.25% 13.480us 0.25% 13.480us 4.493us 0.000us 0.00% 0.000us 0.000us 3
|
| 4041 |
+
cudaDeviceGetAttribute 0.03% 1.800us 0.03% 1.800us 0.300us 0.000us 0.00% 0.000us 0.000us 6
|
| 4042 |
+
cudaFuncSetAttribute 0.07% 4.010us 0.07% 4.010us 1.337us 0.000us 0.00% 0.000us 0.000us 3
|
| 4043 |
+
cudaDeviceSynchronize 55.71% 3.039ms 55.71% 3.039ms 3.039ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4044 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4045 |
+
Self CPU time total: 5.455ms
|
| 4046 |
+
Self CUDA time total: 3.718ms
|
| 4047 |
|
| 4048 |
|
| 4049 |
|
|
|
|
| 4053 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4054 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4055 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4056 |
+
torch_flash_ma 4.83% 269.985us 44.74% 2.500ms 2.500ms 0.000us 0.00% 3.834ms 3.834ms 1
|
| 4057 |
+
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.786ms 100.29% 3.786ms 3.786ms 1
|
| 4058 |
+
aten::scaled_dot_product_attention 0.43% 24.011us 3.55% 198.294us 66.098us 0.000us 0.00% 2.997ms 999.122us 3
|
| 4059 |
+
aten::_scaled_dot_product_flash_attention 0.34% 19.010us 3.12% 174.283us 58.094us 0.000us 0.00% 2.997ms 999.122us 3
|
| 4060 |
+
aten::_flash_attention_forward 0.79% 43.958us 2.36% 131.713us 43.904us 2.997ms 79.40% 2.997ms 999.122us 3
|
| 4061 |
+
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.997ms 79.40% 2.997ms 999.122us 3
|
| 4062 |
+
aten::contiguous 0.20% 11.122us 35.53% 1.985ms 165.423us 0.000us 0.00% 837.094us 69.758us 12
|
| 4063 |
+
aten::clone 0.53% 29.350us 35.33% 1.974ms 164.496us 0.000us 0.00% 837.094us 69.758us 12
|
| 4064 |
+
aten::copy_ 1.44% 80.718us 33.66% 1.880ms 156.702us 777.862us 20.60% 837.094us 69.758us 12
|
| 4065 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 777.862us 20.60% 777.862us 64.822us 12
|
| 4066 |
+
Activity Buffer Request 30.68% 1.714ms 30.68% 1.714ms 1.714ms 59.232us 1.57% 59.232us 59.232us 1
|
| 4067 |
+
aten::transpose 0.92% 51.150us 1.25% 70.010us 2.917us 0.000us 0.00% 0.000us 0.000us 24
|
| 4068 |
+
aten::as_strided 0.34% 18.860us 0.34% 18.860us 0.786us 0.000us 0.00% 0.000us 0.000us 24
|
| 4069 |
+
aten::empty_like 0.37% 20.561us 1.52% 84.672us 5.645us 0.000us 0.00% 0.000us 0.000us 15
|
| 4070 |
+
aten::empty 1.54% 85.833us 1.54% 85.833us 3.576us 0.000us 0.00% 0.000us 0.000us 24
|
| 4071 |
+
cudaLaunchKernel 1.95% 109.214us 1.95% 109.214us 7.281us 0.000us 0.00% 0.000us 0.000us 15
|
| 4072 |
+
aten::empty_strided 0.27% 15.280us 0.27% 15.280us 5.093us 0.000us 0.00% 0.000us 0.000us 3
|
| 4073 |
+
cudaDeviceGetAttribute 0.04% 2.120us 0.04% 2.120us 0.353us 0.000us 0.00% 0.000us 0.000us 6
|
| 4074 |
+
cudaFuncSetAttribute 0.08% 4.293us 0.08% 4.293us 1.431us 0.000us 0.00% 0.000us 0.000us 3
|
| 4075 |
+
cudaDeviceSynchronize 55.26% 3.087ms 55.26% 3.087ms 3.087ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4076 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4077 |
+
Self CPU time total: 5.587ms
|
| 4078 |
+
Self CUDA time total: 3.775ms
|
| 4079 |
|
| 4080 |
|
| 4081 |
|
|
|
|
| 4085 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4086 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4087 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4088 |
+
torch_flash_ma 4.54% 264.303us 45.63% 2.655ms 2.655ms 0.000us 0.00% 3.910ms 3.910ms 1
|
| 4089 |
+
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.865ms 100.29% 3.865ms 3.865ms 1
|
| 4090 |
+
aten::scaled_dot_product_attention 0.44% 25.860us 3.27% 190.173us 63.391us 0.000us 0.00% 3.076ms 1.025ms 3
|
| 4091 |
+
aten::_scaled_dot_product_flash_attention 0.31% 18.100us 2.82% 164.313us 54.771us 0.000us 0.00% 3.076ms 1.025ms 3
|
| 4092 |
+
aten::_flash_attention_forward 0.70% 40.710us 2.10% 122.383us 40.794us 3.076ms 79.82% 3.076ms 1.025ms 3
|
| 4093 |
+
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.076ms 79.82% 3.076ms 1.025ms 3
|
| 4094 |
+
aten::contiguous 0.17% 9.789us 37.00% 2.153ms 179.384us 0.000us 0.00% 833.826us 69.486us 12
|
| 4095 |
+
aten::clone 0.51% 29.519us 36.83% 2.143ms 178.569us 0.000us 0.00% 833.826us 69.486us 12
|
| 4096 |
+
aten::copy_ 1.40% 81.625us 35.17% 2.046ms 170.539us 777.953us 20.18% 833.826us 69.486us 12
|
| 4097 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 777.953us 20.18% 777.953us 64.829us 12
|
| 4098 |
+
Activity Buffer Request 28.32% 1.648ms 28.32% 1.648ms 1.648ms 55.873us 1.45% 55.873us 55.873us 1
|
| 4099 |
+
aten::transpose 0.90% 52.082us 1.23% 71.483us 2.978us 0.000us 0.00% 0.000us 0.000us 24
|
| 4100 |
+
aten::as_strided 0.33% 19.401us 0.33% 19.401us 0.808us 0.000us 0.00% 0.000us 0.000us 24
|
| 4101 |
+
aten::empty_like 0.38% 21.851us 1.50% 87.141us 5.809us 0.000us 0.00% 0.000us 0.000us 15
|
| 4102 |
+
aten::empty 1.38% 80.371us 1.38% 80.371us 3.349us 0.000us 0.00% 0.000us 0.000us 24
|
| 4103 |
+
cudaLaunchKernel 5.88% 342.407us 5.88% 342.407us 22.827us 0.000us 0.00% 0.000us 0.000us 15
|
| 4104 |
+
aten::empty_strided 0.26% 14.910us 0.26% 14.910us 4.970us 0.000us 0.00% 0.000us 0.000us 3
|
| 4105 |
+
cudaDeviceGetAttribute 0.03% 1.811us 0.03% 1.811us 0.302us 0.000us 0.00% 0.000us 0.000us 6
|
| 4106 |
+
cudaFuncSetAttribute 0.07% 4.181us 0.07% 4.181us 1.394us 0.000us 0.00% 0.000us 0.000us 3
|
| 4107 |
+
cudaDeviceSynchronize 54.37% 3.164ms 54.37% 3.164ms 3.164ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4108 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4109 |
+
Self CPU time total: 5.818ms
|
| 4110 |
+
Self CUDA time total: 3.854ms
|
| 4111 |
|
| 4112 |
|
| 4113 |
|
|
|
|
| 4117 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4118 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4119 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4120 |
+
torch_flash_ma 4.87% 306.708us 43.18% 2.718ms 2.718ms 0.000us 0.00% 4.364ms 4.364ms 1
|
| 4121 |
+
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.314ms 100.24% 4.314ms 4.314ms 1
|
| 4122 |
+
aten::scaled_dot_product_attention 0.42% 26.322us 3.04% 191.625us 63.875us 0.000us 0.00% 3.500ms 1.167ms 3
|
| 4123 |
+
aten::_scaled_dot_product_flash_attention 0.31% 19.398us 2.63% 165.303us 55.101us 0.000us 0.00% 3.500ms 1.167ms 3
|
| 4124 |
+
aten::_flash_attention_forward 0.65% 40.750us 1.93% 121.261us 40.420us 3.500ms 81.33% 3.500ms 1.167ms 3
|
| 4125 |
+
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.500ms 81.33% 3.500ms 1.167ms 3
|
| 4126 |
+
aten::contiguous 0.18% 11.020us 34.50% 2.172ms 180.965us 0.000us 0.00% 863.467us 71.956us 12
|
| 4127 |
+
aten::clone 0.46% 28.711us 34.33% 2.161ms 180.047us 0.000us 0.00% 863.467us 71.956us 12
|
| 4128 |
+
aten::copy_ 1.29% 81.309us 32.83% 2.066ms 172.192us 803.338us 18.67% 863.467us 71.956us 12
|
| 4129 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 803.338us 18.67% 803.338us 66.945us 12
|
| 4130 |
+
Activity Buffer Request 26.76% 1.684ms 26.76% 1.684ms 1.684ms 60.129us 1.40% 60.129us 60.129us 1
|
| 4131 |
+
aten::transpose 0.83% 52.430us 1.15% 72.394us 3.016us 0.000us 0.00% 0.000us 0.000us 24
|
| 4132 |
+
aten::as_strided 0.32% 19.964us 0.32% 19.964us 0.832us 0.000us 0.00% 0.000us 0.000us 24
|
| 4133 |
+
aten::empty_like 0.32% 19.960us 1.35% 84.930us 5.662us 0.000us 0.00% 0.000us 0.000us 15
|
| 4134 |
+
aten::empty 1.27% 80.061us 1.27% 80.061us 3.336us 0.000us 0.00% 0.000us 0.000us 24
|
| 4135 |
+
cudaLaunchKernel 5.16% 325.017us 5.16% 325.017us 21.668us 0.000us 0.00% 0.000us 0.000us 15
|
| 4136 |
+
aten::empty_strided 0.23% 14.460us 0.23% 14.460us 4.820us 0.000us 0.00% 0.000us 0.000us 3
|
| 4137 |
+
cudaDeviceGetAttribute 0.04% 2.690us 0.04% 2.690us 0.448us 0.000us 0.00% 0.000us 0.000us 6
|
| 4138 |
+
cudaFuncSetAttribute 0.07% 4.660us 0.07% 4.660us 1.553us 0.000us 0.00% 0.000us 0.000us 3
|
| 4139 |
+
cudaDeviceSynchronize 56.82% 3.576ms 56.82% 3.576ms 3.576ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4140 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4141 |
+
Self CPU time total: 6.294ms
|
| 4142 |
+
Self CUDA time total: 4.304ms
|
| 4143 |
|
| 4144 |
|
| 4145 |
|
|
|
|
| 4149 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4150 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4151 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4152 |
+
torch_flash_ma 3.61% 231.105us 41.57% 2.662ms 2.662ms 0.000us 0.00% 4.461ms 4.461ms 1
|
| 4153 |
+
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.411ms 100.25% 4.411ms 4.411ms 1
|
| 4154 |
+
aten::scaled_dot_product_attention 0.40% 25.770us 2.78% 178.013us 59.338us 0.000us 0.00% 3.582ms 1.194ms 3
|
| 4155 |
+
aten::_scaled_dot_product_flash_attention 0.28% 17.960us 2.38% 152.243us 50.748us 0.000us 0.00% 3.582ms 1.194ms 3
|
| 4156 |
+
aten::_flash_attention_forward 0.51% 32.421us 1.73% 110.913us 36.971us 3.582ms 81.42% 3.582ms 1.194ms 3
|
| 4157 |
+
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.582ms 81.42% 3.582ms 1.194ms 3
|
| 4158 |
+
aten::contiguous 0.14% 9.230us 34.45% 2.206ms 183.815us 0.000us 0.00% 878.374us 73.198us 12
|
| 4159 |
+
aten::clone 0.41% 26.011us 34.30% 2.197ms 183.046us 0.000us 0.00% 878.374us 73.198us 12
|
| 4160 |
+
aten::copy_ 1.29% 82.861us 32.91% 2.107ms 175.603us 817.702us 18.58% 878.374us 73.198us 12
|
| 4161 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 817.702us 18.58% 817.702us 68.142us 12
|
| 4162 |
+
Activity Buffer Request 26.91% 1.723ms 26.91% 1.723ms 1.723ms 60.672us 1.38% 60.672us 60.672us 1
|
| 4163 |
+
aten::transpose 0.81% 51.890us 1.10% 70.690us 2.945us 0.000us 0.00% 0.000us 0.000us 24
|
| 4164 |
+
aten::as_strided 0.29% 18.800us 0.29% 18.800us 0.783us 0.000us 0.00% 0.000us 0.000us 24
|
| 4165 |
+
aten::empty_like 0.29% 18.829us 1.29% 82.771us 5.518us 0.000us 0.00% 0.000us 0.000us 15
|
| 4166 |
+
aten::empty 1.23% 78.733us 1.23% 78.733us 3.281us 0.000us 0.00% 0.000us 0.000us 24
|
| 4167 |
+
cudaLaunchKernel 5.08% 325.239us 5.08% 325.239us 21.683us 0.000us 0.00% 0.000us 0.000us 15
|
| 4168 |
+
aten::empty_strided 0.23% 14.690us 0.23% 14.690us 4.897us 0.000us 0.00% 0.000us 0.000us 3
|
| 4169 |
+
cudaDeviceGetAttribute 0.03% 1.808us 0.03% 1.808us 0.301us 0.000us 0.00% 0.000us 0.000us 6
|
| 4170 |
+
cudaFuncSetAttribute 0.06% 3.871us 0.06% 3.871us 1.290us 0.000us 0.00% 0.000us 0.000us 3
|
| 4171 |
+
cudaDeviceSynchronize 58.43% 3.741ms 58.43% 3.741ms 3.741ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4172 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4173 |
+
Self CPU time total: 6.404ms
|
| 4174 |
+
Self CUDA time total: 4.400ms
|
| 4175 |
|
| 4176 |
|
| 4177 |
impl wl p50(ms) ok
|
| 4178 |
+
torch_flash_ma cuda_attn_L128_bfloat16 1.20 True
|
| 4179 |
+
torch_flash_ma cuda_attn_L256_bfloat16 1.26 True
|
| 4180 |
+
torch_flash_ma cuda_attn_L320_bfloat16 1.29 True
|
| 4181 |
+
torch_flash_ma cuda_attn_L384_bfloat16 1.32 True
|
| 4182 |
+
torch_flash_ma cuda_attn_L448_bfloat16 1.47 True
|
| 4183 |
+
torch_flash_ma cuda_attn_L512_bfloat16 1.50 True
|
| 4184 |
</pre></div>
|
| 4185 |
<div class="cell-artifacts">
|
| 4186 |
<h4>Artifacts:</h4>
|
flash_attn/impls/hf_kernels_flash_attn.html
CHANGED
|
@@ -3874,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 3874 |
<div class="system-info">
|
| 3875 |
<div class="system-info-header">Generated on:</div>
|
| 3876 |
<div class="system-info-content">
|
| 3877 |
-
Linux x86_64 | Linux-6.12.
|
| 3878 |
</div>
|
| 3879 |
</div>
|
| 3880 |
|
|
@@ -3886,9 +3886,9 @@ body[data-tool="eraser"] .main-content {
|
|
| 3886 |
<span class="collapse-indicators">
|
| 3887 |
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
|
| 3888 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3889 |
-
<span id="uv-indicator-benchmark"
|
| 3890 |
</span> |
|
| 3891 |
-
Cell: benchmark |
|
| 3892 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3893 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3894 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3943,21 +3943,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L128_bfloat16
|
|
| 3943 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3944 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3945 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3946 |
-
hf_kernels_flash_attn 3.
|
| 3947 |
-
_flash_attn_9e27194::fwd 1.
|
| 3948 |
-
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3949 |
-
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3950 |
-
Activity Buffer Request
|
| 3951 |
-
cudaDeviceGetAttribute 0.09% 4.
|
| 3952 |
-
aten::empty_like 0.
|
| 3953 |
-
aten::empty_strided 0.
|
| 3954 |
-
aten::empty 0.
|
| 3955 |
-
cudaFuncSetAttribute
|
| 3956 |
-
cudaLaunchKernel 0.
|
| 3957 |
-
cudaDeviceSynchronize
|
| 3958 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3959 |
-
Self CPU time total:
|
| 3960 |
-
Self CUDA time total: 2.
|
| 3961 |
|
| 3962 |
|
| 3963 |
|
|
@@ -3967,21 +3967,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L256_bfloat16
|
|
| 3967 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3968 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3969 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3970 |
-
hf_kernels_flash_attn 1.
|
| 3971 |
-
_flash_attn_9e27194::fwd 0.
|
| 3972 |
-
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 3973 |
-
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 3974 |
-
Activity Buffer Request
|
| 3975 |
-
cudaDeviceGetAttribute 0.07% 3.
|
| 3976 |
-
aten::empty_like 0.
|
| 3977 |
-
aten::empty_strided 0.
|
| 3978 |
-
aten::empty 0.
|
| 3979 |
-
cudaFuncSetAttribute 0.
|
| 3980 |
-
cudaLaunchKernel 0.
|
| 3981 |
-
cudaDeviceSynchronize
|
| 3982 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3983 |
-
Self CPU time total:
|
| 3984 |
-
Self CUDA time total:
|
| 3985 |
|
| 3986 |
|
| 3987 |
|
|
@@ -3991,21 +3991,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L320_bfloat16
|
|
| 3991 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3992 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3993 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3994 |
-
hf_kernels_flash_attn
|
| 3995 |
-
_flash_attn_9e27194::fwd
|
| 3996 |
-
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 3997 |
-
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 3998 |
-
Activity Buffer Request 35.
|
| 3999 |
-
cudaDeviceGetAttribute 0.07% 3.
|
| 4000 |
-
aten::empty_like 0.13% 6.
|
| 4001 |
-
aten::empty_strided 0.
|
| 4002 |
-
aten::empty 0.
|
| 4003 |
-
cudaFuncSetAttribute 0.07% 3.
|
| 4004 |
-
cudaLaunchKernel 0.
|
| 4005 |
-
cudaDeviceSynchronize 59.
|
| 4006 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4007 |
-
Self CPU time total:
|
| 4008 |
-
Self CUDA time total: 3.
|
| 4009 |
|
| 4010 |
|
| 4011 |
|
|
@@ -4015,21 +4015,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L384_bfloat16
|
|
| 4015 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4016 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4017 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4018 |
-
hf_kernels_flash_attn 2.
|
| 4019 |
-
_flash_attn_9e27194::fwd
|
| 4020 |
-
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4021 |
-
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4022 |
-
Activity Buffer Request
|
| 4023 |
-
cudaDeviceGetAttribute 0.
|
| 4024 |
-
aten::empty_like 0.
|
| 4025 |
-
aten::empty_strided 0.
|
| 4026 |
-
aten::empty 0.
|
| 4027 |
-
cudaFuncSetAttribute 0.
|
| 4028 |
-
cudaLaunchKernel
|
| 4029 |
-
cudaDeviceSynchronize
|
| 4030 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4031 |
-
Self CPU time total:
|
| 4032 |
-
Self CUDA time total: 3.
|
| 4033 |
|
| 4034 |
|
| 4035 |
|
|
@@ -4039,21 +4039,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L448_bfloat16
|
|
| 4039 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4040 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4041 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4042 |
-
hf_kernels_flash_attn
|
| 4043 |
-
_flash_attn_9e27194::fwd 0.
|
| 4044 |
-
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4045 |
-
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4046 |
-
Activity Buffer Request
|
| 4047 |
-
cudaDeviceGetAttribute 0.
|
| 4048 |
-
aten::empty_like 0.
|
| 4049 |
-
aten::empty_strided 0.
|
| 4050 |
-
aten::empty 0.
|
| 4051 |
-
cudaFuncSetAttribute 0.
|
| 4052 |
-
cudaLaunchKernel
|
| 4053 |
-
cudaDeviceSynchronize
|
| 4054 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4055 |
-
Self CPU time total: 5.
|
| 4056 |
-
Self CUDA time total: 3.
|
| 4057 |
|
| 4058 |
|
| 4059 |
|
|
@@ -4063,40 +4063,37 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L512_bfloat16
|
|
| 4063 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4064 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4065 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4066 |
-
hf_kernels_flash_attn
|
| 4067 |
-
_flash_attn_9e27194::fwd
|
| 4068 |
-
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4069 |
-
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4070 |
-
Activity Buffer Request
|
| 4071 |
-
cudaDeviceGetAttribute 0.
|
| 4072 |
-
aten::empty_like 0.
|
| 4073 |
-
aten::empty_strided 0.
|
| 4074 |
-
aten::empty 0.
|
| 4075 |
-
cudaFuncSetAttribute 0.
|
| 4076 |
-
cudaLaunchKernel
|
| 4077 |
-
cudaDeviceSynchronize
|
| 4078 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4079 |
-
Self CPU time total:
|
| 4080 |
-
Self CUDA time total: 3.
|
| 4081 |
|
| 4082 |
|
| 4083 |
impl wl p50(ms) ok
|
| 4084 |
-
hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.
|
| 4085 |
-
hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.
|
| 4086 |
-
hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.
|
| 4087 |
-
hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.
|
| 4088 |
-
hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.
|
| 4089 |
-
hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.
|
| 4090 |
</pre></div>
|
| 4091 |
-
<div class="
|
| 4092 |
-
|
| 4093 |
-
|
| 4094 |
-
|
|
|
|
| 4095 |
</div>
|
| 4096 |
-
</div>
|
| 4097 |
-
<div class="cell-stderr">Fetching 20 files: 0%| | 0/20 [00:00<?, ?it/s]
|
| 4098 |
-
Fetching 20 files: 10%|█ | 2/20 [00:01<00:17, 1.01it/s]
|
| 4099 |
-
Fetching 20 files: 100%|██████████| 20/20 [00:01<00:00, 10.06it/s]</div>
|
| 4100 |
<div class="cell-artifacts">
|
| 4101 |
<h4>Artifacts:</h4>
|
| 4102 |
<a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
|
|
|
|
| 3874 |
<div class="system-info">
|
| 3875 |
<div class="system-info-header">Generated on:</div>
|
| 3876 |
<div class="system-info-content">
|
| 3877 |
+
Linux x86_64 | Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35
|
| 3878 |
</div>
|
| 3879 |
</div>
|
| 3880 |
|
|
|
|
| 3886 |
<span class="collapse-indicators">
|
| 3887 |
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
|
| 3888 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3889 |
+
<span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3890 |
</span> |
|
| 3891 |
+
Cell: benchmark | 6.12s
|
| 3892 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3893 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3894 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3943 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3944 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3945 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3946 |
+
hf_kernels_flash_attn 3.19% 147.591us 44.59% 2.062ms 2.062ms 0.000us 0.00% 3.719ms 3.719ms 1
|
| 3947 |
+
_flash_attn_9e27194::fwd 1.32% 60.849us 41.40% 1.914ms 638.151us 2.771ms 100.00% 3.719ms 1.240ms 3
|
| 3948 |
+
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.773ms 100.06% 2.773ms 2.773ms 1
|
| 3949 |
+
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.771ms 100.00% 2.771ms 923.713us 3
|
| 3950 |
+
Activity Buffer Request 37.16% 1.718ms 37.16% 1.718ms 1.718ms 947.777us 34.20% 947.777us 947.777us 1
|
| 3951 |
+
cudaDeviceGetAttribute 0.09% 4.211us 0.09% 4.211us 0.281us 0.000us 0.00% 0.000us 0.000us 15
|
| 3952 |
+
aten::empty_like 0.37% 16.891us 1.10% 50.702us 16.901us 0.000us 0.00% 0.000us 0.000us 3
|
| 3953 |
+
aten::empty_strided 0.73% 33.811us 0.73% 33.811us 11.270us 0.000us 0.00% 0.000us 0.000us 3
|
| 3954 |
+
aten::empty 0.54% 24.922us 0.54% 24.922us 2.769us 0.000us 0.00% 0.000us 0.000us 9
|
| 3955 |
+
cudaFuncSetAttribute 0.27% 12.349us 0.27% 12.349us 4.116us 0.000us 0.00% 0.000us 0.000us 3
|
| 3956 |
+
cudaLaunchKernel 0.93% 42.971us 0.93% 42.971us 14.324us 0.000us 0.00% 0.000us 0.000us 3
|
| 3957 |
+
cudaDeviceSynchronize 55.41% 2.563ms 55.41% 2.563ms 2.563ms 0.000us 0.00% 0.000us 0.000us 1
|
| 3958 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3959 |
+
Self CPU time total: 4.625ms
|
| 3960 |
+
Self CUDA time total: 2.771ms
|
| 3961 |
|
| 3962 |
|
| 3963 |
|
|
|
|
| 3967 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3968 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3969 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3970 |
+
hf_kernels_flash_attn 1.95% 91.420us 40.89% 1.916ms 1.916ms 0.000us 0.00% 3.901ms 3.901ms 1
|
| 3971 |
+
_flash_attn_9e27194::fwd 0.98% 45.792us 38.94% 1.825ms 608.181us 2.914ms 100.00% 3.901ms 1.300ms 3
|
| 3972 |
+
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.916ms 100.05% 2.916ms 2.916ms 1
|
| 3973 |
+
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.914ms 100.00% 2.914ms 971.481us 3
|
| 3974 |
+
Activity Buffer Request 36.29% 1.700ms 36.29% 1.700ms 1.700ms 986.884us 33.86% 986.884us 986.884us 1
|
| 3975 |
+
cudaDeviceGetAttribute 0.07% 3.500us 0.07% 3.500us 0.233us 0.000us 0.00% 0.000us 0.000us 15
|
| 3976 |
+
aten::empty_like 0.15% 6.960us 0.52% 24.320us 8.107us 0.000us 0.00% 0.000us 0.000us 3
|
| 3977 |
+
aten::empty_strided 0.37% 17.360us 0.37% 17.360us 5.787us 0.000us 0.00% 0.000us 0.000us 3
|
| 3978 |
+
aten::empty 0.45% 21.021us 0.45% 21.021us 2.336us 0.000us 0.00% 0.000us 0.000us 9
|
| 3979 |
+
cudaFuncSetAttribute 0.08% 3.519us 0.08% 3.519us 1.173us 0.000us 0.00% 0.000us 0.000us 3
|
| 3980 |
+
cudaLaunchKernel 0.55% 25.931us 0.55% 25.931us 8.644us 0.000us 0.00% 0.000us 0.000us 3
|
| 3981 |
+
cudaDeviceSynchronize 59.11% 2.770ms 59.11% 2.770ms 2.770ms 0.000us 0.00% 0.000us 0.000us 1
|
| 3982 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3983 |
+
Self CPU time total: 4.686ms
|
| 3984 |
+
Self CUDA time total: 2.914ms
|
| 3985 |
|
| 3986 |
|
| 3987 |
|
|
|
|
| 3991 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3992 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3993 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3994 |
+
hf_kernels_flash_attn 2.13% 103.462us 40.42% 1.967ms 1.967ms 0.000us 0.00% 4.069ms 4.069ms 1
|
| 3995 |
+
_flash_attn_9e27194::fwd 0.94% 45.522us 38.30% 1.863ms 621.134us 3.040ms 100.00% 4.069ms 1.356ms 3
|
| 3996 |
+
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.041ms 100.05% 3.041ms 3.041ms 1
|
| 3997 |
+
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.040ms 100.00% 3.040ms 1.013ms 3
|
| 3998 |
+
Activity Buffer Request 35.70% 1.737ms 35.70% 1.737ms 1.737ms 1.029ms 33.84% 1.029ms 1.029ms 1
|
| 3999 |
+
cudaDeviceGetAttribute 0.07% 3.488us 0.07% 3.488us 0.233us 0.000us 0.00% 0.000us 0.000us 15
|
| 4000 |
+
aten::empty_like 0.13% 6.550us 0.49% 24.010us 8.003us 0.000us 0.00% 0.000us 0.000us 3
|
| 4001 |
+
aten::empty_strided 0.36% 17.460us 0.36% 17.460us 5.820us 0.000us 0.00% 0.000us 0.000us 3
|
| 4002 |
+
aten::empty 0.47% 22.651us 0.47% 22.651us 2.517us 0.000us 0.00% 0.000us 0.000us 9
|
| 4003 |
+
cudaFuncSetAttribute 0.07% 3.621us 0.07% 3.621us 1.207us 0.000us 0.00% 0.000us 0.000us 3
|
| 4004 |
+
cudaLaunchKernel 0.55% 26.960us 0.55% 26.960us 8.987us 0.000us 0.00% 0.000us 0.000us 3
|
| 4005 |
+
cudaDeviceSynchronize 59.58% 2.899ms 59.58% 2.899ms 2.899ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4006 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4007 |
+
Self CPU time total: 4.866ms
|
| 4008 |
+
Self CUDA time total: 3.040ms
|
| 4009 |
|
| 4010 |
|
| 4011 |
|
|
|
|
| 4015 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4016 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4017 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4018 |
+
hf_kernels_flash_attn 2.03% 100.371us 41.00% 2.032ms 2.032ms 0.000us 0.00% 4.098ms 4.098ms 1
|
| 4019 |
+
_flash_attn_9e27194::fwd 0.92% 45.401us 38.98% 1.931ms 643.821us 3.066ms 100.00% 4.098ms 1.366ms 3
|
| 4020 |
+
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.068ms 100.05% 3.068ms 3.068ms 1
|
| 4021 |
+
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.066ms 100.00% 3.066ms 1.022ms 3
|
| 4022 |
+
Activity Buffer Request 32.94% 1.632ms 32.94% 1.632ms 1.632ms 1.032ms 33.68% 1.032ms 1.032ms 1
|
| 4023 |
+
cudaDeviceGetAttribute 0.07% 3.502us 0.07% 3.502us 0.233us 0.000us 0.00% 0.000us 0.000us 15
|
| 4024 |
+
aten::empty_like 0.14% 6.780us 0.47% 23.270us 7.757us 0.000us 0.00% 0.000us 0.000us 3
|
| 4025 |
+
aten::empty_strided 0.33% 16.490us 0.33% 16.490us 5.497us 0.000us 0.00% 0.000us 0.000us 3
|
| 4026 |
+
aten::empty 0.45% 22.299us 0.45% 22.299us 2.478us 0.000us 0.00% 0.000us 0.000us 9
|
| 4027 |
+
cudaFuncSetAttribute 0.09% 4.220us 0.09% 4.220us 1.407us 0.000us 0.00% 0.000us 0.000us 3
|
| 4028 |
+
cudaLaunchKernel 4.04% 200.304us 4.04% 200.304us 66.768us 0.000us 0.00% 0.000us 0.000us 3
|
| 4029 |
+
cudaDeviceSynchronize 59.00% 2.924ms 59.00% 2.924ms 2.924ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4030 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4031 |
+
Self CPU time total: 4.956ms
|
| 4032 |
+
Self CUDA time total: 3.066ms
|
| 4033 |
|
| 4034 |
|
| 4035 |
|
|
|
|
| 4039 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4040 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4041 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4042 |
+
hf_kernels_flash_attn 2.01% 110.531us 38.27% 2.104ms 2.104ms 0.000us 0.00% 4.721ms 4.721ms 1
|
| 4043 |
+
_flash_attn_9e27194::fwd 0.85% 46.845us 36.26% 1.993ms 664.435us 3.536ms 100.00% 4.721ms 1.574ms 3
|
| 4044 |
+
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.537ms 100.04% 3.537ms 3.537ms 1
|
| 4045 |
+
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.536ms 100.00% 3.536ms 1.179ms 3
|
| 4046 |
+
Activity Buffer Request 31.52% 1.733ms 31.52% 1.733ms 1.733ms 1.186ms 33.53% 1.186ms 1.186ms 1
|
| 4047 |
+
cudaDeviceGetAttribute 0.07% 3.850us 0.07% 3.850us 0.257us 0.000us 0.00% 0.000us 0.000us 15
|
| 4048 |
+
aten::empty_like 0.13% 7.081us 0.42% 23.120us 7.707us 0.000us 0.00% 0.000us 0.000us 3
|
| 4049 |
+
aten::empty_strided 0.29% 16.039us 0.29% 16.039us 5.346us 0.000us 0.00% 0.000us 0.000us 3
|
| 4050 |
+
aten::empty 0.38% 21.099us 0.38% 21.099us 2.344us 0.000us 0.00% 0.000us 0.000us 9
|
| 4051 |
+
cudaFuncSetAttribute 0.07% 3.738us 0.07% 3.738us 1.246us 0.000us 0.00% 0.000us 0.000us 3
|
| 4052 |
+
cudaLaunchKernel 2.95% 161.933us 2.95% 161.933us 53.978us 0.000us 0.00% 0.000us 0.000us 3
|
| 4053 |
+
cudaDeviceSynchronize 61.73% 3.393ms 61.73% 3.393ms 3.393ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4054 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4055 |
+
Self CPU time total: 5.497ms
|
| 4056 |
+
Self CUDA time total: 3.536ms
|
| 4057 |
|
| 4058 |
|
| 4059 |
|
|
|
|
| 4063 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4064 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4065 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4066 |
+
hf_kernels_flash_attn 1.92% 105.962us 36.83% 2.036ms 2.036ms 0.000us 0.00% 4.864ms 4.864ms 1
|
| 4067 |
+
_flash_attn_9e27194::fwd 0.86% 47.350us 34.91% 1.930ms 643.481us 3.642ms 100.00% 4.864ms 1.621ms 3
|
| 4068 |
+
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.643ms 100.04% 3.643ms 3.643ms 1
|
| 4069 |
+
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.642ms 100.00% 3.642ms 1.214ms 3
|
| 4070 |
+
Activity Buffer Request 30.16% 1.668ms 30.16% 1.668ms 1.668ms 1.222ms 33.55% 1.222ms 1.222ms 1
|
| 4071 |
+
cudaDeviceGetAttribute 0.06% 3.551us 0.06% 3.551us 0.237us 0.000us 0.00% 0.000us 0.000us 15
|
| 4072 |
+
aten::empty_like 0.12% 6.900us 0.42% 23.180us 7.727us 0.000us 0.00% 0.000us 0.000us 3
|
| 4073 |
+
aten::empty_strided 0.29% 16.280us 0.29% 16.280us 5.427us 0.000us 0.00% 0.000us 0.000us 3
|
| 4074 |
+
aten::empty 0.40% 21.939us 0.40% 21.939us 2.438us 0.000us 0.00% 0.000us 0.000us 9
|
| 4075 |
+
cudaFuncSetAttribute 0.07% 3.861us 0.07% 3.861us 1.287us 0.000us 0.00% 0.000us 0.000us 3
|
| 4076 |
+
cudaLaunchKernel 2.95% 163.043us 2.95% 163.043us 54.348us 0.000us 0.00% 0.000us 0.000us 3
|
| 4077 |
+
cudaDeviceSynchronize 63.17% 3.493ms 63.17% 3.493ms 3.493ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4078 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4079 |
+
Self CPU time total: 5.529ms
|
| 4080 |
+
Self CUDA time total: 3.642ms
|
| 4081 |
|
| 4082 |
|
| 4083 |
impl wl p50(ms) ok
|
| 4084 |
+
hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.96 True
|
| 4085 |
+
hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.02 True
|
| 4086 |
+
hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.05 True
|
| 4087 |
+
hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.06 True
|
| 4088 |
+
hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.21 True
|
| 4089 |
+
hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.22 True
|
| 4090 |
</pre></div>
|
| 4091 |
+
<div class="cell-stderr">
|
| 4092 |
+
Fetching 20 files: 0%| | 0/20 [00:00<?, ?it/s]
|
| 4093 |
+
Fetching 20 files: 5%|▌ | 1/20 [00:00<00:03, 6.04it/s]
|
| 4094 |
+
Fetching 20 files: 10%|█ | 2/20 [00:01<00:20, 1.14s/it]
|
| 4095 |
+
Fetching 20 files: 100%|██████████| 20/20 [00:01<00:00, 10.05it/s]
|
| 4096 |
</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4097 |
<div class="cell-artifacts">
|
| 4098 |
<h4>Artifacts:</h4>
|
| 4099 |
<a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
|
flash_attn/impls/hf_kernels_flash_attn3.html
CHANGED
|
@@ -3874,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 3874 |
<div class="system-info">
|
| 3875 |
<div class="system-info-header">Generated on:</div>
|
| 3876 |
<div class="system-info-content">
|
| 3877 |
-
Linux x86_64 | Linux-6.12.
|
| 3878 |
</div>
|
| 3879 |
</div>
|
| 3880 |
|
|
@@ -3886,9 +3886,9 @@ body[data-tool="eraser"] .main-content {
|
|
| 3886 |
<span class="collapse-indicators">
|
| 3887 |
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
|
| 3888 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3889 |
-
<span id="uv-indicator-benchmark" style="cursor:
|
| 3890 |
</span> |
|
| 3891 |
-
Cell: benchmark |
|
| 3892 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3893 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3894 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3942,19 +3942,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L128_bfloat16
|
|
| 3942 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3943 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3944 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3945 |
-
hf_kernels_flash_attn3 3.
|
| 3946 |
-
FlashAttnFunc 2.
|
| 3947 |
-
|
| 3948 |
-
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3949 |
-
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3950 |
-
Activity Buffer Request 38
|
| 3951 |
-
aten::empty 0.
|
| 3952 |
-
cudaFuncSetAttribute 0.
|
| 3953 |
-
cudaLaunchKernel 1.
|
| 3954 |
-
cudaDeviceSynchronize
|
| 3955 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3956 |
-
Self CPU time total: 4.
|
| 3957 |
-
Self CUDA time total: 2.
|
| 3958 |
|
| 3959 |
|
| 3960 |
|
|
@@ -3964,19 +3964,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L256_bfloat16
|
|
| 3964 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3965 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3966 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3967 |
-
hf_kernels_flash_attn3 2.
|
| 3968 |
-
FlashAttnFunc
|
| 3969 |
-
|
| 3970 |
-
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3971 |
-
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3972 |
-
Activity Buffer Request
|
| 3973 |
-
aten::empty 0.
|
| 3974 |
-
cudaFuncSetAttribute 0.
|
| 3975 |
-
cudaLaunchKernel 0.
|
| 3976 |
-
cudaDeviceSynchronize 54.
|
| 3977 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3978 |
-
Self CPU time total: 4.
|
| 3979 |
-
Self CUDA time total: 2.
|
| 3980 |
|
| 3981 |
|
| 3982 |
|
|
@@ -3986,19 +3986,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L320_bfloat16
|
|
| 3986 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3987 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3988 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3989 |
-
hf_kernels_flash_attn3 2.
|
| 3990 |
-
FlashAttnFunc 1.
|
| 3991 |
-
|
| 3992 |
-
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3993 |
-
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3994 |
-
Activity Buffer Request 36.
|
| 3995 |
-
aten::empty 0.
|
| 3996 |
-
cudaFuncSetAttribute 0.11%
|
| 3997 |
-
cudaLaunchKernel 0.
|
| 3998 |
-
cudaDeviceSynchronize
|
| 3999 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4000 |
-
Self CPU time total: 4.
|
| 4001 |
-
Self CUDA time total: 2.
|
| 4002 |
|
| 4003 |
|
| 4004 |
|
|
@@ -4008,19 +4008,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L384_bfloat16
|
|
| 4008 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4009 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4010 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4011 |
-
hf_kernels_flash_attn3 2.
|
| 4012 |
-
FlashAttnFunc 1.
|
| 4013 |
-
|
| 4014 |
-
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 4015 |
-
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 4016 |
-
Activity Buffer Request
|
| 4017 |
-
aten::empty 0.
|
| 4018 |
-
cudaFuncSetAttribute 0.
|
| 4019 |
-
cudaLaunchKernel 4.
|
| 4020 |
-
cudaDeviceSynchronize
|
| 4021 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4022 |
-
Self CPU time total:
|
| 4023 |
-
Self CUDA time total: 2.
|
| 4024 |
|
| 4025 |
|
| 4026 |
|
|
@@ -4030,19 +4030,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L448_bfloat16
|
|
| 4030 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4031 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4032 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4033 |
-
hf_kernels_flash_attn3 2.
|
| 4034 |
-
FlashAttnFunc 1.
|
| 4035 |
-
|
| 4036 |
-
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4037 |
-
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4038 |
-
Activity Buffer Request
|
| 4039 |
-
aten::empty 0.49% 26.
|
| 4040 |
-
cudaFuncSetAttribute 0.09%
|
| 4041 |
-
cudaLaunchKernel
|
| 4042 |
-
cudaDeviceSynchronize
|
| 4043 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4044 |
-
Self CPU time total: 5.
|
| 4045 |
-
Self CUDA time total: 3.
|
| 4046 |
|
| 4047 |
|
| 4048 |
|
|
@@ -4052,34 +4052,38 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L512_bfloat16
|
|
| 4052 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4053 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4054 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4055 |
-
hf_kernels_flash_attn3 2.
|
| 4056 |
-
FlashAttnFunc 1.
|
| 4057 |
-
|
| 4058 |
-
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4059 |
-
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4060 |
-
Activity Buffer Request
|
| 4061 |
-
aten::empty 0.
|
| 4062 |
-
cudaFuncSetAttribute 0.09%
|
| 4063 |
-
cudaLaunchKernel
|
| 4064 |
-
cudaDeviceSynchronize
|
| 4065 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4066 |
-
Self CPU time total: 5.
|
| 4067 |
-
Self CUDA time total: 3.
|
| 4068 |
|
| 4069 |
|
| 4070 |
impl wl p50(ms) ok
|
| 4071 |
-
hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.
|
| 4072 |
-
hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.
|
| 4073 |
-
hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.
|
| 4074 |
-
hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.
|
| 4075 |
-
hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.
|
| 4076 |
-
hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.
|
| 4077 |
</pre></div>
|
| 4078 |
-
<div class="
|
| 4079 |
-
|
| 4080 |
-
|
| 4081 |
-
|
| 4082 |
</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4083 |
<div class="cell-artifacts">
|
| 4084 |
<h4>Artifacts:</h4>
|
| 4085 |
<a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
|
|
|
|
| 3874 |
<div class="system-info">
|
| 3875 |
<div class="system-info-header">Generated on:</div>
|
| 3876 |
<div class="system-info-content">
|
| 3877 |
+
Linux x86_64 | Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35
|
| 3878 |
</div>
|
| 3879 |
</div>
|
| 3880 |
|
|
|
|
| 3886 |
<span class="collapse-indicators">
|
| 3887 |
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
|
| 3888 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3889 |
+
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3890 |
</span> |
|
| 3891 |
+
Cell: benchmark | 6.42s
|
| 3892 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3893 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3894 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3942 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3943 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3944 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3945 |
+
hf_kernels_flash_attn3 3.60% 164.063us 47.53% 2.169ms 2.169ms 0.000us 0.00% 3.577ms 3.577ms 1
|
| 3946 |
+
FlashAttnFunc 2.65% 121.151us 43.94% 2.005ms 668.341us 0.000us 0.00% 3.577ms 1.192ms 3
|
| 3947 |
+
_flash_attn3_1d39a44::fwd 1.62% 73.763us 41.28% 1.884ms 627.958us 2.686ms 100.00% 3.577ms 1.192ms 3
|
| 3948 |
+
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.688ms 100.07% 2.688ms 2.688ms 1
|
| 3949 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.686ms 100.00% 2.686ms 895.374us 3
|
| 3950 |
+
Activity Buffer Request 37.38% 1.706ms 37.38% 1.706ms 1.706ms 891.299us 33.18% 891.299us 891.299us 1
|
| 3951 |
+
aten::empty 0.94% 42.930us 0.94% 42.930us 7.155us 0.000us 0.00% 0.000us 0.000us 6
|
| 3952 |
+
cudaFuncSetAttribute 0.33% 14.999us 0.33% 14.999us 5.000us 0.000us 0.00% 0.000us 0.000us 3
|
| 3953 |
+
cudaLaunchKernel 1.02% 46.432us 1.02% 46.432us 15.477us 0.000us 0.00% 0.000us 0.000us 3
|
| 3954 |
+
cudaDeviceSynchronize 52.47% 2.394ms 52.47% 2.394ms 2.394ms 0.000us 0.00% 0.000us 0.000us 1
|
| 3955 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3956 |
+
Self CPU time total: 4.563ms
|
| 3957 |
+
Self CUDA time total: 2.686ms
|
| 3958 |
|
| 3959 |
|
| 3960 |
|
|
|
|
| 3964 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3965 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3966 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3967 |
+
hf_kernels_flash_attn3 2.68% 123.103us 45.27% 2.082ms 2.082ms 0.000us 0.00% 3.670ms 3.670ms 1
|
| 3968 |
+
FlashAttnFunc 2.03% 93.300us 42.60% 1.959ms 653.024us 0.000us 0.00% 3.670ms 1.223ms 3
|
| 3969 |
+
_flash_attn3_1d39a44::fwd 1.05% 48.412us 40.57% 1.866ms 621.924us 2.738ms 100.00% 3.670ms 1.223ms 3
|
| 3970 |
+
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.739ms 100.06% 2.739ms 2.739ms 1
|
| 3971 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.738ms 100.00% 2.738ms 912.629us 3
|
| 3972 |
+
Activity Buffer Request 38.14% 1.754ms 38.14% 1.754ms 1.754ms 932.416us 34.06% 932.416us 932.416us 1
|
| 3973 |
+
aten::empty 0.59% 27.041us 0.59% 27.041us 4.507us 0.000us 0.00% 0.000us 0.000us 6
|
| 3974 |
+
cudaFuncSetAttribute 0.14% 6.480us 0.14% 6.480us 2.160us 0.000us 0.00% 0.000us 0.000us 3
|
| 3975 |
+
cudaLaunchKernel 0.64% 29.621us 0.64% 29.621us 9.874us 0.000us 0.00% 0.000us 0.000us 3
|
| 3976 |
+
cudaDeviceSynchronize 54.73% 2.517ms 54.73% 2.517ms 2.517ms 0.000us 0.00% 0.000us 0.000us 1
|
| 3977 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3978 |
+
Self CPU time total: 4.599ms
|
| 3979 |
+
Self CUDA time total: 2.738ms
|
| 3980 |
|
| 3981 |
|
| 3982 |
|
|
|
|
| 3986 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3987 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3988 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3989 |
+
hf_kernels_flash_attn3 2.66% 126.472us 43.74% 2.079ms 2.079ms 0.000us 0.00% 3.863ms 3.863ms 1
|
| 3990 |
+
FlashAttnFunc 1.87% 89.050us 41.07% 1.952ms 650.694us 0.000us 0.00% 3.863ms 1.288ms 3
|
| 3991 |
+
_flash_attn3_1d39a44::fwd 1.00% 47.600us 39.20% 1.863ms 621.011us 2.883ms 100.00% 3.863ms 1.288ms 3
|
| 3992 |
+
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.885ms 100.05% 2.885ms 2.885ms 1
|
| 3993 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.883ms 100.00% 2.883ms 961.034us 3
|
| 3994 |
+
Activity Buffer Request 36.94% 1.756ms 36.94% 1.756ms 1.756ms 979.903us 33.99% 979.903us 979.903us 1
|
| 3995 |
+
aten::empty 0.53% 25.081us 0.53% 25.081us 4.180us 0.000us 0.00% 0.000us 0.000us 6
|
| 3996 |
+
cudaFuncSetAttribute 0.11% 5.050us 0.11% 5.050us 1.683us 0.000us 0.00% 0.000us 0.000us 3
|
| 3997 |
+
cudaLaunchKernel 0.62% 29.612us 0.62% 29.612us 9.871us 0.000us 0.00% 0.000us 0.000us 3
|
| 3998 |
+
cudaDeviceSynchronize 56.26% 2.674ms 56.26% 2.674ms 2.674ms 0.000us 0.00% 0.000us 0.000us 1
|
| 3999 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4000 |
+
Self CPU time total: 4.753ms
|
| 4001 |
+
Self CUDA time total: 2.883ms
|
| 4002 |
|
| 4003 |
|
| 4004 |
|
|
|
|
| 4008 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4009 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4010 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4011 |
+
hf_kernels_flash_attn3 2.48% 119.623us 44.91% 2.170ms 2.170ms 0.000us 0.00% 3.846ms 3.846ms 1
|
| 4012 |
+
FlashAttnFunc 1.87% 90.201us 42.43% 2.050ms 683.325us 0.000us 0.00% 3.846ms 1.282ms 3
|
| 4013 |
+
_flash_attn3_1d39a44::fwd 0.98% 47.571us 40.56% 1.960ms 653.258us 2.874ms 100.00% 3.846ms 1.282ms 3
|
| 4014 |
+
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.876ms 100.05% 2.876ms 2.876ms 1
|
| 4015 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.874ms 100.00% 2.874ms 957.983us 3
|
| 4016 |
+
Activity Buffer Request 34.13% 1.649ms 34.13% 1.649ms 1.649ms 972.223us 33.83% 972.223us 972.223us 1
|
| 4017 |
+
aten::empty 0.55% 26.410us 0.55% 26.410us 4.402us 0.000us 0.00% 0.000us 0.000us 6
|
| 4018 |
+
cudaFuncSetAttribute 0.11% 5.420us 0.11% 5.420us 1.807us 0.000us 0.00% 0.000us 0.000us 3
|
| 4019 |
+
cudaLaunchKernel 4.79% 231.213us 4.79% 231.213us 77.071us 0.000us 0.00% 0.000us 0.000us 3
|
| 4020 |
+
cudaDeviceSynchronize 55.09% 2.662ms 55.09% 2.662ms 2.662ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4021 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4022 |
+
Self CPU time total: 4.831ms
|
| 4023 |
+
Self CUDA time total: 2.874ms
|
| 4024 |
|
| 4025 |
|
| 4026 |
|
|
|
|
| 4030 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4031 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4032 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4033 |
+
hf_kernels_flash_attn3 2.24% 122.153us 41.67% 2.277ms 2.277ms 0.000us 0.00% 4.541ms 4.541ms 1
|
| 4034 |
+
FlashAttnFunc 1.69% 92.610us 39.43% 2.155ms 718.395us 0.000us 0.00% 4.541ms 1.514ms 3
|
| 4035 |
+
_flash_attn3_1d39a44::fwd 0.86% 47.089us 37.74% 2.063ms 687.525us 3.403ms 100.00% 4.541ms 1.514ms 3
|
| 4036 |
+
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.404ms 100.05% 3.404ms 3.404ms 1
|
| 4037 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.403ms 100.00% 3.403ms 1.134ms 3
|
| 4038 |
+
Activity Buffer Request 32.14% 1.757ms 32.14% 1.757ms 1.757ms 1.138ms 33.45% 1.138ms 1.138ms 1
|
| 4039 |
+
aten::empty 0.49% 26.951us 0.49% 26.951us 4.492us 0.000us 0.00% 0.000us 0.000us 6
|
| 4040 |
+
cudaFuncSetAttribute 0.09% 4.812us 0.09% 4.812us 1.604us 0.000us 0.00% 0.000us 0.000us 3
|
| 4041 |
+
cudaLaunchKernel 4.15% 227.044us 4.15% 227.044us 75.681us 0.000us 0.00% 0.000us 0.000us 3
|
| 4042 |
+
cudaDeviceSynchronize 58.33% 3.188ms 58.33% 3.188ms 3.188ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4043 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4044 |
+
Self CPU time total: 5.465ms
|
| 4045 |
+
Self CUDA time total: 3.403ms
|
| 4046 |
|
| 4047 |
|
| 4048 |
|
|
|
|
| 4052 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4053 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4054 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4055 |
+
hf_kernels_flash_attn3 2.06% 111.143us 40.98% 2.214ms 2.214ms 0.000us 0.00% 4.541ms 4.541ms 1
|
| 4056 |
+
FlashAttnFunc 1.64% 88.581us 38.92% 2.103ms 700.975us 0.000us 0.00% 4.541ms 1.514ms 3
|
| 4057 |
+
_flash_attn3_1d39a44::fwd 0.89% 48.319us 37.28% 2.014ms 671.448us 3.401ms 100.00% 4.541ms 1.514ms 3
|
| 4058 |
+
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.402ms 100.04% 3.402ms 3.402ms 1
|
| 4059 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.401ms 100.00% 3.401ms 1.134ms 3
|
| 4060 |
+
Activity Buffer Request 31.65% 1.710ms 31.65% 1.710ms 1.710ms 1.140ms 33.52% 1.140ms 1.140ms 1
|
| 4061 |
+
aten::empty 0.48% 25.892us 0.48% 25.892us 4.315us 0.000us 0.00% 0.000us 0.000us 6
|
| 4062 |
+
cudaFuncSetAttribute 0.09% 4.710us 0.09% 4.710us 1.570us 0.000us 0.00% 0.000us 0.000us 3
|
| 4063 |
+
cudaLaunchKernel 4.17% 225.304us 4.17% 225.304us 75.101us 0.000us 0.00% 0.000us 0.000us 3
|
| 4064 |
+
cudaDeviceSynchronize 59.02% 3.189ms 59.02% 3.189ms 3.189ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4065 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4066 |
+
Self CPU time total: 5.403ms
|
| 4067 |
+
Self CUDA time total: 3.401ms
|
| 4068 |
|
| 4069 |
|
| 4070 |
impl wl p50(ms) ok
|
| 4071 |
+
hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.92 True
|
| 4072 |
+
hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.97 True
|
| 4073 |
+
hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.00 True
|
| 4074 |
+
hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.01 True
|
| 4075 |
+
hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.16 True
|
| 4076 |
+
hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.17 True
|
| 4077 |
</pre></div>
|
| 4078 |
+
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 4079 |
+
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4080 |
+
<div class="uv-logs-content" style="display: none;">
|
| 4081 |
+
Installed 14 packages in 11ms
|
| 4082 |
</div>
|
| 4083 |
+
</div>
|
| 4084 |
+
<div class="cell-stderr">Fetching 5 files: 0%| | 0/5 [00:00<?, ?it/s]
|
| 4085 |
+
Fetching 5 files: 40%|████ | 2/5 [00:01<00:02, 1.24it/s]
|
| 4086 |
+
Fetching 5 files: 100%|██████████| 5/5 [00:01<00:00, 3.09it/s]</div>
|
| 4087 |
<div class="cell-artifacts">
|
| 4088 |
<h4>Artifacts:</h4>
|
| 4089 |
<a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
|
flash_attn/impls/mem_efficient_attention.html
CHANGED
|
@@ -3874,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 3874 |
<div class="system-info">
|
| 3875 |
<div class="system-info-header">Generated on:</div>
|
| 3876 |
<div class="system-info-content">
|
| 3877 |
-
Linux x86_64 | Linux-6.12.
|
| 3878 |
</div>
|
| 3879 |
</div>
|
| 3880 |
|
|
@@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 3888 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3889 |
<span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3890 |
</span> |
|
| 3891 |
-
Cell: benchmark | 4.
|
| 3892 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3893 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3894 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3941,28 +3941,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L128_bfloat16
|
|
| 3941 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3942 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3943 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3944 |
-
torch_mem_eff 4.
|
| 3945 |
-
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.
|
| 3946 |
-
aten::scaled_dot_product_attention 0.
|
| 3947 |
-
aten::_scaled_dot_product_efficient_attention 0.
|
| 3948 |
-
aten::_efficient_attention_forward 0.
|
| 3949 |
-
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 4.
|
| 3950 |
-
aten::contiguous 0.
|
| 3951 |
-
aten::clone 0.
|
| 3952 |
-
aten::copy_
|
| 3953 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 3954 |
-
Activity Buffer Request 24.
|
| 3955 |
-
aten::transpose 0.
|
| 3956 |
-
aten::as_strided 0.
|
| 3957 |
-
aten::empty_like 0.20% 14.
|
| 3958 |
-
aten::empty 1.
|
| 3959 |
-
cudaLaunchKernel 1.
|
| 3960 |
-
cudaStreamIsCapturing 0.04% 3.
|
| 3961 |
-
cudaFuncSetAttribute 0.12%
|
| 3962 |
-
cudaDeviceSynchronize 64.
|
| 3963 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3964 |
-
Self CPU time total: 7.
|
| 3965 |
-
Self CUDA time total: 5.
|
| 3966 |
|
| 3967 |
|
| 3968 |
|
|
@@ -3972,28 +3972,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L256_bfloat16
|
|
| 3972 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3973 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3974 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3975 |
-
torch_mem_eff 3.
|
| 3976 |
-
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.
|
| 3977 |
-
aten::scaled_dot_product_attention 0.
|
| 3978 |
-
aten::_scaled_dot_product_efficient_attention 0.
|
| 3979 |
-
aten::_efficient_attention_forward 0.36% 27.
|
| 3980 |
-
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.
|
| 3981 |
-
aten::contiguous 0.
|
| 3982 |
-
aten::clone 0.
|
| 3983 |
-
aten::copy_ 0.
|
| 3984 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 3985 |
-
Activity Buffer Request
|
| 3986 |
-
aten::transpose 0.
|
| 3987 |
-
aten::as_strided 0.20% 15.
|
| 3988 |
-
aten::empty_like 0.
|
| 3989 |
-
aten::empty 0.
|
| 3990 |
-
cudaLaunchKernel 1.
|
| 3991 |
-
cudaStreamIsCapturing 0.03% 2.
|
| 3992 |
-
cudaFuncSetAttribute 0.
|
| 3993 |
-
cudaDeviceSynchronize
|
| 3994 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3995 |
-
Self CPU time total: 7.
|
| 3996 |
-
Self CUDA time total: 5.
|
| 3997 |
|
| 3998 |
|
| 3999 |
|
|
@@ -4003,28 +4003,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L320_bfloat16
|
|
| 4003 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4004 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4005 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4006 |
-
torch_mem_eff 3.
|
| 4007 |
-
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4008 |
-
aten::scaled_dot_product_attention 0.
|
| 4009 |
-
aten::_scaled_dot_product_efficient_attention 0.23% 18.
|
| 4010 |
-
aten::_efficient_attention_forward 0.
|
| 4011 |
-
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.
|
| 4012 |
-
aten::contiguous 0.10% 7.
|
| 4013 |
-
aten::clone 0.26% 20.
|
| 4014 |
-
aten::copy_ 0.
|
| 4015 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4016 |
-
Activity Buffer Request 22.
|
| 4017 |
-
aten::transpose 0.
|
| 4018 |
-
aten::as_strided 0.
|
| 4019 |
-
aten::empty_like 0.
|
| 4020 |
-
aten::empty 0.
|
| 4021 |
-
cudaLaunchKernel 1.
|
| 4022 |
-
cudaStreamIsCapturing 0.03% 2.
|
| 4023 |
-
cudaFuncSetAttribute 0.
|
| 4024 |
-
cudaDeviceSynchronize 69.
|
| 4025 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4026 |
-
Self CPU time total: 7.
|
| 4027 |
-
Self CUDA time total:
|
| 4028 |
|
| 4029 |
|
| 4030 |
|
|
@@ -4034,28 +4034,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L384_bfloat16
|
|
| 4034 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4035 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4036 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4037 |
-
torch_mem_eff
|
| 4038 |
-
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 4039 |
-
aten::scaled_dot_product_attention 0.
|
| 4040 |
-
aten::_scaled_dot_product_efficient_attention 0.
|
| 4041 |
-
aten::_efficient_attention_forward 0.
|
| 4042 |
-
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.
|
| 4043 |
-
aten::contiguous 0.
|
| 4044 |
-
aten::clone 0.
|
| 4045 |
-
aten::copy_ 0.
|
| 4046 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4047 |
-
Activity Buffer Request 21.
|
| 4048 |
-
aten::transpose 0.
|
| 4049 |
-
aten::as_strided 0.
|
| 4050 |
-
aten::empty_like 0.15%
|
| 4051 |
-
aten::empty 0.
|
| 4052 |
-
cudaLaunchKernel 3.
|
| 4053 |
-
cudaStreamIsCapturing 0.03% 2.
|
| 4054 |
-
cudaFuncSetAttribute 0.04% 3.
|
| 4055 |
-
cudaDeviceSynchronize
|
| 4056 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4057 |
-
Self CPU time total: 8.
|
| 4058 |
-
Self CUDA time total: 6.
|
| 4059 |
|
| 4060 |
|
| 4061 |
|
|
@@ -4065,28 +4065,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L448_bfloat16
|
|
| 4065 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4066 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4067 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4068 |
-
torch_mem_eff 2.
|
| 4069 |
-
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 4070 |
-
aten::scaled_dot_product_attention 0.
|
| 4071 |
-
aten::_scaled_dot_product_efficient_attention 0.
|
| 4072 |
-
aten::_efficient_attention_forward 0.
|
| 4073 |
-
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.
|
| 4074 |
-
aten::contiguous 0.
|
| 4075 |
-
aten::clone 0.
|
| 4076 |
-
aten::copy_ 0.
|
| 4077 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4078 |
-
Activity Buffer Request 21.
|
| 4079 |
-
aten::transpose 0.
|
| 4080 |
-
aten::as_strided 0.19% 15.
|
| 4081 |
-
aten::empty_like 0.14% 11.
|
| 4082 |
-
aten::empty 0.
|
| 4083 |
-
cudaLaunchKernel
|
| 4084 |
-
cudaStreamIsCapturing 0.03% 2.
|
| 4085 |
-
cudaFuncSetAttribute 0.04% 3.
|
| 4086 |
-
cudaDeviceSynchronize 68.
|
| 4087 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4088 |
-
Self CPU time total: 8.
|
| 4089 |
-
Self CUDA time total: 6.
|
| 4090 |
|
| 4091 |
|
| 4092 |
|
|
@@ -4096,37 +4096,37 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L512_bfloat16
|
|
| 4096 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4097 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4098 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4099 |
-
torch_mem_eff 2.
|
| 4100 |
-
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 4101 |
-
aten::scaled_dot_product_attention 0.
|
| 4102 |
-
aten::_scaled_dot_product_efficient_attention 0.
|
| 4103 |
-
aten::_efficient_attention_forward 0.
|
| 4104 |
-
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4105 |
-
aten::contiguous 0.09% 7.
|
| 4106 |
-
aten::clone 0.
|
| 4107 |
-
aten::copy_ 0.
|
| 4108 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4109 |
-
Activity Buffer Request 19.
|
| 4110 |
-
aten::transpose 0.
|
| 4111 |
-
aten::as_strided 0.18% 15.
|
| 4112 |
-
aten::empty_like 0.
|
| 4113 |
-
aten::empty 0.
|
| 4114 |
-
cudaLaunchKernel 2.
|
| 4115 |
-
cudaStreamIsCapturing 0.03% 2.
|
| 4116 |
-
cudaFuncSetAttribute 0.
|
| 4117 |
-
cudaDeviceSynchronize
|
| 4118 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4119 |
-
Self CPU time total: 8.
|
| 4120 |
-
Self CUDA time total: 6.
|
| 4121 |
|
| 4122 |
|
| 4123 |
impl wl p50(ms) ok
|
| 4124 |
-
torch_mem_eff cuda_attn_L128_bfloat16 1.
|
| 4125 |
-
torch_mem_eff cuda_attn_L256_bfloat16 1.
|
| 4126 |
-
torch_mem_eff cuda_attn_L320_bfloat16 1.
|
| 4127 |
-
torch_mem_eff cuda_attn_L384_bfloat16 2.
|
| 4128 |
-
torch_mem_eff cuda_attn_L448_bfloat16 2.
|
| 4129 |
-
torch_mem_eff cuda_attn_L512_bfloat16 2.
|
| 4130 |
</pre></div>
|
| 4131 |
<div class="cell-artifacts">
|
| 4132 |
<h4>Artifacts:</h4>
|
|
|
|
| 3874 |
<div class="system-info">
|
| 3875 |
<div class="system-info-header">Generated on:</div>
|
| 3876 |
<div class="system-info-content">
|
| 3877 |
+
Linux x86_64 | Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35
|
| 3878 |
</div>
|
| 3879 |
</div>
|
| 3880 |
|
|
|
|
| 3888 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3889 |
<span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3890 |
</span> |
|
| 3891 |
+
Cell: benchmark | 4.15s
|
| 3892 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3893 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3894 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3941 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3942 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3943 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3944 |
+
torch_mem_eff 4.11% 302.695us 35.19% 2.592ms 2.592ms 0.000us 0.00% 5.476ms 5.476ms 1
|
| 3945 |
+
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.440ms 100.33% 5.440ms 5.440ms 1
|
| 3946 |
+
aten::scaled_dot_product_attention 0.40% 29.210us 2.30% 169.213us 56.404us 0.000us 0.00% 4.805ms 1.602ms 3
|
| 3947 |
+
aten::_scaled_dot_product_efficient_attention 0.29% 21.719us 1.90% 140.003us 46.668us 0.000us 0.00% 4.805ms 1.602ms 3
|
| 3948 |
+
aten::_efficient_attention_forward 0.48% 35.571us 1.32% 97.242us 32.414us 4.805ms 88.62% 4.805ms 1.602ms 3
|
| 3949 |
+
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 4.805ms 88.62% 4.805ms 1.602ms 3
|
| 3950 |
+
aten::contiguous 0.13% 9.829us 27.98% 2.062ms 229.090us 0.000us 0.00% 670.404us 74.489us 9
|
| 3951 |
+
aten::clone 0.35% 25.869us 27.85% 2.052ms 227.998us 0.000us 0.00% 670.404us 74.489us 9
|
| 3952 |
+
aten::copy_ 0.98% 72.210us 26.54% 1.956ms 217.285us 616.836us 11.38% 670.404us 74.489us 9
|
| 3953 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 616.836us 11.38% 616.836us 68.537us 9
|
| 3954 |
+
Activity Buffer Request 24.39% 1.797ms 24.39% 1.797ms 1.797ms 53.568us 0.99% 53.568us 53.568us 1
|
| 3955 |
+
aten::transpose 0.81% 59.530us 1.08% 79.784us 3.324us 0.000us 0.00% 0.000us 0.000us 24
|
| 3956 |
+
aten::as_strided 0.27% 20.254us 0.27% 20.254us 0.844us 0.000us 0.00% 0.000us 0.000us 24
|
| 3957 |
+
aten::empty_like 0.20% 14.892us 0.96% 70.554us 7.839us 0.000us 0.00% 0.000us 0.000us 9
|
| 3958 |
+
aten::empty 1.12% 82.341us 1.12% 82.341us 3.921us 0.000us 0.00% 0.000us 0.000us 21
|
| 3959 |
+
cudaLaunchKernel 1.48% 109.241us 1.48% 109.241us 9.103us 0.000us 0.00% 0.000us 0.000us 12
|
| 3960 |
+
cudaStreamIsCapturing 0.04% 3.240us 0.04% 3.240us 1.080us 0.000us 0.00% 0.000us 0.000us 3
|
| 3961 |
+
cudaFuncSetAttribute 0.12% 9.162us 0.12% 9.162us 3.054us 0.000us 0.00% 0.000us 0.000us 3
|
| 3962 |
+
cudaDeviceSynchronize 64.81% 4.776ms 64.81% 4.776ms 4.776ms 0.000us 0.00% 0.000us 0.000us 1
|
| 3963 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3964 |
+
Self CPU time total: 7.368ms
|
| 3965 |
+
Self CUDA time total: 5.422ms
|
| 3966 |
|
| 3967 |
|
| 3968 |
|
|
|
|
| 3972 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3973 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3974 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3975 |
+
torch_mem_eff 3.18% 243.704us 30.16% 2.312ms 2.312ms 0.000us 0.00% 5.946ms 5.946ms 1
|
| 3976 |
+
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.900ms 100.14% 5.900ms 5.900ms 1
|
| 3977 |
+
aten::scaled_dot_product_attention 0.23% 17.410us 1.83% 139.893us 46.631us 0.000us 0.00% 5.256ms 1.752ms 3
|
| 3978 |
+
aten::_scaled_dot_product_efficient_attention 0.24% 18.330us 1.60% 122.483us 40.828us 0.000us 0.00% 5.256ms 1.752ms 3
|
| 3979 |
+
aten::_efficient_attention_forward 0.36% 27.350us 1.07% 81.803us 27.268us 5.256ms 89.21% 5.256ms 1.752ms 3
|
| 3980 |
+
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.256ms 89.21% 5.256ms 1.752ms 3
|
| 3981 |
+
aten::contiguous 0.10% 7.470us 24.63% 1.888ms 209.765us 0.000us 0.00% 690.500us 76.722us 9
|
| 3982 |
+
aten::clone 0.27% 20.522us 24.53% 1.880ms 208.935us 0.000us 0.00% 690.500us 76.722us 9
|
| 3983 |
+
aten::copy_ 0.86% 65.740us 23.60% 1.809ms 200.963us 635.844us 10.79% 690.500us 76.722us 9
|
| 3984 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 635.844us 10.79% 635.844us 70.649us 9
|
| 3985 |
+
Activity Buffer Request 21.87% 1.676ms 21.87% 1.676ms 1.676ms 54.656us 0.93% 54.656us 54.656us 1
|
| 3986 |
+
aten::transpose 0.62% 47.210us 0.82% 62.900us 2.621us 0.000us 0.00% 0.000us 0.000us 24
|
| 3987 |
+
aten::as_strided 0.20% 15.690us 0.20% 15.690us 0.654us 0.000us 0.00% 0.000us 0.000us 24
|
| 3988 |
+
aten::empty_like 0.16% 11.901us 0.67% 51.221us 5.691us 0.000us 0.00% 0.000us 0.000us 9
|
| 3989 |
+
aten::empty 0.85% 65.201us 0.85% 65.201us 3.105us 0.000us 0.00% 0.000us 0.000us 21
|
| 3990 |
+
cudaLaunchKernel 1.16% 89.161us 1.16% 89.161us 7.430us 0.000us 0.00% 0.000us 0.000us 12
|
| 3991 |
+
cudaStreamIsCapturing 0.03% 2.381us 0.03% 2.381us 0.794us 0.000us 0.00% 0.000us 0.000us 3
|
| 3992 |
+
cudaFuncSetAttribute 0.05% 3.881us 0.05% 3.881us 1.294us 0.000us 0.00% 0.000us 0.000us 3
|
| 3993 |
+
cudaDeviceSynchronize 69.84% 5.353ms 69.84% 5.353ms 5.353ms 0.000us 0.00% 0.000us 0.000us 1
|
| 3994 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3995 |
+
Self CPU time total: 7.665ms
|
| 3996 |
+
Self CUDA time total: 5.891ms
|
| 3997 |
|
| 3998 |
|
| 3999 |
|
|
|
|
| 4003 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4004 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4005 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4006 |
+
torch_mem_eff 3.05% 239.816us 30.60% 2.409ms 2.409ms 0.000us 0.00% 6.068ms 6.068ms 1
|
| 4007 |
+
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.021ms 100.14% 6.021ms 6.021ms 1
|
| 4008 |
+
aten::scaled_dot_product_attention 0.23% 17.959us 1.79% 140.600us 46.867us 0.000us 0.00% 5.365ms 1.788ms 3
|
| 4009 |
+
aten::_scaled_dot_product_efficient_attention 0.23% 18.141us 1.56% 122.641us 40.880us 0.000us 0.00% 5.365ms 1.788ms 3
|
| 4010 |
+
aten::_efficient_attention_forward 0.36% 28.699us 1.04% 81.531us 27.177us 5.365ms 89.24% 5.365ms 1.788ms 3
|
| 4011 |
+
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.365ms 89.24% 5.365ms 1.788ms 3
|
| 4012 |
+
aten::contiguous 0.10% 7.861us 25.24% 1.987ms 220.773us 0.000us 0.00% 702.468us 78.052us 9
|
| 4013 |
+
aten::clone 0.26% 20.540us 25.14% 1.979ms 219.899us 0.000us 0.00% 702.468us 78.052us 9
|
| 4014 |
+
aten::copy_ 0.92% 72.171us 24.24% 1.908ms 212.002us 646.884us 10.76% 702.468us 78.052us 9
|
| 4015 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 646.884us 10.76% 646.884us 71.876us 9
|
| 4016 |
+
Activity Buffer Request 22.46% 1.768ms 22.46% 1.768ms 1.768ms 55.584us 0.92% 55.584us 55.584us 1
|
| 4017 |
+
aten::transpose 0.60% 47.471us 0.81% 64.120us 2.672us 0.000us 0.00% 0.000us 0.000us 24
|
| 4018 |
+
aten::as_strided 0.21% 16.649us 0.21% 16.649us 0.694us 0.000us 0.00% 0.000us 0.000us 24
|
| 4019 |
+
aten::empty_like 0.15% 11.960us 0.64% 50.531us 5.615us 0.000us 0.00% 0.000us 0.000us 9
|
| 4020 |
+
aten::empty 0.81% 63.971us 0.81% 63.971us 3.046us 0.000us 0.00% 0.000us 0.000us 21
|
| 4021 |
+
cudaLaunchKernel 1.13% 89.282us 1.13% 89.282us 7.440us 0.000us 0.00% 0.000us 0.000us 12
|
| 4022 |
+
cudaStreamIsCapturing 0.03% 2.660us 0.03% 2.660us 0.887us 0.000us 0.00% 0.000us 0.000us 3
|
| 4023 |
+
cudaFuncSetAttribute 0.04% 3.150us 0.04% 3.150us 1.050us 0.000us 0.00% 0.000us 0.000us 3
|
| 4024 |
+
cudaDeviceSynchronize 69.40% 5.462ms 69.40% 5.462ms 5.462ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4025 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4026 |
+
Self CPU time total: 7.871ms
|
| 4027 |
+
Self CUDA time total: 6.012ms
|
| 4028 |
|
| 4029 |
|
| 4030 |
|
|
|
|
| 4034 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4035 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4036 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4037 |
+
torch_mem_eff 2.93% 240.625us 31.13% 2.555ms 2.555ms 0.000us 0.00% 6.259ms 6.259ms 1
|
| 4038 |
+
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.208ms 100.13% 6.208ms 6.208ms 1
|
| 4039 |
+
aten::scaled_dot_product_attention 0.21% 17.361us 1.73% 142.203us 47.401us 0.000us 0.00% 5.537ms 1.846ms 3
|
| 4040 |
+
aten::_scaled_dot_product_efficient_attention 0.22% 18.441us 1.52% 124.842us 41.614us 0.000us 0.00% 5.537ms 1.846ms 3
|
| 4041 |
+
aten::_efficient_attention_forward 0.36% 29.601us 1.03% 84.471us 28.157us 5.537ms 89.30% 5.537ms 1.846ms 3
|
| 4042 |
+
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.537ms 89.30% 5.537ms 1.846ms 3
|
| 4043 |
+
aten::contiguous 0.09% 7.769us 25.95% 2.130ms 236.658us 0.000us 0.00% 721.984us 80.220us 9
|
| 4044 |
+
aten::clone 0.26% 21.609us 25.85% 2.122ms 235.795us 0.000us 0.00% 721.984us 80.220us 9
|
| 4045 |
+
aten::copy_ 0.80% 65.822us 24.94% 2.047ms 227.475us 663.552us 10.70% 721.984us 80.220us 9
|
| 4046 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 663.552us 10.70% 663.552us 73.728us 9
|
| 4047 |
+
Activity Buffer Request 21.30% 1.749ms 21.30% 1.749ms 1.749ms 58.432us 0.94% 58.432us 58.432us 1
|
| 4048 |
+
aten::transpose 0.59% 48.680us 0.78% 64.131us 2.672us 0.000us 0.00% 0.000us 0.000us 24
|
| 4049 |
+
aten::as_strided 0.19% 15.451us 0.19% 15.451us 0.644us 0.000us 0.00% 0.000us 0.000us 24
|
| 4050 |
+
aten::empty_like 0.15% 12.591us 0.65% 53.271us 5.919us 0.000us 0.00% 0.000us 0.000us 9
|
| 4051 |
+
aten::empty 0.81% 66.120us 0.81% 66.120us 3.149us 0.000us 0.00% 0.000us 0.000us 21
|
| 4052 |
+
cudaLaunchKernel 3.12% 256.044us 3.12% 256.044us 21.337us 0.000us 0.00% 0.000us 0.000us 12
|
| 4053 |
+
cudaStreamIsCapturing 0.03% 2.670us 0.03% 2.670us 0.890us 0.000us 0.00% 0.000us 0.000us 3
|
| 4054 |
+
cudaFuncSetAttribute 0.04% 3.480us 0.04% 3.480us 1.160us 0.000us 0.00% 0.000us 0.000us 3
|
| 4055 |
+
cudaDeviceSynchronize 68.87% 5.653ms 68.87% 5.653ms 5.653ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4056 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4057 |
+
Self CPU time total: 8.208ms
|
| 4058 |
+
Self CUDA time total: 6.200ms
|
| 4059 |
|
| 4060 |
|
| 4061 |
|
|
|
|
| 4065 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4066 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4067 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4068 |
+
torch_mem_eff 2.93% 245.582us 31.52% 2.645ms 2.645ms 0.000us 0.00% 6.354ms 6.354ms 1
|
| 4069 |
+
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.303ms 100.13% 6.303ms 6.303ms 1
|
| 4070 |
+
aten::scaled_dot_product_attention 0.20% 17.170us 1.68% 140.693us 46.898us 0.000us 0.00% 5.628ms 1.876ms 3
|
| 4071 |
+
aten::_scaled_dot_product_efficient_attention 0.21% 17.520us 1.47% 123.523us 41.174us 0.000us 0.00% 5.628ms 1.876ms 3
|
| 4072 |
+
aten::_efficient_attention_forward 0.35% 29.440us 1.00% 84.263us 28.088us 5.628ms 89.41% 5.628ms 1.876ms 3
|
| 4073 |
+
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.628ms 89.41% 5.628ms 1.876ms 3
|
| 4074 |
+
aten::contiguous 0.09% 7.259us 26.43% 2.218ms 246.393us 0.000us 0.00% 726.309us 80.701us 9
|
| 4075 |
+
aten::clone 0.25% 21.219us 26.34% 2.210ms 245.587us 0.000us 0.00% 726.309us 80.701us 9
|
| 4076 |
+
aten::copy_ 0.78% 65.083us 25.46% 2.136ms 237.368us 666.948us 10.59% 726.309us 80.701us 9
|
| 4077 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 666.948us 10.59% 666.948us 74.105us 9
|
| 4078 |
+
Activity Buffer Request 21.84% 1.833ms 21.84% 1.833ms 1.833ms 59.361us 0.94% 59.361us 59.361us 1
|
| 4079 |
+
aten::transpose 0.56% 46.780us 0.75% 62.730us 2.614us 0.000us 0.00% 0.000us 0.000us 24
|
| 4080 |
+
aten::as_strided 0.19% 15.950us 0.19% 15.950us 0.665us 0.000us 0.00% 0.000us 0.000us 24
|
| 4081 |
+
aten::empty_like 0.14% 11.512us 0.63% 52.753us 5.861us 0.000us 0.00% 0.000us 0.000us 9
|
| 4082 |
+
aten::empty 0.79% 66.642us 0.79% 66.642us 3.173us 0.000us 0.00% 0.000us 0.000us 21
|
| 4083 |
+
cudaLaunchKernel 3.12% 261.945us 3.12% 261.945us 21.829us 0.000us 0.00% 0.000us 0.000us 12
|
| 4084 |
+
cudaStreamIsCapturing 0.03% 2.500us 0.03% 2.500us 0.833us 0.000us 0.00% 0.000us 0.000us 3
|
| 4085 |
+
cudaFuncSetAttribute 0.04% 3.581us 0.04% 3.581us 1.194us 0.000us 0.00% 0.000us 0.000us 3
|
| 4086 |
+
cudaDeviceSynchronize 68.48% 5.745ms 68.48% 5.745ms 5.745ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4087 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4088 |
+
Self CPU time total: 8.390ms
|
| 4089 |
+
Self CUDA time total: 6.295ms
|
| 4090 |
|
| 4091 |
|
| 4092 |
|
|
|
|
| 4096 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4097 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4098 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4099 |
+
torch_mem_eff 2.68% 234.298us 28.81% 2.516ms 2.516ms 0.000us 0.00% 6.820ms 6.820ms 1
|
| 4100 |
+
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.768ms 100.12% 6.768ms 6.768ms 1
|
| 4101 |
+
aten::scaled_dot_product_attention 0.20% 17.618us 1.61% 140.900us 46.967us 0.000us 0.00% 6.087ms 2.029ms 3
|
| 4102 |
+
aten::_scaled_dot_product_efficient_attention 0.21% 18.311us 1.41% 123.282us 41.094us 0.000us 0.00% 6.087ms 2.029ms 3
|
| 4103 |
+
aten::_efficient_attention_forward 0.33% 29.191us 0.95% 82.621us 27.540us 6.087ms 90.04% 6.087ms 2.029ms 3
|
| 4104 |
+
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 6.087ms 90.04% 6.087ms 2.029ms 3
|
| 4105 |
+
aten::contiguous 0.09% 7.641us 24.06% 2.101ms 233.417us 0.000us 0.00% 733.380us 81.487us 9
|
| 4106 |
+
aten::clone 0.23% 20.279us 23.97% 2.093ms 232.568us 0.000us 0.00% 733.380us 81.487us 9
|
| 4107 |
+
aten::copy_ 0.74% 64.431us 23.10% 2.017ms 224.097us 672.964us 9.96% 733.380us 81.487us 9
|
| 4108 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 672.964us 9.96% 672.964us 74.774us 9
|
| 4109 |
+
Activity Buffer Request 19.61% 1.713ms 19.61% 1.713ms 1.713ms 60.416us 0.89% 60.416us 60.416us 1
|
| 4110 |
+
aten::transpose 0.53% 46.410us 0.71% 62.109us 2.588us 0.000us 0.00% 0.000us 0.000us 24
|
| 4111 |
+
aten::as_strided 0.18% 15.699us 0.18% 15.699us 0.654us 0.000us 0.00% 0.000us 0.000us 24
|
| 4112 |
+
aten::empty_like 0.15% 12.751us 0.64% 55.961us 6.218us 0.000us 0.00% 0.000us 0.000us 9
|
| 4113 |
+
aten::empty 0.79% 69.050us 0.79% 69.050us 3.288us 0.000us 0.00% 0.000us 0.000us 21
|
| 4114 |
+
cudaLaunchKernel 2.99% 261.415us 2.99% 261.415us 21.785us 0.000us 0.00% 0.000us 0.000us 12
|
| 4115 |
+
cudaStreamIsCapturing 0.03% 2.920us 0.03% 2.920us 0.973us 0.000us 0.00% 0.000us 0.000us 3
|
| 4116 |
+
cudaFuncSetAttribute 0.03% 2.980us 0.03% 2.980us 0.993us 0.000us 0.00% 0.000us 0.000us 3
|
| 4117 |
+
cudaDeviceSynchronize 71.19% 6.216ms 71.19% 6.216ms 6.216ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4118 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4119 |
+
Self CPU time total: 8.732ms
|
| 4120 |
+
Self CUDA time total: 6.759ms
|
| 4121 |
|
| 4122 |
|
| 4123 |
impl wl p50(ms) ok
|
| 4124 |
+
torch_mem_eff cuda_attn_L128_bfloat16 1.84 True
|
| 4125 |
+
torch_mem_eff cuda_attn_L256_bfloat16 1.95 True
|
| 4126 |
+
torch_mem_eff cuda_attn_L320_bfloat16 1.97 True
|
| 4127 |
+
torch_mem_eff cuda_attn_L384_bfloat16 2.08 True
|
| 4128 |
+
torch_mem_eff cuda_attn_L448_bfloat16 2.04 True
|
| 4129 |
+
torch_mem_eff cuda_attn_L512_bfloat16 2.25 True
|
| 4130 |
</pre></div>
|
| 4131 |
<div class="cell-artifacts">
|
| 4132 |
<h4>Artifacts:</h4>
|
flash_attn/impls/sage_attention.html
CHANGED
|
@@ -3874,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 3874 |
<div class="system-info">
|
| 3875 |
<div class="system-info-header">Generated on:</div>
|
| 3876 |
<div class="system-info-content">
|
| 3877 |
-
Linux x86_64 | Linux-6.12.
|
| 3878 |
</div>
|
| 3879 |
</div>
|
| 3880 |
|
|
@@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 3888 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3889 |
<span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3890 |
</span> |
|
| 3891 |
-
Cell: benchmark | 4.
|
| 3892 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3893 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3894 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3938,22 +3938,22 @@ Cell: benchmark | 4.59s
|
|
| 3938 |
<div class="cell-stdout"><pre class="stdout-text">Running attention benchmark on cuda with 6 workloads.
|
| 3939 |
impl wl p50(ms) ok
|
| 3940 |
sage_int8_fp16 cuda_attn_L128_bfloat16 FAIL False
|
| 3941 |
-
Error: module '
|
| 3942 |
sage_int8_fp16 cuda_attn_L256_bfloat16 FAIL False
|
| 3943 |
-
Error: module '
|
| 3944 |
sage_int8_fp16 cuda_attn_L320_bfloat16 FAIL False
|
| 3945 |
-
Error: module '
|
| 3946 |
sage_int8_fp16 cuda_attn_L384_bfloat16 FAIL False
|
| 3947 |
-
Error: module '
|
| 3948 |
sage_int8_fp16 cuda_attn_L448_bfloat16 FAIL False
|
| 3949 |
-
Error: module '
|
| 3950 |
sage_int8_fp16 cuda_attn_L512_bfloat16 FAIL False
|
| 3951 |
-
Error: module '
|
| 3952 |
</pre></div>
|
| 3953 |
<div class="cell-stderr">
|
| 3954 |
-
Fetching
|
| 3955 |
-
Fetching
|
| 3956 |
-
Fetching
|
| 3957 |
</div>
|
| 3958 |
<div class="cell-artifacts">
|
| 3959 |
<h4>Artifacts:</h4>
|
|
|
|
| 3874 |
<div class="system-info">
|
| 3875 |
<div class="system-info-header">Generated on:</div>
|
| 3876 |
<div class="system-info-content">
|
| 3877 |
+
Linux x86_64 | Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35
|
| 3878 |
</div>
|
| 3879 |
</div>
|
| 3880 |
|
|
|
|
| 3888 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3889 |
<span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3890 |
</span> |
|
| 3891 |
+
Cell: benchmark | 4.58s
|
| 3892 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3893 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3894 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3938 |
<div class="cell-stdout"><pre class="stdout-text">Running attention benchmark on cuda with 6 workloads.
|
| 3939 |
impl wl p50(ms) ok
|
| 3940 |
sage_int8_fp16 cuda_attn_L128_bfloat16 FAIL False
|
| 3941 |
+
Error: module 'sage_attention_e8dcde4226fe38e6' has no attribute 'fwd'
|
| 3942 |
sage_int8_fp16 cuda_attn_L256_bfloat16 FAIL False
|
| 3943 |
+
Error: module 'sage_attention_e8dcde4226fe38e6' has no attribute 'fwd'
|
| 3944 |
sage_int8_fp16 cuda_attn_L320_bfloat16 FAIL False
|
| 3945 |
+
Error: module 'sage_attention_e8dcde4226fe38e6' has no attribute 'fwd'
|
| 3946 |
sage_int8_fp16 cuda_attn_L384_bfloat16 FAIL False
|
| 3947 |
+
Error: module 'sage_attention_e8dcde4226fe38e6' has no attribute 'fwd'
|
| 3948 |
sage_int8_fp16 cuda_attn_L448_bfloat16 FAIL False
|
| 3949 |
+
Error: module 'sage_attention_e8dcde4226fe38e6' has no attribute 'fwd'
|
| 3950 |
sage_int8_fp16 cuda_attn_L512_bfloat16 FAIL False
|
| 3951 |
+
Error: module 'sage_attention_e8dcde4226fe38e6' has no attribute 'fwd'
|
| 3952 |
</pre></div>
|
| 3953 |
<div class="cell-stderr">
|
| 3954 |
+
Fetching 8 files: 0%| | 0/8 [00:00<?, ?it/s]
|
| 3955 |
+
Fetching 8 files: 38%|███▊ | 3/8 [00:00<00:00, 5.88it/s]
|
| 3956 |
+
Fetching 8 files: 100%|██████████| 8/8 [00:00<00:00, 15.67it/s]
|
| 3957 |
</div>
|
| 3958 |
<div class="cell-artifacts">
|
| 3959 |
<h4>Artifacts:</h4>
|
flash_attn/impls/xformers.html
CHANGED
|
@@ -3874,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 3874 |
<div class="system-info">
|
| 3875 |
<div class="system-info-header">Generated on:</div>
|
| 3876 |
<div class="system-info-content">
|
| 3877 |
-
Linux x86_64 | Linux-6.12.
|
| 3878 |
</div>
|
| 3879 |
</div>
|
| 3880 |
|
|
@@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 3888 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3889 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3890 |
</span> |
|
| 3891 |
-
Cell: benchmark |
|
| 3892 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3893 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3894 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3940,21 +3940,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L128_bfloat16
|
|
| 3940 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3941 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3942 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3943 |
-
xformers_meff 9.
|
| 3944 |
-
xformers_flash3::flash_fwd
|
| 3945 |
-
flash_attn_3::fwd 1.
|
| 3946 |
-
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3947 |
-
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3948 |
-
Activity Buffer Request
|
| 3949 |
-
aten::empty 0.
|
| 3950 |
-
cudaFuncSetAttribute 0.26% 12.
|
| 3951 |
-
cudaLaunchKernel 0.
|
| 3952 |
-
aten::reshape 0.
|
| 3953 |
-
aten::view 0.
|
| 3954 |
-
cudaDeviceSynchronize
|
| 3955 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3956 |
-
Self CPU time total: 4.
|
| 3957 |
-
Self CUDA time total: 2.
|
| 3958 |
|
| 3959 |
|
| 3960 |
|
|
@@ -3964,21 +3964,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L256_bfloat16
|
|
| 3964 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3965 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3966 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3967 |
-
xformers_meff 6.
|
| 3968 |
-
xformers_flash3::flash_fwd
|
| 3969 |
-
flash_attn_3::fwd 1.
|
| 3970 |
-
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3971 |
-
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3972 |
-
Activity Buffer Request
|
| 3973 |
-
aten::empty 0.
|
| 3974 |
-
cudaFuncSetAttribute 0.
|
| 3975 |
-
cudaLaunchKernel 0.
|
| 3976 |
-
aten::reshape 0.
|
| 3977 |
-
aten::view 0.
|
| 3978 |
-
cudaDeviceSynchronize
|
| 3979 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3980 |
-
Self CPU time total: 4.
|
| 3981 |
-
Self CUDA time total: 2.
|
| 3982 |
|
| 3983 |
|
| 3984 |
|
|
@@ -3988,21 +3988,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L320_bfloat16
|
|
| 3988 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3989 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3990 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3991 |
-
xformers_meff 6.
|
| 3992 |
-
xformers_flash3::flash_fwd 2.
|
| 3993 |
-
flash_attn_3::fwd 1.
|
| 3994 |
-
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3995 |
-
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3996 |
-
Activity Buffer Request
|
| 3997 |
-
aten::empty 0.
|
| 3998 |
-
cudaFuncSetAttribute 0.
|
| 3999 |
-
cudaLaunchKernel 0.71%
|
| 4000 |
-
aten::reshape 0.
|
| 4001 |
-
aten::view 0.
|
| 4002 |
-
cudaDeviceSynchronize
|
| 4003 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4004 |
-
Self CPU time total: 4.
|
| 4005 |
-
Self CUDA time total: 2.
|
| 4006 |
|
| 4007 |
|
| 4008 |
|
|
@@ -4012,21 +4012,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L384_bfloat16
|
|
| 4012 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4013 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4014 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4015 |
-
xformers_meff 6.
|
| 4016 |
-
xformers_flash3::flash_fwd
|
| 4017 |
-
flash_attn_3::fwd
|
| 4018 |
-
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 4019 |
-
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 4020 |
-
Activity Buffer Request
|
| 4021 |
-
aten::empty 0.
|
| 4022 |
-
cudaFuncSetAttribute 0.
|
| 4023 |
-
cudaLaunchKernel 3.
|
| 4024 |
-
aten::reshape 0.
|
| 4025 |
-
aten::view 0.
|
| 4026 |
-
cudaDeviceSynchronize 49.
|
| 4027 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4028 |
-
Self CPU time total:
|
| 4029 |
-
Self CUDA time total: 2.
|
| 4030 |
|
| 4031 |
|
| 4032 |
|
|
@@ -4036,21 +4036,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L448_bfloat16
|
|
| 4036 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4037 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4038 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4039 |
-
xformers_meff 5.
|
| 4040 |
-
xformers_flash3::flash_fwd 2.
|
| 4041 |
-
flash_attn_3::fwd 0.92%
|
| 4042 |
-
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4043 |
-
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4044 |
-
Activity Buffer Request
|
| 4045 |
-
aten::empty 0.52%
|
| 4046 |
-
cudaFuncSetAttribute 0.10% 5.
|
| 4047 |
-
cudaLaunchKernel 3.
|
| 4048 |
-
aten::reshape 0.
|
| 4049 |
-
aten::view 0.24% 13.
|
| 4050 |
-
cudaDeviceSynchronize 54.
|
| 4051 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4052 |
-
Self CPU time total: 5.
|
| 4053 |
-
Self CUDA time total: 3.
|
| 4054 |
|
| 4055 |
|
| 4056 |
|
|
@@ -4060,37 +4060,37 @@ PROFILE TRACE: xformers_meff | cuda_attn_L512_bfloat16
|
|
| 4060 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4061 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4062 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4063 |
-
xformers_meff 5.
|
| 4064 |
-
xformers_flash3::flash_fwd 2.
|
| 4065 |
-
flash_attn_3::fwd 0.
|
| 4066 |
-
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4067 |
-
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4068 |
-
Activity Buffer Request 31.
|
| 4069 |
-
aten::empty 0.
|
| 4070 |
-
cudaFuncSetAttribute 0.10% 5.
|
| 4071 |
-
cudaLaunchKernel 3.
|
| 4072 |
-
aten::reshape 0.15% 8.
|
| 4073 |
-
aten::view 0.24%
|
| 4074 |
-
cudaDeviceSynchronize 55.
|
| 4075 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4076 |
-
Self CPU time total: 5.
|
| 4077 |
-
Self CUDA time total: 3.
|
| 4078 |
|
| 4079 |
|
| 4080 |
impl wl p50(ms) ok
|
| 4081 |
-
xformers_meff cuda_attn_L128_bfloat16 0.
|
| 4082 |
-
xformers_meff cuda_attn_L256_bfloat16 1.
|
| 4083 |
-
xformers_meff cuda_attn_L320_bfloat16 1.
|
| 4084 |
-
xformers_meff cuda_attn_L384_bfloat16 1.
|
| 4085 |
-
xformers_meff cuda_attn_L448_bfloat16 1.
|
| 4086 |
-
xformers_meff cuda_attn_L512_bfloat16 1.
|
| 4087 |
</pre></div>
|
| 4088 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 4089 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4090 |
<div class="uv-logs-content" style="display: none;">
|
| 4091 |
Downloading xformers (111.8MiB)
|
| 4092 |
-
|
| 4093 |
-
Installed
|
| 4094 |
</div>
|
| 4095 |
</div>
|
| 4096 |
<div class="cell-artifacts">
|
|
|
|
| 3874 |
<div class="system-info">
|
| 3875 |
<div class="system-info-header">Generated on:</div>
|
| 3876 |
<div class="system-info-content">
|
| 3877 |
+
Linux x86_64 | Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35
|
| 3878 |
</div>
|
| 3879 |
</div>
|
| 3880 |
|
|
|
|
| 3888 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3889 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3890 |
</span> |
|
| 3891 |
+
Cell: benchmark | 8.92s
|
| 3892 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3893 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3894 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3940 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3941 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3942 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3943 |
+
xformers_meff 9.64% 463.299us 53.77% 2.584ms 2.584ms 0.000us 0.00% 3.636ms 3.636ms 1
|
| 3944 |
+
xformers_flash3::flash_fwd 3.92% 188.192us 43.38% 2.085ms 694.978us 0.000us 0.00% 3.636ms 1.212ms 3
|
| 3945 |
+
flash_attn_3::fwd 1.40% 67.082us 39.46% 1.897ms 632.248us 2.748ms 100.00% 3.636ms 1.212ms 3
|
| 3946 |
+
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.749ms 100.05% 2.749ms 2.749ms 1
|
| 3947 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.748ms 100.00% 2.748ms 915.935us 3
|
| 3948 |
+
Activity Buffer Request 36.10% 1.735ms 36.10% 1.735ms 1.735ms 887.807us 32.31% 887.807us 887.807us 1
|
| 3949 |
+
aten::empty 0.82% 39.381us 0.82% 39.381us 6.563us 0.000us 0.00% 0.000us 0.000us 6
|
| 3950 |
+
cudaFuncSetAttribute 0.26% 12.540us 0.26% 12.540us 4.180us 0.000us 0.00% 0.000us 0.000us 3
|
| 3951 |
+
cudaLaunchKernel 0.88% 42.510us 0.88% 42.510us 14.170us 0.000us 0.00% 0.000us 0.000us 3
|
| 3952 |
+
aten::reshape 0.25% 12.121us 0.75% 35.870us 5.978us 0.000us 0.00% 0.000us 0.000us 6
|
| 3953 |
+
aten::view 0.49% 23.749us 0.49% 23.749us 3.958us 0.000us 0.00% 0.000us 0.000us 6
|
| 3954 |
+
cudaDeviceSynchronize 46.23% 2.222ms 46.23% 2.222ms 2.222ms 0.000us 0.00% 0.000us 0.000us 1
|
| 3955 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3956 |
+
Self CPU time total: 4.806ms
|
| 3957 |
+
Self CUDA time total: 2.748ms
|
| 3958 |
|
| 3959 |
|
| 3960 |
|
|
|
|
| 3964 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3965 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3966 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3967 |
+
xformers_meff 6.94% 327.436us 51.65% 2.436ms 2.436ms 0.000us 0.00% 3.659ms 3.659ms 1
|
| 3968 |
+
xformers_flash3::flash_fwd 3.29% 155.063us 44.22% 2.085ms 695.085us 0.000us 0.00% 3.659ms 1.220ms 3
|
| 3969 |
+
flash_attn_3::fwd 1.15% 54.292us 40.93% 1.930ms 643.398us 2.737ms 100.00% 3.659ms 1.220ms 3
|
| 3970 |
+
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.738ms 100.05% 2.738ms 2.738ms 1
|
| 3971 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.737ms 100.00% 2.737ms 912.235us 3
|
| 3972 |
+
Activity Buffer Request 38.21% 1.802ms 38.21% 1.802ms 1.802ms 922.336us 33.70% 922.336us 922.336us 1
|
| 3973 |
+
aten::empty 0.70% 32.930us 0.70% 32.930us 5.488us 0.000us 0.00% 0.000us 0.000us 6
|
| 3974 |
+
cudaFuncSetAttribute 0.12% 5.760us 0.12% 5.760us 1.920us 0.000us 0.00% 0.000us 0.000us 3
|
| 3975 |
+
cudaLaunchKernel 0.75% 35.410us 0.75% 35.410us 11.803us 0.000us 0.00% 0.000us 0.000us 3
|
| 3976 |
+
aten::reshape 0.20% 9.409us 0.49% 22.989us 3.831us 0.000us 0.00% 0.000us 0.000us 6
|
| 3977 |
+
aten::view 0.29% 13.580us 0.29% 13.580us 2.263us 0.000us 0.00% 0.000us 0.000us 6
|
| 3978 |
+
cudaDeviceSynchronize 48.35% 2.280ms 48.35% 2.280ms 2.280ms 0.000us 0.00% 0.000us 0.000us 1
|
| 3979 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3980 |
+
Self CPU time total: 4.715ms
|
| 3981 |
+
Self CUDA time total: 2.737ms
|
| 3982 |
|
| 3983 |
|
| 3984 |
|
|
|
|
| 3988 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3989 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3990 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3991 |
+
xformers_meff 6.35% 296.325us 47.95% 2.238ms 2.238ms 0.000us 0.00% 3.787ms 3.787ms 1
|
| 3992 |
+
xformers_flash3::flash_fwd 2.95% 137.473us 41.12% 1.919ms 639.648us 0.000us 0.00% 3.787ms 1.262ms 3
|
| 3993 |
+
flash_attn_3::fwd 1.09% 50.850us 38.17% 1.781ms 593.823us 2.829ms 100.00% 3.787ms 1.262ms 3
|
| 3994 |
+
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.831ms 100.05% 2.831ms 2.831ms 1
|
| 3995 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.829ms 100.00% 2.829ms 943.127us 3
|
| 3996 |
+
Activity Buffer Request 35.64% 1.663ms 35.64% 1.663ms 1.663ms 957.186us 33.83% 957.186us 957.186us 1
|
| 3997 |
+
aten::empty 0.63% 29.301us 0.63% 29.301us 4.884us 0.000us 0.00% 0.000us 0.000us 6
|
| 3998 |
+
cudaFuncSetAttribute 0.11% 5.090us 0.11% 5.090us 1.697us 0.000us 0.00% 0.000us 0.000us 3
|
| 3999 |
+
cudaLaunchKernel 0.71% 33.151us 0.71% 33.151us 11.050us 0.000us 0.00% 0.000us 0.000us 3
|
| 4000 |
+
aten::reshape 0.18% 8.531us 0.48% 22.580us 3.763us 0.000us 0.00% 0.000us 0.000us 6
|
| 4001 |
+
aten::view 0.30% 14.049us 0.30% 14.049us 2.341us 0.000us 0.00% 0.000us 0.000us 6
|
| 4002 |
+
cudaDeviceSynchronize 52.05% 2.429ms 52.05% 2.429ms 2.429ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4003 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4004 |
+
Self CPU time total: 4.667ms
|
| 4005 |
+
Self CUDA time total: 2.829ms
|
| 4006 |
|
| 4007 |
|
| 4008 |
|
|
|
|
| 4012 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4013 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4014 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4015 |
+
xformers_meff 6.11% 304.138us 50.43% 2.511ms 2.511ms 0.000us 0.00% 3.860ms 3.860ms 1
|
| 4016 |
+
xformers_flash3::flash_fwd 3.07% 152.860us 43.87% 2.184ms 727.989us 0.000us 0.00% 3.860ms 1.287ms 3
|
| 4017 |
+
flash_attn_3::fwd 1.07% 53.395us 40.80% 2.031ms 677.035us 2.883ms 100.00% 3.860ms 1.287ms 3
|
| 4018 |
+
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.885ms 100.05% 2.885ms 2.885ms 1
|
| 4019 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.883ms 100.00% 2.883ms 961.001us 3
|
| 4020 |
+
Activity Buffer Request 34.97% 1.741ms 34.97% 1.741ms 1.741ms 977.086us 33.89% 977.086us 977.086us 1
|
| 4021 |
+
aten::empty 0.66% 32.699us 0.66% 32.699us 5.450us 0.000us 0.00% 0.000us 0.000us 6
|
| 4022 |
+
cudaFuncSetAttribute 0.12% 6.109us 0.12% 6.109us 2.036us 0.000us 0.00% 0.000us 0.000us 3
|
| 4023 |
+
cudaLaunchKernel 3.98% 197.963us 3.98% 197.963us 65.988us 0.000us 0.00% 0.000us 0.000us 3
|
| 4024 |
+
aten::reshape 0.17% 8.489us 0.45% 22.539us 3.757us 0.000us 0.00% 0.000us 0.000us 6
|
| 4025 |
+
aten::view 0.28% 14.050us 0.28% 14.050us 2.342us 0.000us 0.00% 0.000us 0.000us 6
|
| 4026 |
+
cudaDeviceSynchronize 49.57% 2.468ms 49.57% 2.468ms 2.468ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4027 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4028 |
+
Self CPU time total: 4.978ms
|
| 4029 |
+
Self CUDA time total: 2.883ms
|
| 4030 |
|
| 4031 |
|
| 4032 |
|
|
|
|
| 4036 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4037 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4038 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4039 |
+
xformers_meff 5.45% 299.105us 45.26% 2.482ms 2.482ms 0.000us 0.00% 4.556ms 4.556ms 1
|
| 4040 |
+
xformers_flash3::flash_fwd 2.57% 140.761us 39.42% 2.162ms 720.685us 0.000us 0.00% 4.556ms 1.519ms 3
|
| 4041 |
+
flash_attn_3::fwd 0.92% 50.555us 36.85% 2.021ms 673.765us 3.406ms 100.00% 4.556ms 1.519ms 3
|
| 4042 |
+
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.408ms 100.05% 3.408ms 3.408ms 1
|
| 4043 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.406ms 100.00% 3.406ms 1.135ms 3
|
| 4044 |
+
Activity Buffer Request 31.74% 1.741ms 31.74% 1.741ms 1.741ms 1.150ms 33.76% 1.150ms 1.150ms 1
|
| 4045 |
+
aten::empty 0.52% 28.258us 0.52% 28.258us 4.710us 0.000us 0.00% 0.000us 0.000us 6
|
| 4046 |
+
cudaFuncSetAttribute 0.10% 5.340us 0.10% 5.340us 1.780us 0.000us 0.00% 0.000us 0.000us 3
|
| 4047 |
+
cudaLaunchKernel 3.58% 196.453us 3.58% 196.453us 65.484us 0.000us 0.00% 0.000us 0.000us 3
|
| 4048 |
+
aten::reshape 0.14% 7.863us 0.39% 21.181us 3.530us 0.000us 0.00% 0.000us 0.000us 6
|
| 4049 |
+
aten::view 0.24% 13.318us 0.24% 13.318us 2.220us 0.000us 0.00% 0.000us 0.000us 6
|
| 4050 |
+
cudaDeviceSynchronize 54.74% 3.003ms 54.74% 3.003ms 3.003ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4051 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4052 |
+
Self CPU time total: 5.485ms
|
| 4053 |
+
Self CUDA time total: 3.406ms
|
| 4054 |
|
| 4055 |
|
| 4056 |
|
|
|
|
| 4060 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4061 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4062 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4063 |
+
xformers_meff 5.08% 273.484us 44.98% 2.423ms 2.423ms 0.000us 0.00% 4.494ms 4.494ms 1
|
| 4064 |
+
xformers_flash3::flash_fwd 2.55% 137.253us 39.52% 2.129ms 709.536us 0.000us 0.00% 4.494ms 1.498ms 3
|
| 4065 |
+
flash_attn_3::fwd 0.94% 50.440us 36.97% 1.991ms 663.785us 3.366ms 100.00% 4.494ms 1.498ms 3
|
| 4066 |
+
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.368ms 100.05% 3.368ms 3.368ms 1
|
| 4067 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.366ms 100.00% 3.366ms 1.122ms 3
|
| 4068 |
+
Activity Buffer Request 31.81% 1.713ms 31.81% 1.713ms 1.713ms 1.127ms 33.48% 1.127ms 1.127ms 1
|
| 4069 |
+
aten::empty 0.56% 30.302us 0.56% 30.302us 5.050us 0.000us 0.00% 0.000us 0.000us 6
|
| 4070 |
+
cudaFuncSetAttribute 0.10% 5.300us 0.10% 5.300us 1.767us 0.000us 0.00% 0.000us 0.000us 3
|
| 4071 |
+
cudaLaunchKernel 3.56% 191.983us 3.56% 191.983us 63.994us 0.000us 0.00% 0.000us 0.000us 3
|
| 4072 |
+
aten::reshape 0.15% 8.029us 0.39% 20.930us 3.488us 0.000us 0.00% 0.000us 0.000us 6
|
| 4073 |
+
aten::view 0.24% 12.901us 0.24% 12.901us 2.150us 0.000us 0.00% 0.000us 0.000us 6
|
| 4074 |
+
cudaDeviceSynchronize 55.02% 2.964ms 55.02% 2.964ms 2.964ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4075 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4076 |
+
Self CPU time total: 5.387ms
|
| 4077 |
+
Self CUDA time total: 3.366ms
|
| 4078 |
|
| 4079 |
|
| 4080 |
impl wl p50(ms) ok
|
| 4081 |
+
xformers_meff cuda_attn_L128_bfloat16 0.98 True
|
| 4082 |
+
xformers_meff cuda_attn_L256_bfloat16 1.03 True
|
| 4083 |
+
xformers_meff cuda_attn_L320_bfloat16 1.06 True
|
| 4084 |
+
xformers_meff cuda_attn_L384_bfloat16 1.06 True
|
| 4085 |
+
xformers_meff cuda_attn_L448_bfloat16 1.25 True
|
| 4086 |
+
xformers_meff cuda_attn_L512_bfloat16 1.23 True
|
| 4087 |
</pre></div>
|
| 4088 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 4089 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4090 |
<div class="uv-logs-content" style="display: none;">
|
| 4091 |
Downloading xformers (111.8MiB)
|
| 4092 |
+
Downloaded xformers
|
| 4093 |
+
Installed 38 packages in 217ms
|
| 4094 |
</div>
|
| 4095 |
</div>
|
| 4096 |
<div class="cell-artifacts">
|
flash_attn/results/artifacts/combine/latency.svg
CHANGED
|
|
Git LFS Details
|
|
|
Git LFS Details
|
flash_attn/results/cells/combine.py
CHANGED
|
@@ -20,6 +20,7 @@ cache_env_map = {
|
|
| 20 |
"HF Kernels Flash Attn": "UVNOTE_FILE_HF_KERNELS_FLASH_ATTN_BENCHMARK",
|
| 21 |
"HF Kernels Flash Attn3": "UVNOTE_FILE_HF_KERNELS_FLASH_ATTN3_BENCHMARK",
|
| 22 |
"SageAttention": "UVNOTE_FILE_SAGE_ATTENTION_BENCHMARK",
|
|
|
|
| 23 |
}
|
| 24 |
|
| 25 |
# Generate combined results with visualization
|
|
|
|
| 20 |
"HF Kernels Flash Attn": "UVNOTE_FILE_HF_KERNELS_FLASH_ATTN_BENCHMARK",
|
| 21 |
"HF Kernels Flash Attn3": "UVNOTE_FILE_HF_KERNELS_FLASH_ATTN3_BENCHMARK",
|
| 22 |
"SageAttention": "UVNOTE_FILE_SAGE_ATTENTION_BENCHMARK",
|
| 23 |
+
# "Flash Attn CUTE": "UVNOTE_FILE_FLASH_ATTN_CUTE_BENCHMARK",
|
| 24 |
}
|
| 25 |
|
| 26 |
# Generate combined results with visualization
|
flash_attn/results/combined_results.html
CHANGED
|
@@ -3874,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 3874 |
<div class="system-info">
|
| 3875 |
<div class="system-info-header">Generated on:</div>
|
| 3876 |
<div class="system-info-content">
|
| 3877 |
-
Linux x86_64 | Linux-6.12.
|
| 3878 |
</div>
|
| 3879 |
</div>
|
| 3880 |
|
|
@@ -3889,11 +3889,11 @@ body[data-tool="eraser"] .main-content {
|
|
| 3889 |
<rdf:RDF>
|
| 3890 |
<ns2:Work>
|
| 3891 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 3892 |
-
<dc:date>2025-
|
| 3893 |
<dc:format>image/svg+xml</dc:format>
|
| 3894 |
<dc:creator>
|
| 3895 |
<ns2:Agent>
|
| 3896 |
-
<dc:title>Matplotlib v3.10.
|
| 3897 |
</ns2:Agent>
|
| 3898 |
</dc:creator>
|
| 3899 |
</ns2:Work>
|
|
@@ -3999,96 +3999,96 @@ body[data-tool="eraser"] .main-content {
|
|
| 3999 |
<g id="matplotlib.axis_2">
|
| 4000 |
<g id="ytick_1">
|
| 4001 |
<g id="grid-y--2" class="grid grid-y">
|
| 4002 |
-
<path d="M 47.81
|
| 4003 |
</g>
|
| 4004 |
<g id="line2d_7">
|
| 4005 |
<defs>
|
| 4006 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4007 |
</defs>
|
| 4008 |
<g>
|
| 4009 |
-
<use ns4:href="#m0fca2865ba" x="47.81" y="
|
| 4010 |
</g>
|
| 4011 |
</g>
|
| 4012 |
<g id="text_7">
|
| 4013 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="
|
| 4014 |
</g>
|
| 4015 |
</g>
|
| 4016 |
<g id="ytick_2">
|
| 4017 |
<g id="grid-y--3" class="grid grid-y">
|
| 4018 |
-
<path d="M 47.81
|
| 4019 |
</g>
|
| 4020 |
<g id="line2d_8">
|
| 4021 |
<g>
|
| 4022 |
-
<use ns4:href="#m0fca2865ba" x="47.81" y="
|
| 4023 |
</g>
|
| 4024 |
</g>
|
| 4025 |
<g id="text_8">
|
| 4026 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="
|
| 4027 |
</g>
|
| 4028 |
</g>
|
| 4029 |
<g id="ytick_3">
|
| 4030 |
<g id="grid-y--4" class="grid grid-y">
|
| 4031 |
-
<path d="M 47.81
|
| 4032 |
</g>
|
| 4033 |
<g id="line2d_9">
|
| 4034 |
<g>
|
| 4035 |
-
<use ns4:href="#m0fca2865ba" x="47.81" y="
|
| 4036 |
</g>
|
| 4037 |
</g>
|
| 4038 |
<g id="text_9">
|
| 4039 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="
|
| 4040 |
</g>
|
| 4041 |
</g>
|
| 4042 |
<g id="ytick_4">
|
| 4043 |
<g id="grid-y--5" class="grid grid-y">
|
| 4044 |
-
<path d="M 47.81
|
| 4045 |
</g>
|
| 4046 |
<g id="line2d_10">
|
| 4047 |
<g>
|
| 4048 |
-
<use ns4:href="#m0fca2865ba" x="47.81" y="
|
| 4049 |
</g>
|
| 4050 |
</g>
|
| 4051 |
<g id="text_10">
|
| 4052 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="236.
|
| 4053 |
</g>
|
| 4054 |
</g>
|
| 4055 |
<g id="ytick_5">
|
| 4056 |
<g id="grid-y--6" class="grid grid-y">
|
| 4057 |
-
<path d="M 47.81
|
| 4058 |
</g>
|
| 4059 |
<g id="line2d_11">
|
| 4060 |
<g>
|
| 4061 |
-
<use ns4:href="#m0fca2865ba" x="47.81" y="
|
| 4062 |
</g>
|
| 4063 |
</g>
|
| 4064 |
<g id="text_11">
|
| 4065 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="
|
| 4066 |
</g>
|
| 4067 |
</g>
|
| 4068 |
<g id="ytick_6">
|
| 4069 |
<g id="grid-y--7" class="grid grid-y">
|
| 4070 |
-
<path d="M 47.81
|
| 4071 |
</g>
|
| 4072 |
<g id="line2d_12">
|
| 4073 |
<g>
|
| 4074 |
-
<use ns4:href="#m0fca2865ba" x="47.81" y="
|
| 4075 |
</g>
|
| 4076 |
</g>
|
| 4077 |
<g id="text_12">
|
| 4078 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="
|
| 4079 |
</g>
|
| 4080 |
</g>
|
| 4081 |
<g id="ytick_7">
|
| 4082 |
<g id="grid-y--8" class="grid grid-y">
|
| 4083 |
-
<path d="M 47.81
|
| 4084 |
</g>
|
| 4085 |
<g id="line2d_13">
|
| 4086 |
<g>
|
| 4087 |
-
<use ns4:href="#m0fca2865ba" x="47.81" y="
|
| 4088 |
</g>
|
| 4089 |
</g>
|
| 4090 |
<g id="text_13">
|
| 4091 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="
|
| 4092 |
</g>
|
| 4093 |
</g>
|
| 4094 |
<g id="label--y" class="ylabel">
|
|
@@ -4096,73 +4096,73 @@ body[data-tool="eraser"] .main-content {
|
|
| 4096 |
</g>
|
| 4097 |
</g>
|
| 4098 |
<g id="series--torch-flash-ma" class="series">
|
| 4099 |
-
<path d="M 83.607806
|
| 4100 |
<defs>
|
| 4101 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4102 |
</defs>
|
| 4103 |
<g clip-path="url(#p09feef2583)">
|
| 4104 |
-
<use ns4:href="#md7efaf3aec" x="83.607806" y="
|
| 4105 |
-
<use ns4:href="#md7efaf3aec" x="226.799032" y="
|
| 4106 |
-
<use ns4:href="#md7efaf3aec" x="369.990258" y="
|
| 4107 |
-
<use ns4:href="#md7efaf3aec" x="513.181484" y="313.
|
| 4108 |
-
<use ns4:href="#md7efaf3aec" x="656.37271" y="
|
| 4109 |
-
<use ns4:href="#md7efaf3aec" x="799.563935" y="
|
| 4110 |
</g>
|
| 4111 |
</g>
|
| 4112 |
<g id="series--torch-mem-eff" class="series">
|
| 4113 |
-
<path d="M 83.607806
|
| 4114 |
<defs>
|
| 4115 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4116 |
</defs>
|
| 4117 |
<g clip-path="url(#p09feef2583)">
|
| 4118 |
-
<use ns4:href="#m9b8c54d372" x="83.607806" y="
|
| 4119 |
-
<use ns4:href="#m9b8c54d372" x="226.799032" y="
|
| 4120 |
-
<use ns4:href="#m9b8c54d372" x="369.990258" y="
|
| 4121 |
-
<use ns4:href="#m9b8c54d372" x="513.181484" y="
|
| 4122 |
-
<use ns4:href="#m9b8c54d372" x="656.37271" y="
|
| 4123 |
<use ns4:href="#m9b8c54d372" x="799.563935" y="45.999414" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4124 |
</g>
|
| 4125 |
</g>
|
| 4126 |
<g id="series--xformers-meff" class="series">
|
| 4127 |
-
<path d="M 83.607806
|
| 4128 |
<defs>
|
| 4129 |
<path id="mc655281e0b" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #2ca02c" />
|
| 4130 |
</defs>
|
| 4131 |
<g clip-path="url(#p09feef2583)">
|
| 4132 |
-
<use ns4:href="#mc655281e0b" x="83.607806" y="
|
| 4133 |
-
<use ns4:href="#mc655281e0b" x="226.799032" y="
|
| 4134 |
-
<use ns4:href="#mc655281e0b" x="369.990258" y="
|
| 4135 |
-
<use ns4:href="#mc655281e0b" x="513.181484" y="
|
| 4136 |
-
<use ns4:href="#mc655281e0b" x="656.37271" y="
|
| 4137 |
-
<use ns4:href="#mc655281e0b" x="799.563935" y="
|
| 4138 |
</g>
|
| 4139 |
</g>
|
| 4140 |
<g id="series--hf-kernels-flash-attn" class="series">
|
| 4141 |
-
<path d="M 83.607806
|
| 4142 |
<defs>
|
| 4143 |
<path id="m61c8040d7e" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #d62728" />
|
| 4144 |
</defs>
|
| 4145 |
<g clip-path="url(#p09feef2583)">
|
| 4146 |
-
<use ns4:href="#m61c8040d7e" x="83.607806" y="
|
| 4147 |
-
<use ns4:href="#m61c8040d7e" x="226.799032" y="
|
| 4148 |
-
<use ns4:href="#m61c8040d7e" x="369.990258" y="390.
|
| 4149 |
-
<use ns4:href="#m61c8040d7e" x="513.181484" y="
|
| 4150 |
-
<use ns4:href="#m61c8040d7e" x="656.37271" y="
|
| 4151 |
-
<use ns4:href="#m61c8040d7e" x="799.563935" y="
|
| 4152 |
</g>
|
| 4153 |
</g>
|
| 4154 |
<g id="series--hf-kernels-flash-attn3" class="series">
|
| 4155 |
-
<path d="M 83.607806 428.387702 L 226.799032
|
| 4156 |
<defs>
|
| 4157 |
<path id="m7cd35be9cc" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #9467bd" />
|
| 4158 |
</defs>
|
| 4159 |
<g clip-path="url(#p09feef2583)">
|
| 4160 |
<use ns4:href="#m7cd35be9cc" x="83.607806" y="428.387702" style="fill: #9467bd; stroke: #9467bd" />
|
| 4161 |
-
<use ns4:href="#m7cd35be9cc" x="226.799032" y="
|
| 4162 |
-
<use ns4:href="#m7cd35be9cc" x="369.990258" y="
|
| 4163 |
-
<use ns4:href="#m7cd35be9cc" x="513.181484" y="
|
| 4164 |
-
<use ns4:href="#m7cd35be9cc" x="656.37271" y="
|
| 4165 |
-
<use ns4:href="#m7cd35be9cc" x="799.563935" y="
|
| 4166 |
</g>
|
| 4167 |
</g>
|
| 4168 |
<g id="patch_3">
|
|
@@ -4247,12 +4247,12 @@ body[data-tool="eraser"] .main-content {
|
|
| 4247 |
<span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
|
| 4248 |
<span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4249 |
</span> |
|
| 4250 |
-
Cell: combine | 4.
|
| 4251 |
| <button class="run-btn" onclick="runCell('combine')">▶ run</button>
|
| 4252 |
<button class="copy-btn" onclick="copyCell('combine')">Copy</button>
|
| 4253 |
<a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
|
| 4254 |
</div>
|
| 4255 |
-
<div id="code-combine" class="cell-code collapsed" data-lines="
|
| 4256 |
<div class="highlight-with-lines">
|
| 4257 |
<div class="line-numbers" id="lines-combine">
|
| 4258 |
<a class="line-number" data-cell="combine" data-line="1" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 1, true);">1</a>
|
|
@@ -4285,6 +4285,7 @@ Cell: combine | 4.53s
|
|
| 4285 |
<a class="line-number" data-cell="combine" data-line="28" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 28, true);">28</a>
|
| 4286 |
<a class="line-number" data-cell="combine" data-line="29" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 29, true);">29</a>
|
| 4287 |
<a class="line-number" data-cell="combine" data-line="30" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 30, true);">30</a>
|
|
|
|
| 4288 |
</div>
|
| 4289 |
<div class="code-wrap">
|
| 4290 |
<div class="highlight"><pre><span></span><span class="c1"># /// script</span>
|
|
@@ -4309,6 +4310,7 @@ Cell: combine | 4.53s
|
|
| 4309 |
<span class="s2">"HF Kernels Flash Attn"</span><span class="p">:</span> <span class="s2">"UVNOTE_FILE_HF_KERNELS_FLASH_ATTN_BENCHMARK"</span><span class="p">,</span>
|
| 4310 |
<span class="s2">"HF Kernels Flash Attn3"</span><span class="p">:</span> <span class="s2">"UVNOTE_FILE_HF_KERNELS_FLASH_ATTN3_BENCHMARK"</span><span class="p">,</span>
|
| 4311 |
<span class="s2">"SageAttention"</span><span class="p">:</span> <span class="s2">"UVNOTE_FILE_SAGE_ATTENTION_BENCHMARK"</span><span class="p">,</span>
|
|
|
|
| 4312 |
<span class="p">}</span>
|
| 4313 |
|
| 4314 |
<span class="c1"># Generate combined results with visualization</span>
|
|
@@ -4354,48 +4356,48 @@ Summary: 6 found, 0 skipped, 0 missing
|
|
| 4354 |
COMBINED BENCHMARK SUMMARY
|
| 4355 |
|
| 4356 |
impl wl p50(ms) ok
|
| 4357 |
-
hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.
|
| 4358 |
-
hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.
|
| 4359 |
-
hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.
|
| 4360 |
-
hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.
|
| 4361 |
-
hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.
|
| 4362 |
-
hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.
|
| 4363 |
-
hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.
|
| 4364 |
-
hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.
|
| 4365 |
-
hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.
|
| 4366 |
-
hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.
|
| 4367 |
-
hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.
|
| 4368 |
-
hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.
|
| 4369 |
sage_int8_fp16 cuda_attn_L128_bfloat16 FAIL False
|
| 4370 |
-
Error: module '
|
| 4371 |
sage_int8_fp16 cuda_attn_L256_bfloat16 FAIL False
|
| 4372 |
-
Error: module '
|
| 4373 |
sage_int8_fp16 cuda_attn_L320_bfloat16 FAIL False
|
| 4374 |
-
Error: module '
|
| 4375 |
sage_int8_fp16 cuda_attn_L384_bfloat16 FAIL False
|
| 4376 |
-
Error: module '
|
| 4377 |
sage_int8_fp16 cuda_attn_L448_bfloat16 FAIL False
|
| 4378 |
-
Error: module '
|
| 4379 |
sage_int8_fp16 cuda_attn_L512_bfloat16 FAIL False
|
| 4380 |
-
Error: module '
|
| 4381 |
-
torch_flash_ma cuda_attn_L128_bfloat16 1.
|
| 4382 |
-
torch_flash_ma cuda_attn_L256_bfloat16 1.
|
| 4383 |
-
torch_flash_ma cuda_attn_L320_bfloat16 1.
|
| 4384 |
-
torch_flash_ma cuda_attn_L384_bfloat16 1.
|
| 4385 |
-
torch_flash_ma cuda_attn_L448_bfloat16 1.
|
| 4386 |
-
torch_flash_ma cuda_attn_L512_bfloat16 1.
|
| 4387 |
-
torch_mem_eff cuda_attn_L128_bfloat16 1.
|
| 4388 |
-
torch_mem_eff cuda_attn_L256_bfloat16 1.
|
| 4389 |
-
torch_mem_eff cuda_attn_L320_bfloat16 1.
|
| 4390 |
-
torch_mem_eff cuda_attn_L384_bfloat16 2.
|
| 4391 |
-
torch_mem_eff cuda_attn_L448_bfloat16 2.
|
| 4392 |
-
torch_mem_eff cuda_attn_L512_bfloat16 2.
|
| 4393 |
-
xformers_meff cuda_attn_L128_bfloat16 0.
|
| 4394 |
-
xformers_meff cuda_attn_L256_bfloat16 1.
|
| 4395 |
-
xformers_meff cuda_attn_L320_bfloat16 1.
|
| 4396 |
-
xformers_meff cuda_attn_L384_bfloat16 1.
|
| 4397 |
-
xformers_meff cuda_attn_L448_bfloat16 1.
|
| 4398 |
-
xformers_meff cuda_attn_L512_bfloat16 1.
|
| 4399 |
|
| 4400 |
GENERATING COMBINED VISUALIZATION
|
| 4401 |
|
|
@@ -4419,7 +4421,7 @@ Implementations included:
|
|
| 4419 |
<div class="uv-install-logs" id="uv-logs-combine">
|
| 4420 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4421 |
<div class="uv-logs-content" style="display: none;">
|
| 4422 |
-
Installed 37 packages in
|
| 4423 |
</div>
|
| 4424 |
</div>
|
| 4425 |
<div class="cell-artifacts">
|
|
@@ -4432,11 +4434,11 @@ Installed 37 packages in 327ms
|
|
| 4432 |
<rdf:RDF>
|
| 4433 |
<ns2:Work>
|
| 4434 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 4435 |
-
<dc:date>2025-
|
| 4436 |
<dc:format>image/svg+xml</dc:format>
|
| 4437 |
<dc:creator>
|
| 4438 |
<ns2:Agent>
|
| 4439 |
-
<dc:title>Matplotlib v3.10.
|
| 4440 |
</ns2:Agent>
|
| 4441 |
</dc:creator>
|
| 4442 |
</ns2:Work>
|
|
@@ -4542,96 +4544,96 @@ Installed 37 packages in 327ms
|
|
| 4542 |
<g id="matplotlib.axis_2">
|
| 4543 |
<g id="ytick_1">
|
| 4544 |
<g id="grid-y--2" class="grid grid-y">
|
| 4545 |
-
<path d="M 47.81
|
| 4546 |
</g>
|
| 4547 |
<g id="line2d_7">
|
| 4548 |
<defs>
|
| 4549 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4550 |
</defs>
|
| 4551 |
<g>
|
| 4552 |
-
<use ns4:href="#m0fca2865ba" x="47.81" y="
|
| 4553 |
</g>
|
| 4554 |
</g>
|
| 4555 |
<g id="text_7">
|
| 4556 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="
|
| 4557 |
</g>
|
| 4558 |
</g>
|
| 4559 |
<g id="ytick_2">
|
| 4560 |
<g id="grid-y--3" class="grid grid-y">
|
| 4561 |
-
<path d="M 47.81
|
| 4562 |
</g>
|
| 4563 |
<g id="line2d_8">
|
| 4564 |
<g>
|
| 4565 |
-
<use ns4:href="#m0fca2865ba" x="47.81" y="
|
| 4566 |
</g>
|
| 4567 |
</g>
|
| 4568 |
<g id="text_8">
|
| 4569 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="
|
| 4570 |
</g>
|
| 4571 |
</g>
|
| 4572 |
<g id="ytick_3">
|
| 4573 |
<g id="grid-y--4" class="grid grid-y">
|
| 4574 |
-
<path d="M 47.81
|
| 4575 |
</g>
|
| 4576 |
<g id="line2d_9">
|
| 4577 |
<g>
|
| 4578 |
-
<use ns4:href="#m0fca2865ba" x="47.81" y="
|
| 4579 |
</g>
|
| 4580 |
</g>
|
| 4581 |
<g id="text_9">
|
| 4582 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="
|
| 4583 |
</g>
|
| 4584 |
</g>
|
| 4585 |
<g id="ytick_4">
|
| 4586 |
<g id="grid-y--5" class="grid grid-y">
|
| 4587 |
-
<path d="M 47.81
|
| 4588 |
</g>
|
| 4589 |
<g id="line2d_10">
|
| 4590 |
<g>
|
| 4591 |
-
<use ns4:href="#m0fca2865ba" x="47.81" y="
|
| 4592 |
</g>
|
| 4593 |
</g>
|
| 4594 |
<g id="text_10">
|
| 4595 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="236.
|
| 4596 |
</g>
|
| 4597 |
</g>
|
| 4598 |
<g id="ytick_5">
|
| 4599 |
<g id="grid-y--6" class="grid grid-y">
|
| 4600 |
-
<path d="M 47.81
|
| 4601 |
</g>
|
| 4602 |
<g id="line2d_11">
|
| 4603 |
<g>
|
| 4604 |
-
<use ns4:href="#m0fca2865ba" x="47.81" y="
|
| 4605 |
</g>
|
| 4606 |
</g>
|
| 4607 |
<g id="text_11">
|
| 4608 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="
|
| 4609 |
</g>
|
| 4610 |
</g>
|
| 4611 |
<g id="ytick_6">
|
| 4612 |
<g id="grid-y--7" class="grid grid-y">
|
| 4613 |
-
<path d="M 47.81
|
| 4614 |
</g>
|
| 4615 |
<g id="line2d_12">
|
| 4616 |
<g>
|
| 4617 |
-
<use ns4:href="#m0fca2865ba" x="47.81" y="
|
| 4618 |
</g>
|
| 4619 |
</g>
|
| 4620 |
<g id="text_12">
|
| 4621 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="
|
| 4622 |
</g>
|
| 4623 |
</g>
|
| 4624 |
<g id="ytick_7">
|
| 4625 |
<g id="grid-y--8" class="grid grid-y">
|
| 4626 |
-
<path d="M 47.81
|
| 4627 |
</g>
|
| 4628 |
<g id="line2d_13">
|
| 4629 |
<g>
|
| 4630 |
-
<use ns4:href="#m0fca2865ba" x="47.81" y="
|
| 4631 |
</g>
|
| 4632 |
</g>
|
| 4633 |
<g id="text_13">
|
| 4634 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="
|
| 4635 |
</g>
|
| 4636 |
</g>
|
| 4637 |
<g id="label--y" class="ylabel">
|
|
@@ -4639,73 +4641,73 @@ Installed 37 packages in 327ms
|
|
| 4639 |
</g>
|
| 4640 |
</g>
|
| 4641 |
<g id="series--torch-flash-ma" class="series">
|
| 4642 |
-
<path d="M 83.607806
|
| 4643 |
<defs>
|
| 4644 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4645 |
</defs>
|
| 4646 |
<g clip-path="url(#p09feef2583)">
|
| 4647 |
-
<use ns4:href="#md7efaf3aec" x="83.607806" y="
|
| 4648 |
-
<use ns4:href="#md7efaf3aec" x="226.799032" y="
|
| 4649 |
-
<use ns4:href="#md7efaf3aec" x="369.990258" y="
|
| 4650 |
-
<use ns4:href="#md7efaf3aec" x="513.181484" y="313.
|
| 4651 |
-
<use ns4:href="#md7efaf3aec" x="656.37271" y="
|
| 4652 |
-
<use ns4:href="#md7efaf3aec" x="799.563935" y="
|
| 4653 |
</g>
|
| 4654 |
</g>
|
| 4655 |
<g id="series--torch-mem-eff" class="series">
|
| 4656 |
-
<path d="M 83.607806
|
| 4657 |
<defs>
|
| 4658 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4659 |
</defs>
|
| 4660 |
<g clip-path="url(#p09feef2583)">
|
| 4661 |
-
<use ns4:href="#m9b8c54d372" x="83.607806" y="
|
| 4662 |
-
<use ns4:href="#m9b8c54d372" x="226.799032" y="
|
| 4663 |
-
<use ns4:href="#m9b8c54d372" x="369.990258" y="
|
| 4664 |
-
<use ns4:href="#m9b8c54d372" x="513.181484" y="
|
| 4665 |
-
<use ns4:href="#m9b8c54d372" x="656.37271" y="
|
| 4666 |
<use ns4:href="#m9b8c54d372" x="799.563935" y="45.999414" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4667 |
</g>
|
| 4668 |
</g>
|
| 4669 |
<g id="series--xformers-meff" class="series">
|
| 4670 |
-
<path d="M 83.607806
|
| 4671 |
<defs>
|
| 4672 |
<path id="mc655281e0b" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #2ca02c" />
|
| 4673 |
</defs>
|
| 4674 |
<g clip-path="url(#p09feef2583)">
|
| 4675 |
-
<use ns4:href="#mc655281e0b" x="83.607806" y="
|
| 4676 |
-
<use ns4:href="#mc655281e0b" x="226.799032" y="
|
| 4677 |
-
<use ns4:href="#mc655281e0b" x="369.990258" y="
|
| 4678 |
-
<use ns4:href="#mc655281e0b" x="513.181484" y="
|
| 4679 |
-
<use ns4:href="#mc655281e0b" x="656.37271" y="
|
| 4680 |
-
<use ns4:href="#mc655281e0b" x="799.563935" y="
|
| 4681 |
</g>
|
| 4682 |
</g>
|
| 4683 |
<g id="series--hf-kernels-flash-attn" class="series">
|
| 4684 |
-
<path d="M 83.607806
|
| 4685 |
<defs>
|
| 4686 |
<path id="m61c8040d7e" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #d62728" />
|
| 4687 |
</defs>
|
| 4688 |
<g clip-path="url(#p09feef2583)">
|
| 4689 |
-
<use ns4:href="#m61c8040d7e" x="83.607806" y="
|
| 4690 |
-
<use ns4:href="#m61c8040d7e" x="226.799032" y="
|
| 4691 |
-
<use ns4:href="#m61c8040d7e" x="369.990258" y="390.
|
| 4692 |
-
<use ns4:href="#m61c8040d7e" x="513.181484" y="
|
| 4693 |
-
<use ns4:href="#m61c8040d7e" x="656.37271" y="
|
| 4694 |
-
<use ns4:href="#m61c8040d7e" x="799.563935" y="
|
| 4695 |
</g>
|
| 4696 |
</g>
|
| 4697 |
<g id="series--hf-kernels-flash-attn3" class="series">
|
| 4698 |
-
<path d="M 83.607806 428.387702 L 226.799032
|
| 4699 |
<defs>
|
| 4700 |
<path id="m7cd35be9cc" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #9467bd" />
|
| 4701 |
</defs>
|
| 4702 |
<g clip-path="url(#p09feef2583)">
|
| 4703 |
<use ns4:href="#m7cd35be9cc" x="83.607806" y="428.387702" style="fill: #9467bd; stroke: #9467bd" />
|
| 4704 |
-
<use ns4:href="#m7cd35be9cc" x="226.799032" y="
|
| 4705 |
-
<use ns4:href="#m7cd35be9cc" x="369.990258" y="
|
| 4706 |
-
<use ns4:href="#m7cd35be9cc" x="513.181484" y="
|
| 4707 |
-
<use ns4:href="#m7cd35be9cc" x="656.37271" y="
|
| 4708 |
-
<use ns4:href="#m7cd35be9cc" x="799.563935" y="
|
| 4709 |
</g>
|
| 4710 |
</g>
|
| 4711 |
<g id="patch_3">
|
|
|
|
| 3874 |
<div class="system-info">
|
| 3875 |
<div class="system-info-header">Generated on:</div>
|
| 3876 |
<div class="system-info-content">
|
| 3877 |
+
Linux x86_64 | Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35
|
| 3878 |
</div>
|
| 3879 |
</div>
|
| 3880 |
|
|
|
|
| 3889 |
<rdf:RDF>
|
| 3890 |
<ns2:Work>
|
| 3891 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 3892 |
+
<dc:date>2025-12-19T19:09:55.297355</dc:date>
|
| 3893 |
<dc:format>image/svg+xml</dc:format>
|
| 3894 |
<dc:creator>
|
| 3895 |
<ns2:Agent>
|
| 3896 |
+
<dc:title>Matplotlib v3.10.8, https://matplotlib.org/</dc:title>
|
| 3897 |
</ns2:Agent>
|
| 3898 |
</dc:creator>
|
| 3899 |
</ns2:Work>
|
|
|
|
| 3999 |
<g id="matplotlib.axis_2">
|
| 4000 |
<g id="ytick_1">
|
| 4001 |
<g id="grid-y--2" class="grid grid-y">
|
| 4002 |
+
<path d="M 47.81 404.469232 L 835.361742 404.469232 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4003 |
</g>
|
| 4004 |
<g id="line2d_7">
|
| 4005 |
<defs>
|
| 4006 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4007 |
</defs>
|
| 4008 |
<g>
|
| 4009 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="404.469232" style="stroke: #000000; stroke-width: 0.8" />
|
| 4010 |
</g>
|
| 4011 |
</g>
|
| 4012 |
<g id="text_7">
|
| 4013 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="408.26845" transform="rotate(-0 40.81 408.26845)">1.0</text>
|
| 4014 |
</g>
|
| 4015 |
</g>
|
| 4016 |
<g id="ytick_2">
|
| 4017 |
<g id="grid-y--3" class="grid grid-y">
|
| 4018 |
+
<path d="M 47.81 347.147903 L 835.361742 347.147903 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4019 |
</g>
|
| 4020 |
<g id="line2d_8">
|
| 4021 |
<g>
|
| 4022 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="347.147903" style="stroke: #000000; stroke-width: 0.8" />
|
| 4023 |
</g>
|
| 4024 |
</g>
|
| 4025 |
<g id="text_8">
|
| 4026 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="350.947122" transform="rotate(-0 40.81 350.947122)">1.2</text>
|
| 4027 |
</g>
|
| 4028 |
</g>
|
| 4029 |
<g id="ytick_3">
|
| 4030 |
<g id="grid-y--4" class="grid grid-y">
|
| 4031 |
+
<path d="M 47.81 289.826575 L 835.361742 289.826575 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4032 |
</g>
|
| 4033 |
<g id="line2d_9">
|
| 4034 |
<g>
|
| 4035 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="289.826575" style="stroke: #000000; stroke-width: 0.8" />
|
| 4036 |
</g>
|
| 4037 |
</g>
|
| 4038 |
<g id="text_9">
|
| 4039 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="293.625794" transform="rotate(-0 40.81 293.625794)">1.4</text>
|
| 4040 |
</g>
|
| 4041 |
</g>
|
| 4042 |
<g id="ytick_4">
|
| 4043 |
<g id="grid-y--5" class="grid grid-y">
|
| 4044 |
+
<path d="M 47.81 232.505247 L 835.361742 232.505247 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4045 |
</g>
|
| 4046 |
<g id="line2d_10">
|
| 4047 |
<g>
|
| 4048 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="232.505247" style="stroke: #000000; stroke-width: 0.8" />
|
| 4049 |
</g>
|
| 4050 |
</g>
|
| 4051 |
<g id="text_10">
|
| 4052 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="236.304466" transform="rotate(-0 40.81 236.304466)">1.6</text>
|
| 4053 |
</g>
|
| 4054 |
</g>
|
| 4055 |
<g id="ytick_5">
|
| 4056 |
<g id="grid-y--6" class="grid grid-y">
|
| 4057 |
+
<path d="M 47.81 175.183919 L 835.361742 175.183919 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4058 |
</g>
|
| 4059 |
<g id="line2d_11">
|
| 4060 |
<g>
|
| 4061 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="175.183919" style="stroke: #000000; stroke-width: 0.8" />
|
| 4062 |
</g>
|
| 4063 |
</g>
|
| 4064 |
<g id="text_11">
|
| 4065 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="178.983137" transform="rotate(-0 40.81 178.983137)">1.8</text>
|
| 4066 |
</g>
|
| 4067 |
</g>
|
| 4068 |
<g id="ytick_6">
|
| 4069 |
<g id="grid-y--7" class="grid grid-y">
|
| 4070 |
+
<path d="M 47.81 117.86259 L 835.361742 117.86259 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4071 |
</g>
|
| 4072 |
<g id="line2d_12">
|
| 4073 |
<g>
|
| 4074 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="117.86259" style="stroke: #000000; stroke-width: 0.8" />
|
| 4075 |
</g>
|
| 4076 |
</g>
|
| 4077 |
<g id="text_12">
|
| 4078 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="121.661809" transform="rotate(-0 40.81 121.661809)">2.0</text>
|
| 4079 |
</g>
|
| 4080 |
</g>
|
| 4081 |
<g id="ytick_7">
|
| 4082 |
<g id="grid-y--8" class="grid grid-y">
|
| 4083 |
+
<path d="M 47.81 60.541262 L 835.361742 60.541262 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4084 |
</g>
|
| 4085 |
<g id="line2d_13">
|
| 4086 |
<g>
|
| 4087 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="60.541262" style="stroke: #000000; stroke-width: 0.8" />
|
| 4088 |
</g>
|
| 4089 |
</g>
|
| 4090 |
<g id="text_13">
|
| 4091 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="64.340481" transform="rotate(-0 40.81 64.340481)">2.2</text>
|
| 4092 |
</g>
|
| 4093 |
</g>
|
| 4094 |
<g id="label--y" class="ylabel">
|
|
|
|
| 4096 |
</g>
|
| 4097 |
</g>
|
| 4098 |
<g id="series--torch-flash-ma" class="series">
|
| 4099 |
+
<path d="M 83.607806 346.603064 L 226.799032 331.148661 L 369.990258 322.7508 L 513.181484 313.642154 L 656.37271 270.506995 L 799.563935 259.742049 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 4100 |
<defs>
|
| 4101 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4102 |
</defs>
|
| 4103 |
<g clip-path="url(#p09feef2583)">
|
| 4104 |
+
<use ns4:href="#md7efaf3aec" x="83.607806" y="346.603064" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4105 |
+
<use ns4:href="#md7efaf3aec" x="226.799032" y="331.148661" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4106 |
+
<use ns4:href="#md7efaf3aec" x="369.990258" y="322.7508" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4107 |
+
<use ns4:href="#md7efaf3aec" x="513.181484" y="313.642154" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4108 |
+
<use ns4:href="#md7efaf3aec" x="656.37271" y="270.506995" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4109 |
+
<use ns4:href="#md7efaf3aec" x="799.563935" y="259.742049" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4110 |
</g>
|
| 4111 |
</g>
|
| 4112 |
<g id="series--torch-mem-eff" class="series">
|
| 4113 |
+
<path d="M 83.607806 162.593002 L 226.799032 131.641491 L 369.990258 126.594348 L 513.181484 96.170767 L 656.37271 105.428161 L 799.563935 45.999414 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 4114 |
<defs>
|
| 4115 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4116 |
</defs>
|
| 4117 |
<g clip-path="url(#p09feef2583)">
|
| 4118 |
+
<use ns4:href="#m9b8c54d372" x="83.607806" y="162.593002" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4119 |
+
<use ns4:href="#m9b8c54d372" x="226.799032" y="131.641491" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4120 |
+
<use ns4:href="#m9b8c54d372" x="369.990258" y="126.594348" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4121 |
+
<use ns4:href="#m9b8c54d372" x="513.181484" y="96.170767" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4122 |
+
<use ns4:href="#m9b8c54d372" x="656.37271" y="105.428161" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4123 |
<use ns4:href="#m9b8c54d372" x="799.563935" y="45.999414" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4124 |
</g>
|
| 4125 |
</g>
|
| 4126 |
<g id="series--xformers-meff" class="series">
|
| 4127 |
+
<path d="M 83.607806 410.706939 L 226.799032 396.737158 L 369.990258 386.568354 L 513.181484 386.536541 L 656.37271 333.774551 L 799.563935 337.388661 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #2ca02c; stroke-width: 1.5; stroke-linecap: square" />
|
| 4128 |
<defs>
|
| 4129 |
<path id="mc655281e0b" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #2ca02c" />
|
| 4130 |
</defs>
|
| 4131 |
<g clip-path="url(#p09feef2583)">
|
| 4132 |
+
<use ns4:href="#mc655281e0b" x="83.607806" y="410.706939" style="fill: #2ca02c; stroke: #2ca02c" />
|
| 4133 |
+
<use ns4:href="#mc655281e0b" x="226.799032" y="396.737158" style="fill: #2ca02c; stroke: #2ca02c" />
|
| 4134 |
+
<use ns4:href="#mc655281e0b" x="369.990258" y="386.568354" style="fill: #2ca02c; stroke: #2ca02c" />
|
| 4135 |
+
<use ns4:href="#mc655281e0b" x="513.181484" y="386.536541" style="fill: #2ca02c; stroke: #2ca02c" />
|
| 4136 |
+
<use ns4:href="#mc655281e0b" x="656.37271" y="333.774551" style="fill: #2ca02c; stroke: #2ca02c" />
|
| 4137 |
+
<use ns4:href="#mc655281e0b" x="799.563935" y="337.388661" style="fill: #2ca02c; stroke: #2ca02c" />
|
| 4138 |
</g>
|
| 4139 |
</g>
|
| 4140 |
<g id="series--hf-kernels-flash-attn" class="series">
|
| 4141 |
+
<path d="M 83.607806 416.940633 L 226.799032 399.984697 L 369.990258 390.841946 L 513.181484 387.029791 L 656.37271 344.433452 L 799.563935 341.857145 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #d62728; stroke-width: 1.5; stroke-linecap: square" />
|
| 4142 |
<defs>
|
| 4143 |
<path id="m61c8040d7e" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #d62728" />
|
| 4144 |
</defs>
|
| 4145 |
<g clip-path="url(#p09feef2583)">
|
| 4146 |
+
<use ns4:href="#m61c8040d7e" x="83.607806" y="416.940633" style="fill: #d62728; stroke: #d62728" />
|
| 4147 |
+
<use ns4:href="#m61c8040d7e" x="226.799032" y="399.984697" style="fill: #d62728; stroke: #d62728" />
|
| 4148 |
+
<use ns4:href="#m61c8040d7e" x="369.990258" y="390.841946" style="fill: #d62728; stroke: #d62728" />
|
| 4149 |
+
<use ns4:href="#m61c8040d7e" x="513.181484" y="387.029791" style="fill: #d62728; stroke: #d62728" />
|
| 4150 |
+
<use ns4:href="#m61c8040d7e" x="656.37271" y="344.433452" style="fill: #d62728; stroke: #d62728" />
|
| 4151 |
+
<use ns4:href="#m61c8040d7e" x="799.563935" y="341.857145" style="fill: #d62728; stroke: #d62728" />
|
| 4152 |
</g>
|
| 4153 |
</g>
|
| 4154 |
<g id="series--hf-kernels-flash-attn3" class="series">
|
| 4155 |
+
<path d="M 83.607806 428.387702 L 226.799032 412.171498 L 369.990258 404.997448 L 513.181484 400.314295 L 656.37271 358.798463 L 799.563935 355.895138 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #9467bd; stroke-width: 1.5; stroke-linecap: square" />
|
| 4156 |
<defs>
|
| 4157 |
<path id="m7cd35be9cc" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #9467bd" />
|
| 4158 |
</defs>
|
| 4159 |
<g clip-path="url(#p09feef2583)">
|
| 4160 |
<use ns4:href="#m7cd35be9cc" x="83.607806" y="428.387702" style="fill: #9467bd; stroke: #9467bd" />
|
| 4161 |
+
<use ns4:href="#m7cd35be9cc" x="226.799032" y="412.171498" style="fill: #9467bd; stroke: #9467bd" />
|
| 4162 |
+
<use ns4:href="#m7cd35be9cc" x="369.990258" y="404.997448" style="fill: #9467bd; stroke: #9467bd" />
|
| 4163 |
+
<use ns4:href="#m7cd35be9cc" x="513.181484" y="400.314295" style="fill: #9467bd; stroke: #9467bd" />
|
| 4164 |
+
<use ns4:href="#m7cd35be9cc" x="656.37271" y="358.798463" style="fill: #9467bd; stroke: #9467bd" />
|
| 4165 |
+
<use ns4:href="#m7cd35be9cc" x="799.563935" y="355.895138" style="fill: #9467bd; stroke: #9467bd" />
|
| 4166 |
</g>
|
| 4167 |
</g>
|
| 4168 |
<g id="patch_3">
|
|
|
|
| 4247 |
<span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
|
| 4248 |
<span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4249 |
</span> |
|
| 4250 |
+
Cell: combine | 4.87s
|
| 4251 |
| <button class="run-btn" onclick="runCell('combine')">▶ run</button>
|
| 4252 |
<button class="copy-btn" onclick="copyCell('combine')">Copy</button>
|
| 4253 |
<a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
|
| 4254 |
</div>
|
| 4255 |
+
<div id="code-combine" class="cell-code collapsed" data-lines="31">
|
| 4256 |
<div class="highlight-with-lines">
|
| 4257 |
<div class="line-numbers" id="lines-combine">
|
| 4258 |
<a class="line-number" data-cell="combine" data-line="1" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 1, true);">1</a>
|
|
|
|
| 4285 |
<a class="line-number" data-cell="combine" data-line="28" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 28, true);">28</a>
|
| 4286 |
<a class="line-number" data-cell="combine" data-line="29" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 29, true);">29</a>
|
| 4287 |
<a class="line-number" data-cell="combine" data-line="30" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 30, true);">30</a>
|
| 4288 |
+
<a class="line-number" data-cell="combine" data-line="31" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 31, true);">31</a>
|
| 4289 |
</div>
|
| 4290 |
<div class="code-wrap">
|
| 4291 |
<div class="highlight"><pre><span></span><span class="c1"># /// script</span>
|
|
|
|
| 4310 |
<span class="s2">"HF Kernels Flash Attn"</span><span class="p">:</span> <span class="s2">"UVNOTE_FILE_HF_KERNELS_FLASH_ATTN_BENCHMARK"</span><span class="p">,</span>
|
| 4311 |
<span class="s2">"HF Kernels Flash Attn3"</span><span class="p">:</span> <span class="s2">"UVNOTE_FILE_HF_KERNELS_FLASH_ATTN3_BENCHMARK"</span><span class="p">,</span>
|
| 4312 |
<span class="s2">"SageAttention"</span><span class="p">:</span> <span class="s2">"UVNOTE_FILE_SAGE_ATTENTION_BENCHMARK"</span><span class="p">,</span>
|
| 4313 |
+
<span class="c1"># "Flash Attn CUTE": "UVNOTE_FILE_FLASH_ATTN_CUTE_BENCHMARK",</span>
|
| 4314 |
<span class="p">}</span>
|
| 4315 |
|
| 4316 |
<span class="c1"># Generate combined results with visualization</span>
|
|
|
|
| 4356 |
COMBINED BENCHMARK SUMMARY
|
| 4357 |
|
| 4358 |
impl wl p50(ms) ok
|
| 4359 |
+
hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.96 True
|
| 4360 |
+
hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.02 True
|
| 4361 |
+
hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.05 True
|
| 4362 |
+
hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.06 True
|
| 4363 |
+
hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.21 True
|
| 4364 |
+
hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.22 True
|
| 4365 |
+
hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.92 True
|
| 4366 |
+
hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.97 True
|
| 4367 |
+
hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.00 True
|
| 4368 |
+
hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.01 True
|
| 4369 |
+
hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.16 True
|
| 4370 |
+
hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.17 True
|
| 4371 |
sage_int8_fp16 cuda_attn_L128_bfloat16 FAIL False
|
| 4372 |
+
Error: module 'sage_attention_e8dcde4226fe38e6' has no attribute 'fwd'
|
| 4373 |
sage_int8_fp16 cuda_attn_L256_bfloat16 FAIL False
|
| 4374 |
+
Error: module 'sage_attention_e8dcde4226fe38e6' has no attribute 'fwd'
|
| 4375 |
sage_int8_fp16 cuda_attn_L320_bfloat16 FAIL False
|
| 4376 |
+
Error: module 'sage_attention_e8dcde4226fe38e6' has no attribute 'fwd'
|
| 4377 |
sage_int8_fp16 cuda_attn_L384_bfloat16 FAIL False
|
| 4378 |
+
Error: module 'sage_attention_e8dcde4226fe38e6' has no attribute 'fwd'
|
| 4379 |
sage_int8_fp16 cuda_attn_L448_bfloat16 FAIL False
|
| 4380 |
+
Error: module 'sage_attention_e8dcde4226fe38e6' has no attribute 'fwd'
|
| 4381 |
sage_int8_fp16 cuda_attn_L512_bfloat16 FAIL False
|
| 4382 |
+
Error: module 'sage_attention_e8dcde4226fe38e6' has no attribute 'fwd'
|
| 4383 |
+
torch_flash_ma cuda_attn_L128_bfloat16 1.20 True
|
| 4384 |
+
torch_flash_ma cuda_attn_L256_bfloat16 1.26 True
|
| 4385 |
+
torch_flash_ma cuda_attn_L320_bfloat16 1.29 True
|
| 4386 |
+
torch_flash_ma cuda_attn_L384_bfloat16 1.32 True
|
| 4387 |
+
torch_flash_ma cuda_attn_L448_bfloat16 1.47 True
|
| 4388 |
+
torch_flash_ma cuda_attn_L512_bfloat16 1.50 True
|
| 4389 |
+
torch_mem_eff cuda_attn_L128_bfloat16 1.84 True
|
| 4390 |
+
torch_mem_eff cuda_attn_L256_bfloat16 1.95 True
|
| 4391 |
+
torch_mem_eff cuda_attn_L320_bfloat16 1.97 True
|
| 4392 |
+
torch_mem_eff cuda_attn_L384_bfloat16 2.08 True
|
| 4393 |
+
torch_mem_eff cuda_attn_L448_bfloat16 2.04 True
|
| 4394 |
+
torch_mem_eff cuda_attn_L512_bfloat16 2.25 True
|
| 4395 |
+
xformers_meff cuda_attn_L128_bfloat16 0.98 True
|
| 4396 |
+
xformers_meff cuda_attn_L256_bfloat16 1.03 True
|
| 4397 |
+
xformers_meff cuda_attn_L320_bfloat16 1.06 True
|
| 4398 |
+
xformers_meff cuda_attn_L384_bfloat16 1.06 True
|
| 4399 |
+
xformers_meff cuda_attn_L448_bfloat16 1.25 True
|
| 4400 |
+
xformers_meff cuda_attn_L512_bfloat16 1.23 True
|
| 4401 |
|
| 4402 |
GENERATING COMBINED VISUALIZATION
|
| 4403 |
|
|
|
|
| 4421 |
<div class="uv-install-logs" id="uv-logs-combine">
|
| 4422 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4423 |
<div class="uv-logs-content" style="display: none;">
|
| 4424 |
+
Installed 37 packages in 315ms
|
| 4425 |
</div>
|
| 4426 |
</div>
|
| 4427 |
<div class="cell-artifacts">
|
|
|
|
| 4434 |
<rdf:RDF>
|
| 4435 |
<ns2:Work>
|
| 4436 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 4437 |
+
<dc:date>2025-12-19T19:09:55.297355</dc:date>
|
| 4438 |
<dc:format>image/svg+xml</dc:format>
|
| 4439 |
<dc:creator>
|
| 4440 |
<ns2:Agent>
|
| 4441 |
+
<dc:title>Matplotlib v3.10.8, https://matplotlib.org/</dc:title>
|
| 4442 |
</ns2:Agent>
|
| 4443 |
</dc:creator>
|
| 4444 |
</ns2:Work>
|
|
|
|
| 4544 |
<g id="matplotlib.axis_2">
|
| 4545 |
<g id="ytick_1">
|
| 4546 |
<g id="grid-y--2" class="grid grid-y">
|
| 4547 |
+
<path d="M 47.81 404.469232 L 835.361742 404.469232 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4548 |
</g>
|
| 4549 |
<g id="line2d_7">
|
| 4550 |
<defs>
|
| 4551 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4552 |
</defs>
|
| 4553 |
<g>
|
| 4554 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="404.469232" style="stroke: #000000; stroke-width: 0.8" />
|
| 4555 |
</g>
|
| 4556 |
</g>
|
| 4557 |
<g id="text_7">
|
| 4558 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="408.26845" transform="rotate(-0 40.81 408.26845)">1.0</text>
|
| 4559 |
</g>
|
| 4560 |
</g>
|
| 4561 |
<g id="ytick_2">
|
| 4562 |
<g id="grid-y--3" class="grid grid-y">
|
| 4563 |
+
<path d="M 47.81 347.147903 L 835.361742 347.147903 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4564 |
</g>
|
| 4565 |
<g id="line2d_8">
|
| 4566 |
<g>
|
| 4567 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="347.147903" style="stroke: #000000; stroke-width: 0.8" />
|
| 4568 |
</g>
|
| 4569 |
</g>
|
| 4570 |
<g id="text_8">
|
| 4571 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="350.947122" transform="rotate(-0 40.81 350.947122)">1.2</text>
|
| 4572 |
</g>
|
| 4573 |
</g>
|
| 4574 |
<g id="ytick_3">
|
| 4575 |
<g id="grid-y--4" class="grid grid-y">
|
| 4576 |
+
<path d="M 47.81 289.826575 L 835.361742 289.826575 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4577 |
</g>
|
| 4578 |
<g id="line2d_9">
|
| 4579 |
<g>
|
| 4580 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="289.826575" style="stroke: #000000; stroke-width: 0.8" />
|
| 4581 |
</g>
|
| 4582 |
</g>
|
| 4583 |
<g id="text_9">
|
| 4584 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="293.625794" transform="rotate(-0 40.81 293.625794)">1.4</text>
|
| 4585 |
</g>
|
| 4586 |
</g>
|
| 4587 |
<g id="ytick_4">
|
| 4588 |
<g id="grid-y--5" class="grid grid-y">
|
| 4589 |
+
<path d="M 47.81 232.505247 L 835.361742 232.505247 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4590 |
</g>
|
| 4591 |
<g id="line2d_10">
|
| 4592 |
<g>
|
| 4593 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="232.505247" style="stroke: #000000; stroke-width: 0.8" />
|
| 4594 |
</g>
|
| 4595 |
</g>
|
| 4596 |
<g id="text_10">
|
| 4597 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="236.304466" transform="rotate(-0 40.81 236.304466)">1.6</text>
|
| 4598 |
</g>
|
| 4599 |
</g>
|
| 4600 |
<g id="ytick_5">
|
| 4601 |
<g id="grid-y--6" class="grid grid-y">
|
| 4602 |
+
<path d="M 47.81 175.183919 L 835.361742 175.183919 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4603 |
</g>
|
| 4604 |
<g id="line2d_11">
|
| 4605 |
<g>
|
| 4606 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="175.183919" style="stroke: #000000; stroke-width: 0.8" />
|
| 4607 |
</g>
|
| 4608 |
</g>
|
| 4609 |
<g id="text_11">
|
| 4610 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="178.983137" transform="rotate(-0 40.81 178.983137)">1.8</text>
|
| 4611 |
</g>
|
| 4612 |
</g>
|
| 4613 |
<g id="ytick_6">
|
| 4614 |
<g id="grid-y--7" class="grid grid-y">
|
| 4615 |
+
<path d="M 47.81 117.86259 L 835.361742 117.86259 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4616 |
</g>
|
| 4617 |
<g id="line2d_12">
|
| 4618 |
<g>
|
| 4619 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="117.86259" style="stroke: #000000; stroke-width: 0.8" />
|
| 4620 |
</g>
|
| 4621 |
</g>
|
| 4622 |
<g id="text_12">
|
| 4623 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="121.661809" transform="rotate(-0 40.81 121.661809)">2.0</text>
|
| 4624 |
</g>
|
| 4625 |
</g>
|
| 4626 |
<g id="ytick_7">
|
| 4627 |
<g id="grid-y--8" class="grid grid-y">
|
| 4628 |
+
<path d="M 47.81 60.541262 L 835.361742 60.541262 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4629 |
</g>
|
| 4630 |
<g id="line2d_13">
|
| 4631 |
<g>
|
| 4632 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="60.541262" style="stroke: #000000; stroke-width: 0.8" />
|
| 4633 |
</g>
|
| 4634 |
</g>
|
| 4635 |
<g id="text_13">
|
| 4636 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="64.340481" transform="rotate(-0 40.81 64.340481)">2.2</text>
|
| 4637 |
</g>
|
| 4638 |
</g>
|
| 4639 |
<g id="label--y" class="ylabel">
|
|
|
|
| 4641 |
</g>
|
| 4642 |
</g>
|
| 4643 |
<g id="series--torch-flash-ma" class="series">
|
| 4644 |
+
<path d="M 83.607806 346.603064 L 226.799032 331.148661 L 369.990258 322.7508 L 513.181484 313.642154 L 656.37271 270.506995 L 799.563935 259.742049 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 4645 |
<defs>
|
| 4646 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4647 |
</defs>
|
| 4648 |
<g clip-path="url(#p09feef2583)">
|
| 4649 |
+
<use ns4:href="#md7efaf3aec" x="83.607806" y="346.603064" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4650 |
+
<use ns4:href="#md7efaf3aec" x="226.799032" y="331.148661" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4651 |
+
<use ns4:href="#md7efaf3aec" x="369.990258" y="322.7508" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4652 |
+
<use ns4:href="#md7efaf3aec" x="513.181484" y="313.642154" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4653 |
+
<use ns4:href="#md7efaf3aec" x="656.37271" y="270.506995" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4654 |
+
<use ns4:href="#md7efaf3aec" x="799.563935" y="259.742049" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4655 |
</g>
|
| 4656 |
</g>
|
| 4657 |
<g id="series--torch-mem-eff" class="series">
|
| 4658 |
+
<path d="M 83.607806 162.593002 L 226.799032 131.641491 L 369.990258 126.594348 L 513.181484 96.170767 L 656.37271 105.428161 L 799.563935 45.999414 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 4659 |
<defs>
|
| 4660 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4661 |
</defs>
|
| 4662 |
<g clip-path="url(#p09feef2583)">
|
| 4663 |
+
<use ns4:href="#m9b8c54d372" x="83.607806" y="162.593002" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4664 |
+
<use ns4:href="#m9b8c54d372" x="226.799032" y="131.641491" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4665 |
+
<use ns4:href="#m9b8c54d372" x="369.990258" y="126.594348" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4666 |
+
<use ns4:href="#m9b8c54d372" x="513.181484" y="96.170767" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4667 |
+
<use ns4:href="#m9b8c54d372" x="656.37271" y="105.428161" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4668 |
<use ns4:href="#m9b8c54d372" x="799.563935" y="45.999414" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4669 |
</g>
|
| 4670 |
</g>
|
| 4671 |
<g id="series--xformers-meff" class="series">
|
| 4672 |
+
<path d="M 83.607806 410.706939 L 226.799032 396.737158 L 369.990258 386.568354 L 513.181484 386.536541 L 656.37271 333.774551 L 799.563935 337.388661 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #2ca02c; stroke-width: 1.5; stroke-linecap: square" />
|
| 4673 |
<defs>
|
| 4674 |
<path id="mc655281e0b" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #2ca02c" />
|
| 4675 |
</defs>
|
| 4676 |
<g clip-path="url(#p09feef2583)">
|
| 4677 |
+
<use ns4:href="#mc655281e0b" x="83.607806" y="410.706939" style="fill: #2ca02c; stroke: #2ca02c" />
|
| 4678 |
+
<use ns4:href="#mc655281e0b" x="226.799032" y="396.737158" style="fill: #2ca02c; stroke: #2ca02c" />
|
| 4679 |
+
<use ns4:href="#mc655281e0b" x="369.990258" y="386.568354" style="fill: #2ca02c; stroke: #2ca02c" />
|
| 4680 |
+
<use ns4:href="#mc655281e0b" x="513.181484" y="386.536541" style="fill: #2ca02c; stroke: #2ca02c" />
|
| 4681 |
+
<use ns4:href="#mc655281e0b" x="656.37271" y="333.774551" style="fill: #2ca02c; stroke: #2ca02c" />
|
| 4682 |
+
<use ns4:href="#mc655281e0b" x="799.563935" y="337.388661" style="fill: #2ca02c; stroke: #2ca02c" />
|
| 4683 |
</g>
|
| 4684 |
</g>
|
| 4685 |
<g id="series--hf-kernels-flash-attn" class="series">
|
| 4686 |
+
<path d="M 83.607806 416.940633 L 226.799032 399.984697 L 369.990258 390.841946 L 513.181484 387.029791 L 656.37271 344.433452 L 799.563935 341.857145 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #d62728; stroke-width: 1.5; stroke-linecap: square" />
|
| 4687 |
<defs>
|
| 4688 |
<path id="m61c8040d7e" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #d62728" />
|
| 4689 |
</defs>
|
| 4690 |
<g clip-path="url(#p09feef2583)">
|
| 4691 |
+
<use ns4:href="#m61c8040d7e" x="83.607806" y="416.940633" style="fill: #d62728; stroke: #d62728" />
|
| 4692 |
+
<use ns4:href="#m61c8040d7e" x="226.799032" y="399.984697" style="fill: #d62728; stroke: #d62728" />
|
| 4693 |
+
<use ns4:href="#m61c8040d7e" x="369.990258" y="390.841946" style="fill: #d62728; stroke: #d62728" />
|
| 4694 |
+
<use ns4:href="#m61c8040d7e" x="513.181484" y="387.029791" style="fill: #d62728; stroke: #d62728" />
|
| 4695 |
+
<use ns4:href="#m61c8040d7e" x="656.37271" y="344.433452" style="fill: #d62728; stroke: #d62728" />
|
| 4696 |
+
<use ns4:href="#m61c8040d7e" x="799.563935" y="341.857145" style="fill: #d62728; stroke: #d62728" />
|
| 4697 |
</g>
|
| 4698 |
</g>
|
| 4699 |
<g id="series--hf-kernels-flash-attn3" class="series">
|
| 4700 |
+
<path d="M 83.607806 428.387702 L 226.799032 412.171498 L 369.990258 404.997448 L 513.181484 400.314295 L 656.37271 358.798463 L 799.563935 355.895138 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #9467bd; stroke-width: 1.5; stroke-linecap: square" />
|
| 4701 |
<defs>
|
| 4702 |
<path id="m7cd35be9cc" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #9467bd" />
|
| 4703 |
</defs>
|
| 4704 |
<g clip-path="url(#p09feef2583)">
|
| 4705 |
<use ns4:href="#m7cd35be9cc" x="83.607806" y="428.387702" style="fill: #9467bd; stroke: #9467bd" />
|
| 4706 |
+
<use ns4:href="#m7cd35be9cc" x="226.799032" y="412.171498" style="fill: #9467bd; stroke: #9467bd" />
|
| 4707 |
+
<use ns4:href="#m7cd35be9cc" x="369.990258" y="404.997448" style="fill: #9467bd; stroke: #9467bd" />
|
| 4708 |
+
<use ns4:href="#m7cd35be9cc" x="513.181484" y="400.314295" style="fill: #9467bd; stroke: #9467bd" />
|
| 4709 |
+
<use ns4:href="#m7cd35be9cc" x="656.37271" y="358.798463" style="fill: #9467bd; stroke: #9467bd" />
|
| 4710 |
+
<use ns4:href="#m7cd35be9cc" x="799.563935" y="355.895138" style="fill: #9467bd; stroke: #9467bd" />
|
| 4711 |
</g>
|
| 4712 |
</g>
|
| 4713 |
<g id="patch_3">
|
index.html
CHANGED
|
@@ -3874,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 3874 |
<div class="system-info">
|
| 3875 |
<div class="system-info-header">Generated on:</div>
|
| 3876 |
<div class="system-info-content">
|
| 3877 |
-
Linux x86_64 | Linux-6.12.
|
| 3878 |
</div>
|
| 3879 |
</div>
|
| 3880 |
|
|
|
|
| 3874 |
<div class="system-info">
|
| 3875 |
<div class="system-info-header">Generated on:</div>
|
| 3876 |
<div class="system-info-content">
|
| 3877 |
+
Linux x86_64 | Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35
|
| 3878 |
</div>
|
| 3879 |
</div>
|
| 3880 |
|
layer_norm/impls/artifacts/benchmark/layer_norm.jsonl
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
{"ts": "2025-
|
| 2 |
-
{"ts": "2025-
|
| 3 |
-
{"ts": "2025-
|
| 4 |
-
{"ts": "2025-
|
|
|
|
| 1 |
+
{"ts": "2025-12-19T18:57:07Z", "run": "8a911691677c4be4b2377923d73cef2c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D4096", "batch": 16, "seq_len": 2048, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.8283059999598663, "p50": 0.8335360000160108, "p90": 0.8356760000083341, "mean": 0.8340919999909602, "iqr": 0.0024800000346658635, "raw_times": [0.8356760000083341, 0.8397459999969215, 0.8331959999736682, 0.8335360000160108, 0.8283059999598663], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.8370360000071742, "peak_bytes": 2415935488, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null}
|
| 2 |
+
{"ts": "2025-12-19T18:57:07Z", "run": "8a911691677c4be4b2377923d73cef2c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D8192", "batch": 16, "seq_len": 2048, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.6452309999976933, "p50": 1.6598209999756364, "p90": 1.6613920000168036, "mean": 1.6622576000031586, "iqr": 0.0022199999989425123, "raw_times": [1.6591720000178611, 1.6613920000168036, 1.6598209999756364, 1.6856720000077985, 1.6452309999976933], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.654771999994864, "peak_bytes": 4831870976, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1086463928222656e-05, "ref": "layer_norm_ref"}, "err": null}
|
| 3 |
+
{"ts": "2025-12-19T18:57:07Z", "run": "8a911691677c4be4b2377923d73cef2c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D4096", "batch": 16, "seq_len": 4096, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.6495710000299368, "p50": 1.651621000007708, "p90": 1.6563920000294274, "mean": 1.6539776000172424, "iqr": 0.0065000000404324965, "raw_times": [1.649891999988995, 1.6624120000301446, 1.6563920000294274, 1.6495710000299368, 1.651621000007708], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.6589109999927132, "peak_bytes": 4831854592, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null}
|
| 4 |
+
{"ts": "2025-12-19T18:57:07Z", "run": "8a911691677c4be4b2377923d73cef2c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D8192", "batch": 16, "seq_len": 4096, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 3.243421000036051, "p50": 3.2525119999604613, "p90": 3.2605619999799274, "mean": 3.252856000005977, "iqr": 0.017038999942542432, "raw_times": [3.2525119999604613, 3.2642620000160605, 3.2605619999799274, 3.243421000036051, 3.243523000037385], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.250041000001147, "peak_bytes": 9663709184, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1026859283447266e-05, "ref": "layer_norm_ref"}, "err": null}
|
layer_norm/impls/hf_kernels_layer_norm.html
CHANGED
|
@@ -3874,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 3874 |
<div class="system-info">
|
| 3875 |
<div class="system-info-header">Generated on:</div>
|
| 3876 |
<div class="system-info-content">
|
| 3877 |
-
Linux x86_64 | Linux-6.12.
|
| 3878 |
</div>
|
| 3879 |
</div>
|
| 3880 |
|
|
@@ -3889,7 +3889,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 3889 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3890 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3891 |
</span> |
|
| 3892 |
-
Cell: benchmark | 6.
|
| 3893 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3894 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3895 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3961,19 +3961,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S2048_D4096
|
|
| 3961 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3962 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3963 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3964 |
-
hf_kernels_layer_norm 4.
|
| 3965 |
-
_layer_norm_f8ec252::dropout_add_ln_fwd 1.
|
| 3966 |
-
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3967 |
-
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3968 |
-
Activity Buffer Request
|
| 3969 |
-
aten::view 0.51% 21.
|
| 3970 |
-
aten::empty 1.
|
| 3971 |
-
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.22% 9.
|
| 3972 |
-
cudaLaunchKernel 1.
|
| 3973 |
-
cudaDeviceSynchronize
|
| 3974 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3975 |
-
Self CPU time total: 4.
|
| 3976 |
-
Self CUDA time total: 2.
|
| 3977 |
|
| 3978 |
|
| 3979 |
|
|
@@ -3983,19 +3983,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S2048_D8192
|
|
| 3983 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3984 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3985 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3986 |
-
hf_kernels_layer_norm 2.
|
| 3987 |
-
_layer_norm_f8ec252::dropout_add_ln_fwd 0.
|
| 3988 |
-
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.
|
| 3989 |
-
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.
|
| 3990 |
-
Activity Buffer Request
|
| 3991 |
-
aten::view 0.
|
| 3992 |
-
aten::empty 0.
|
| 3993 |
-
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.07% 4.
|
| 3994 |
-
cudaLaunchKernel 0.
|
| 3995 |
-
cudaDeviceSynchronize
|
| 3996 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3997 |
-
Self CPU time total: 6.
|
| 3998 |
-
Self CUDA time total: 4.
|
| 3999 |
|
| 4000 |
|
| 4001 |
|
|
@@ -4005,19 +4005,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S4096_D4096
|
|
| 4005 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4006 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4007 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4008 |
-
hf_kernels_layer_norm
|
| 4009 |
-
_layer_norm_f8ec252::dropout_add_ln_fwd 0.
|
| 4010 |
-
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.
|
| 4011 |
-
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.
|
| 4012 |
-
Activity Buffer Request
|
| 4013 |
-
aten::view 0.
|
| 4014 |
-
aten::empty 0.
|
| 4015 |
-
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.
|
| 4016 |
-
cudaLaunchKernel 0.
|
| 4017 |
-
cudaDeviceSynchronize
|
| 4018 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4019 |
-
Self CPU time total: 6.
|
| 4020 |
-
Self CUDA time total: 4.
|
| 4021 |
|
| 4022 |
|
| 4023 |
|
|
@@ -4027,36 +4027,36 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S4096_D8192
|
|
| 4027 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4028 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4029 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4030 |
-
hf_kernels_layer_norm 1.
|
| 4031 |
-
_layer_norm_f8ec252::dropout_add_ln_fwd 0.38% 44.
|
| 4032 |
-
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 9.
|
| 4033 |
-
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 9.
|
| 4034 |
-
Activity Buffer Request
|
| 4035 |
-
aten::view 0.10%
|
| 4036 |
-
aten::empty 0.25% 29.
|
| 4037 |
-
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.04% 4.
|
| 4038 |
-
cudaLaunchKernel
|
| 4039 |
-
cudaDeviceSynchronize
|
| 4040 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4041 |
-
Self CPU time total: 11.
|
| 4042 |
-
Self CUDA time total: 9.
|
| 4043 |
|
| 4044 |
|
| 4045 |
impl wl p50(ms) ok
|
| 4046 |
hf_kernels_layer_norm LN_B16_S2048_D4096 0.83 True
|
| 4047 |
-
hf_kernels_layer_norm LN_B16_S2048_D8192 1.
|
| 4048 |
hf_kernels_layer_norm LN_B16_S4096_D4096 1.65 True
|
| 4049 |
hf_kernels_layer_norm LN_B16_S4096_D8192 3.25 True
|
| 4050 |
</pre></div>
|
| 4051 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 4052 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4053 |
<div class="uv-logs-content" style="display: none;">
|
| 4054 |
-
Installed
|
| 4055 |
</div>
|
| 4056 |
</div>
|
| 4057 |
<div class="cell-stderr">Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s]
|
| 4058 |
-
Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.
|
| 4059 |
-
Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.
|
| 4060 |
<div class="cell-artifacts">
|
| 4061 |
<h4>Artifacts:</h4>
|
| 4062 |
<a href="artifacts/benchmark/layer_norm.jsonl" class="artifact" target="_blank">layer_norm.jsonl</a>
|
|
|
|
| 3874 |
<div class="system-info">
|
| 3875 |
<div class="system-info-header">Generated on:</div>
|
| 3876 |
<div class="system-info-content">
|
| 3877 |
+
Linux x86_64 | Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35
|
| 3878 |
</div>
|
| 3879 |
</div>
|
| 3880 |
|
|
|
|
| 3889 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3890 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3891 |
</span> |
|
| 3892 |
+
Cell: benchmark | 6.26s
|
| 3893 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3894 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3895 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3961 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3962 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3963 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3964 |
+
hf_kernels_layer_norm 4.17% 177.304us 48.13% 2.048ms 2.048ms 0.000us 0.00% 3.167ms 3.167ms 1
|
| 3965 |
+
_layer_norm_f8ec252::dropout_add_ln_fwd 1.47% 62.693us 43.45% 1.849ms 616.229us 2.429ms 100.00% 3.167ms 1.056ms 3
|
| 3966 |
+
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.430ms 100.06% 2.430ms 2.430ms 1
|
| 3967 |
+
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 2.429ms 100.00% 2.429ms 809.553us 3
|
| 3968 |
+
Activity Buffer Request 39.70% 1.689ms 39.70% 1.689ms 1.689ms 738.629us 30.41% 738.629us 738.629us 1
|
| 3969 |
+
aten::view 0.51% 21.739us 0.51% 21.739us 3.623us 0.000us 0.00% 0.000us 0.000us 6
|
| 3970 |
+
aten::empty 1.04% 44.400us 1.04% 44.400us 4.933us 0.000us 0.00% 0.000us 0.000us 9
|
| 3971 |
+
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.22% 9.310us 0.22% 9.310us 3.103us 0.000us 0.00% 0.000us 0.000us 3
|
| 3972 |
+
cudaLaunchKernel 1.01% 43.131us 1.01% 43.131us 14.377us 0.000us 0.00% 0.000us 0.000us 3
|
| 3973 |
+
cudaDeviceSynchronize 51.87% 2.207ms 51.87% 2.207ms 2.207ms 0.000us 0.00% 0.000us 0.000us 1
|
| 3974 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3975 |
+
Self CPU time total: 4.255ms
|
| 3976 |
+
Self CUDA time total: 2.429ms
|
| 3977 |
|
| 3978 |
|
| 3979 |
|
|
|
|
| 3983 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3984 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3985 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3986 |
+
hf_kernels_layer_norm 2.14% 140.133us 29.32% 1.923ms 1.923ms 0.000us 0.00% 6.388ms 6.388ms 1
|
| 3987 |
+
_layer_norm_f8ec252::dropout_add_ln_fwd 0.69% 45.053us 27.01% 1.772ms 590.648us 4.807ms 100.00% 6.388ms 2.129ms 3
|
| 3988 |
+
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.808ms 100.03% 4.808ms 4.808ms 1
|
| 3989 |
+
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.807ms 100.00% 4.807ms 1.602ms 3
|
| 3990 |
+
Activity Buffer Request 25.34% 1.663ms 25.34% 1.663ms 1.663ms 1.581ms 32.89% 1.581ms 1.581ms 1
|
| 3991 |
+
aten::view 0.17% 11.390us 0.17% 11.390us 1.898us 0.000us 0.00% 0.000us 0.000us 6
|
| 3992 |
+
aten::empty 0.45% 29.620us 0.45% 29.620us 3.291us 0.000us 0.00% 0.000us 0.000us 9
|
| 3993 |
+
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.07% 4.820us 0.07% 4.820us 1.607us 0.000us 0.00% 0.000us 0.000us 3
|
| 3994 |
+
cudaLaunchKernel 0.46% 29.860us 0.46% 29.860us 9.953us 0.000us 0.00% 0.000us 0.000us 3
|
| 3995 |
+
cudaDeviceSynchronize 70.68% 4.637ms 70.68% 4.637ms 4.637ms 0.000us 0.00% 0.000us 0.000us 1
|
| 3996 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3997 |
+
Self CPU time total: 6.560ms
|
| 3998 |
+
Self CUDA time total: 4.807ms
|
| 3999 |
|
| 4000 |
|
| 4001 |
|
|
|
|
| 4005 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4006 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4007 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4008 |
+
hf_kernels_layer_norm 1.98% 129.253us 29.33% 1.919ms 1.919ms 0.000us 0.00% 6.330ms 6.330ms 1
|
| 4009 |
+
_layer_norm_f8ec252::dropout_add_ln_fwd 0.71% 46.780us 27.18% 1.779ms 592.854us 4.774ms 100.00% 6.330ms 2.110ms 3
|
| 4010 |
+
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.775ms 100.03% 4.775ms 4.775ms 1
|
| 4011 |
+
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.774ms 100.00% 4.774ms 1.591ms 3
|
| 4012 |
+
Activity Buffer Request 25.49% 1.668ms 25.49% 1.668ms 1.668ms 1.556ms 32.59% 1.556ms 1.556ms 1
|
| 4013 |
+
aten::view 0.17% 11.271us 0.17% 11.271us 1.879us 0.000us 0.00% 0.000us 0.000us 6
|
| 4014 |
+
aten::empty 0.45% 29.221us 0.45% 29.221us 3.247us 0.000us 0.00% 0.000us 0.000us 9
|
| 4015 |
+
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.08% 4.980us 0.08% 4.980us 1.660us 0.000us 0.00% 0.000us 0.000us 3
|
| 4016 |
+
cudaLaunchKernel 0.45% 29.470us 0.45% 29.470us 9.823us 0.000us 0.00% 0.000us 0.000us 3
|
| 4017 |
+
cudaDeviceSynchronize 70.67% 4.624ms 70.67% 4.624ms 4.624ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4018 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4019 |
+
Self CPU time total: 6.543ms
|
| 4020 |
+
Self CUDA time total: 4.774ms
|
| 4021 |
|
| 4022 |
|
| 4023 |
|
|
|
|
| 4027 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4028 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4029 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4030 |
+
hf_kernels_layer_norm 1.22% 142.314us 18.53% 2.155ms 2.155ms 0.000us 0.00% 12.836ms 12.836ms 1
|
| 4031 |
+
_layer_norm_f8ec252::dropout_add_ln_fwd 0.38% 44.492us 17.20% 2.000ms 666.802us 9.636ms 100.00% 12.836ms 4.279ms 3
|
| 4032 |
+
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 9.637ms 100.02% 9.637ms 9.637ms 1
|
| 4033 |
+
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 9.636ms 100.00% 9.636ms 3.212ms 3
|
| 4034 |
+
Activity Buffer Request 14.57% 1.694ms 14.57% 1.694ms 1.694ms 3.200ms 33.21% 3.200ms 3.200ms 1
|
| 4035 |
+
aten::view 0.10% 12.130us 0.10% 12.130us 2.022us 0.000us 0.00% 0.000us 0.000us 6
|
| 4036 |
+
aten::empty 0.25% 29.499us 0.25% 29.499us 3.278us 0.000us 0.00% 0.000us 0.000us 9
|
| 4037 |
+
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.04% 4.820us 0.04% 4.820us 1.607us 0.000us 0.00% 0.000us 0.000us 3
|
| 4038 |
+
cudaLaunchKernel 1.96% 227.814us 1.96% 227.814us 75.938us 0.000us 0.00% 0.000us 0.000us 3
|
| 4039 |
+
cudaDeviceSynchronize 81.47% 9.472ms 81.47% 9.472ms 9.472ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4040 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4041 |
+
Self CPU time total: 11.627ms
|
| 4042 |
+
Self CUDA time total: 9.636ms
|
| 4043 |
|
| 4044 |
|
| 4045 |
impl wl p50(ms) ok
|
| 4046 |
hf_kernels_layer_norm LN_B16_S2048_D4096 0.83 True
|
| 4047 |
+
hf_kernels_layer_norm LN_B16_S2048_D8192 1.66 True
|
| 4048 |
hf_kernels_layer_norm LN_B16_S4096_D4096 1.65 True
|
| 4049 |
hf_kernels_layer_norm LN_B16_S4096_D8192 3.25 True
|
| 4050 |
</pre></div>
|
| 4051 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 4052 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4053 |
<div class="uv-logs-content" style="display: none;">
|
| 4054 |
+
Installed 14 packages in 12ms
|
| 4055 |
</div>
|
| 4056 |
</div>
|
| 4057 |
<div class="cell-stderr">Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s]
|
| 4058 |
+
Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.35it/s]
|
| 4059 |
+
Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.71it/s]</div>
|
| 4060 |
<div class="cell-artifacts">
|
| 4061 |
<h4>Artifacts:</h4>
|
| 4062 |
<a href="artifacts/benchmark/layer_norm.jsonl" class="artifact" target="_blank">layer_norm.jsonl</a>
|
layer_norm/impls/torch_layer_norm.html
CHANGED
|
@@ -3874,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 3874 |
<div class="system-info">
|
| 3875 |
<div class="system-info-header">Generated on:</div>
|
| 3876 |
<div class="system-info-content">
|
| 3877 |
-
Linux x86_64 | Linux-6.12.
|
| 3878 |
</div>
|
| 3879 |
</div>
|
| 3880 |
|
|
@@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 3888 |
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3889 |
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3890 |
</span> |
|
| 3891 |
-
Cell: nv | 0.
|
| 3892 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3893 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3894 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3904,16 +3904,16 @@ Cell: nv | 0.22s
|
|
| 3904 |
</div>
|
| 3905 |
</div>
|
| 3906 |
<div id="output-nv" class="cell-output">
|
| 3907 |
-
<div class="cell-stdout"><pre class="stdout-text">
|
| 3908 |
+-----------------------------------------------------------------------------------------+
|
| 3909 |
-
| NVIDIA-SMI 580.
|
| 3910 |
+-----------------------------------------+------------------------+----------------------+
|
| 3911 |
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
|
| 3912 |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
|
| 3913 |
| | | MIG M. |
|
| 3914 |
|=========================================+========================+======================|
|
| 3915 |
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
|
| 3916 |
-
| N/A
|
| 3917 |
| | | N/A |
|
| 3918 |
+-----------------------------------------+------------------------+----------------------+
|
| 3919 |
|
|
@@ -3937,7 +3937,7 @@ Cell: nv | 0.22s
|
|
| 3937 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3938 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3939 |
</span> |
|
| 3940 |
-
Cell: benchmark | 7.
|
| 3941 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3942 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3943 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3985,19 +3985,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S2048_D4096
|
|
| 3985 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3986 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3987 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3988 |
-
torch_layer_norm 3.
|
| 3989 |
-
aten::layer_norm 0.35% 14.
|
| 3990 |
-
aten::native_layer_norm 1.
|
| 3991 |
-
torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3992 |
-
void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3993 |
-
Activity Buffer Request
|
| 3994 |
-
aten::empty 1.11%
|
| 3995 |
-
cudaLaunchKernel 1.
|
| 3996 |
-
aten::view 0.
|
| 3997 |
-
cudaDeviceSynchronize
|
| 3998 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3999 |
-
Self CPU time total: 4.
|
| 4000 |
-
Self CUDA time total: 2.
|
| 4001 |
|
| 4002 |
|
| 4003 |
|
|
@@ -4007,19 +4007,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S2048_D8192
|
|
| 4007 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4008 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4009 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4010 |
-
torch_layer_norm 1.
|
| 4011 |
-
aten::layer_norm 0.
|
| 4012 |
-
aten::native_layer_norm 0.
|
| 4013 |
-
torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.
|
| 4014 |
-
void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 4.
|
| 4015 |
-
Activity Buffer Request
|
| 4016 |
-
aten::empty 0.
|
| 4017 |
-
cudaLaunchKernel 0.
|
| 4018 |
-
aten::view 0.06% 3.
|
| 4019 |
-
cudaDeviceSynchronize
|
| 4020 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4021 |
-
Self CPU time total: 6.
|
| 4022 |
-
Self CUDA time total: 4.
|
| 4023 |
|
| 4024 |
|
| 4025 |
|
|
@@ -4029,19 +4029,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S4096_D4096
|
|
| 4029 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4030 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4031 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4032 |
-
torch_layer_norm 1.
|
| 4033 |
-
aten::layer_norm 0.13% 8.
|
| 4034 |
-
aten::native_layer_norm 0.
|
| 4035 |
-
torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.
|
| 4036 |
-
void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 4.
|
| 4037 |
-
Activity Buffer Request 27.
|
| 4038 |
-
aten::empty 0.
|
| 4039 |
-
cudaLaunchKernel 0.
|
| 4040 |
-
aten::view 0.06%
|
| 4041 |
-
cudaDeviceSynchronize
|
| 4042 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4043 |
-
Self CPU time total: 6.
|
| 4044 |
-
Self CUDA time total: 4.
|
| 4045 |
|
| 4046 |
|
| 4047 |
|
|
@@ -4051,23 +4051,23 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S4096_D8192
|
|
| 4051 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4052 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4053 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4054 |
-
torch_layer_norm 0.
|
| 4055 |
-
aten::layer_norm 0.08% 8.
|
| 4056 |
-
aten::native_layer_norm 0.
|
| 4057 |
-
torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 9.
|
| 4058 |
-
void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 9.
|
| 4059 |
-
Activity Buffer Request 11.
|
| 4060 |
-
aten::empty 0.
|
| 4061 |
-
cudaLaunchKernel 2.
|
| 4062 |
-
aten::view 0.
|
| 4063 |
-
cudaDeviceSynchronize 85.
|
| 4064 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4065 |
-
Self CPU time total: 11.
|
| 4066 |
-
Self CUDA time total: 9.
|
| 4067 |
|
| 4068 |
|
| 4069 |
impl wl p50(ms) ok
|
| 4070 |
-
torch_layer_norm LN_B16_S2048_D4096 0.
|
| 4071 |
torch_layer_norm LN_B16_S2048_D8192 1.68 True
|
| 4072 |
torch_layer_norm LN_B16_S4096_D4096 1.61 True
|
| 4073 |
torch_layer_norm LN_B16_S4096_D8192 3.33 True
|
|
@@ -4075,7 +4075,7 @@ torch_layer_norm LN_B16_S4096_D8192 3.33 True
|
|
| 4075 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 4076 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4077 |
<div class="uv-logs-content" style="display: none;">
|
| 4078 |
-
Installed 37 packages in
|
| 4079 |
</div>
|
| 4080 |
</div>
|
| 4081 |
<div class="cell-artifacts">
|
|
|
|
| 3874 |
<div class="system-info">
|
| 3875 |
<div class="system-info-header">Generated on:</div>
|
| 3876 |
<div class="system-info-content">
|
| 3877 |
+
Linux x86_64 | Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35
|
| 3878 |
</div>
|
| 3879 |
</div>
|
| 3880 |
|
|
|
|
| 3888 |
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3889 |
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3890 |
</span> |
|
| 3891 |
+
Cell: nv | 0.25s
|
| 3892 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3893 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3894 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3904 |
</div>
|
| 3905 |
</div>
|
| 3906 |
<div id="output-nv" class="cell-output">
|
| 3907 |
+
<div class="cell-stdout"><pre class="stdout-text">Fri Dec 19 18:56:51 2025
|
| 3908 |
+-----------------------------------------------------------------------------------------+
|
| 3909 |
+
| NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 |
|
| 3910 |
+-----------------------------------------+------------------------+----------------------+
|
| 3911 |
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
|
| 3912 |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
|
| 3913 |
| | | MIG M. |
|
| 3914 |
|=========================================+========================+======================|
|
| 3915 |
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
|
| 3916 |
+
| N/A 34C P0 107W / 350W | 0MiB / 46068MiB | 53% Default |
|
| 3917 |
| | | N/A |
|
| 3918 |
+-----------------------------------------+------------------------+----------------------+
|
| 3919 |
|
|
|
|
| 3937 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3938 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3939 |
</span> |
|
| 3940 |
+
Cell: benchmark | 7.79s
|
| 3941 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3942 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3943 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3985 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3986 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3987 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3988 |
+
torch_layer_norm 3.56% 149.102us 49.42% 2.068ms 2.068ms 0.000us 0.00% 3.039ms 3.039ms 1
|
| 3989 |
+
aten::layer_norm 0.35% 14.790us 45.86% 1.919ms 639.751us 0.000us 0.00% 3.039ms 1.013ms 3
|
| 3990 |
+
aten::native_layer_norm 1.65% 69.001us 45.51% 1.904ms 634.821us 2.327ms 100.00% 3.039ms 1.013ms 3
|
| 3991 |
+
torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.329ms 100.06% 2.329ms 2.329ms 1
|
| 3992 |
+
void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 2.327ms 100.00% 2.327ms 775.759us 3
|
| 3993 |
+
Activity Buffer Request 41.45% 1.735ms 41.45% 1.735ms 1.735ms 711.588us 30.58% 711.588us 711.588us 1
|
| 3994 |
+
aten::empty 1.11% 46.511us 1.11% 46.511us 5.168us 0.000us 0.00% 0.000us 0.000us 9
|
| 3995 |
+
cudaLaunchKernel 1.13% 47.301us 1.13% 47.301us 15.767us 0.000us 0.00% 0.000us 0.000us 3
|
| 3996 |
+
aten::view 0.16% 6.890us 0.16% 6.890us 1.148us 0.000us 0.00% 0.000us 0.000us 6
|
| 3997 |
+
cudaDeviceSynchronize 50.58% 2.117ms 50.58% 2.117ms 2.117ms 0.000us 0.00% 0.000us 0.000us 1
|
| 3998 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3999 |
+
Self CPU time total: 4.185ms
|
| 4000 |
+
Self CUDA time total: 2.327ms
|
| 4001 |
|
| 4002 |
|
| 4003 |
|
|
|
|
| 4007 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4008 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4009 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4010 |
+
torch_layer_norm 1.05% 70.042us 28.68% 1.911ms 1.911ms 0.000us 0.00% 6.475ms 6.475ms 1
|
| 4011 |
+
aten::layer_norm 0.13% 8.728us 27.63% 1.841ms 613.810us 0.000us 0.00% 6.475ms 2.158ms 3
|
| 4012 |
+
aten::native_layer_norm 0.73% 48.442us 27.50% 1.833ms 610.901us 4.886ms 100.00% 6.475ms 2.158ms 3
|
| 4013 |
+
torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.888ms 100.03% 4.888ms 4.888ms 1
|
| 4014 |
+
void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 4.886ms 100.00% 4.886ms 1.629ms 3
|
| 4015 |
+
Activity Buffer Request 25.85% 1.723ms 25.85% 1.723ms 1.723ms 1.589ms 32.51% 1.589ms 1.589ms 1
|
| 4016 |
+
aten::empty 0.43% 28.711us 0.43% 28.711us 3.190us 0.000us 0.00% 0.000us 0.000us 9
|
| 4017 |
+
cudaLaunchKernel 0.44% 29.201us 0.44% 29.201us 9.734us 0.000us 0.00% 0.000us 0.000us 3
|
| 4018 |
+
aten::view 0.06% 3.829us 0.06% 3.829us 0.638us 0.000us 0.00% 0.000us 0.000us 6
|
| 4019 |
+
cudaDeviceSynchronize 71.32% 4.753ms 71.32% 4.753ms 4.753ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4020 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4021 |
+
Self CPU time total: 6.665ms
|
| 4022 |
+
Self CUDA time total: 4.886ms
|
| 4023 |
|
| 4024 |
|
| 4025 |
|
|
|
|
| 4029 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4030 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4031 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4032 |
+
torch_layer_norm 1.06% 69.120us 29.93% 1.960ms 1.960ms 0.000us 0.00% 6.232ms 6.232ms 1
|
| 4033 |
+
aten::layer_norm 0.13% 8.631us 28.88% 1.891ms 630.434us 0.000us 0.00% 6.232ms 2.077ms 3
|
| 4034 |
+
aten::native_layer_norm 0.71% 46.790us 28.75% 1.883ms 627.557us 4.719ms 100.00% 6.232ms 2.077ms 3
|
| 4035 |
+
torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.721ms 100.03% 4.721ms 4.721ms 1
|
| 4036 |
+
void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 4.719ms 100.00% 4.719ms 1.573ms 3
|
| 4037 |
+
Activity Buffer Request 27.06% 1.772ms 27.06% 1.772ms 1.772ms 1.513ms 32.05% 1.513ms 1.513ms 1
|
| 4038 |
+
aten::empty 0.45% 29.333us 0.45% 29.333us 3.259us 0.000us 0.00% 0.000us 0.000us 9
|
| 4039 |
+
cudaLaunchKernel 0.46% 30.200us 0.46% 30.200us 10.067us 0.000us 0.00% 0.000us 0.000us 3
|
| 4040 |
+
aten::view 0.06% 3.850us 0.06% 3.850us 0.642us 0.000us 0.00% 0.000us 0.000us 6
|
| 4041 |
+
cudaDeviceSynchronize 70.07% 4.589ms 70.07% 4.589ms 4.589ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4042 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4043 |
+
Self CPU time total: 6.549ms
|
| 4044 |
+
Self CUDA time total: 4.719ms
|
| 4045 |
|
| 4046 |
|
| 4047 |
|
|
|
|
| 4051 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4052 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4053 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4054 |
+
torch_layer_norm 0.60% 67.701us 14.52% 1.650ms 1.650ms 0.000us 0.00% 13.091ms 13.091ms 1
|
| 4055 |
+
aten::layer_norm 0.08% 8.549us 13.92% 1.582ms 527.445us 0.000us 0.00% 13.091ms 4.364ms 3
|
| 4056 |
+
aten::native_layer_norm 0.41% 47.051us 13.85% 1.574ms 524.596us 9.846ms 100.00% 13.091ms 4.364ms 3
|
| 4057 |
+
torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 9.848ms 100.02% 9.848ms 9.848ms 1
|
| 4058 |
+
void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 9.846ms 100.00% 9.846ms 3.282ms 3
|
| 4059 |
+
Activity Buffer Request 11.12% 1.264ms 11.12% 1.264ms 1.264ms 3.245ms 32.96% 3.245ms 3.245ms 1
|
| 4060 |
+
aten::empty 0.26% 29.420us 0.26% 29.420us 3.269us 0.000us 0.00% 0.000us 0.000us 9
|
| 4061 |
+
cudaLaunchKernel 2.02% 229.604us 2.02% 229.604us 76.535us 0.000us 0.00% 0.000us 0.000us 3
|
| 4062 |
+
aten::view 0.04% 3.990us 0.04% 3.990us 0.665us 0.000us 0.00% 0.000us 0.000us 6
|
| 4063 |
+
cudaDeviceSynchronize 85.48% 9.715ms 85.48% 9.715ms 9.715ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4064 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4065 |
+
Self CPU time total: 11.365ms
|
| 4066 |
+
Self CUDA time total: 9.846ms
|
| 4067 |
|
| 4068 |
|
| 4069 |
impl wl p50(ms) ok
|
| 4070 |
+
torch_layer_norm LN_B16_S2048_D4096 0.81 True
|
| 4071 |
torch_layer_norm LN_B16_S2048_D8192 1.68 True
|
| 4072 |
torch_layer_norm LN_B16_S4096_D4096 1.61 True
|
| 4073 |
torch_layer_norm LN_B16_S4096_D8192 3.33 True
|
|
|
|
| 4075 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 4076 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4077 |
<div class="uv-logs-content" style="display: none;">
|
| 4078 |
+
Installed 37 packages in 298ms
|
| 4079 |
</div>
|
| 4080 |
</div>
|
| 4081 |
<div class="cell-artifacts">
|
layer_norm/results/artifacts/combine/latency.svg
CHANGED
|
|
Git LFS Details
|
|
|
Git LFS Details
|
layer_norm/results/combined_results.html
CHANGED
|
@@ -3874,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 3874 |
<div class="system-info">
|
| 3875 |
<div class="system-info-header">Generated on:</div>
|
| 3876 |
<div class="system-info-content">
|
| 3877 |
-
Linux x86_64 | Linux-6.12.
|
| 3878 |
</div>
|
| 3879 |
</div>
|
| 3880 |
|
|
@@ -3889,11 +3889,11 @@ body[data-tool="eraser"] .main-content {
|
|
| 3889 |
<rdf:RDF>
|
| 3890 |
<ns2:Work>
|
| 3891 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 3892 |
-
<dc:date>2025-
|
| 3893 |
<dc:format>image/svg+xml</dc:format>
|
| 3894 |
<dc:creator>
|
| 3895 |
<ns2:Agent>
|
| 3896 |
-
<dc:title>Matplotlib v3.10.
|
| 3897 |
</ns2:Agent>
|
| 3898 |
</dc:creator>
|
| 3899 |
</ns2:Work>
|
|
@@ -3973,70 +3973,70 @@ body[data-tool="eraser"] .main-content {
|
|
| 3973 |
<g id="matplotlib.axis_2">
|
| 3974 |
<g id="ytick_1">
|
| 3975 |
<g id="grid-y--2" class="grid grid-y">
|
| 3976 |
-
<path d="M 47.72
|
| 3977 |
</g>
|
| 3978 |
<g id="line2d_5">
|
| 3979 |
<defs>
|
| 3980 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 3981 |
</defs>
|
| 3982 |
<g>
|
| 3983 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="
|
| 3984 |
</g>
|
| 3985 |
</g>
|
| 3986 |
<g id="text_5">
|
| 3987 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="
|
| 3988 |
</g>
|
| 3989 |
</g>
|
| 3990 |
<g id="ytick_2">
|
| 3991 |
<g id="grid-y--3" class="grid grid-y">
|
| 3992 |
-
<path d="M 47.72
|
| 3993 |
</g>
|
| 3994 |
<g id="line2d_6">
|
| 3995 |
<g>
|
| 3996 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="
|
| 3997 |
</g>
|
| 3998 |
</g>
|
| 3999 |
<g id="text_6">
|
| 4000 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="
|
| 4001 |
</g>
|
| 4002 |
</g>
|
| 4003 |
<g id="ytick_3">
|
| 4004 |
<g id="grid-y--4" class="grid grid-y">
|
| 4005 |
-
<path d="M 47.72
|
| 4006 |
</g>
|
| 4007 |
<g id="line2d_7">
|
| 4008 |
<g>
|
| 4009 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="
|
| 4010 |
</g>
|
| 4011 |
</g>
|
| 4012 |
<g id="text_7">
|
| 4013 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="
|
| 4014 |
</g>
|
| 4015 |
</g>
|
| 4016 |
<g id="ytick_4">
|
| 4017 |
<g id="grid-y--5" class="grid grid-y">
|
| 4018 |
-
<path d="M 47.72
|
| 4019 |
</g>
|
| 4020 |
<g id="line2d_8">
|
| 4021 |
<g>
|
| 4022 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="
|
| 4023 |
</g>
|
| 4024 |
</g>
|
| 4025 |
<g id="text_8">
|
| 4026 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="
|
| 4027 |
</g>
|
| 4028 |
</g>
|
| 4029 |
<g id="ytick_5">
|
| 4030 |
<g id="grid-y--6" class="grid grid-y">
|
| 4031 |
-
<path d="M 47.72
|
| 4032 |
</g>
|
| 4033 |
<g id="line2d_9">
|
| 4034 |
<g>
|
| 4035 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="
|
| 4036 |
</g>
|
| 4037 |
</g>
|
| 4038 |
<g id="text_9">
|
| 4039 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="
|
| 4040 |
</g>
|
| 4041 |
</g>
|
| 4042 |
<g id="label--y" class="ylabel">
|
|
@@ -4044,27 +4044,27 @@ body[data-tool="eraser"] .main-content {
|
|
| 4044 |
</g>
|
| 4045 |
</g>
|
| 4046 |
<g id="series--torch-layer-norm" class="series">
|
| 4047 |
-
<path d="M 83.741924 437.689571 L 323.888085
|
| 4048 |
<defs>
|
| 4049 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4050 |
</defs>
|
| 4051 |
<g clip-path="url(#p2214f54723)">
|
| 4052 |
<use ns4:href="#md7efaf3aec" x="83.741924" y="437.689571" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4053 |
-
<use ns4:href="#md7efaf3aec" x="323.888085" y="
|
| 4054 |
-
<use ns4:href="#md7efaf3aec" x="564.034245" y="
|
| 4055 |
<use ns4:href="#md7efaf3aec" x="804.180406" y="46.442361" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4056 |
</g>
|
| 4057 |
</g>
|
| 4058 |
<g id="series--hf-kernels-layer-norm" class="series">
|
| 4059 |
-
<path d="M 83.741924
|
| 4060 |
<defs>
|
| 4061 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4062 |
</defs>
|
| 4063 |
<g clip-path="url(#p2214f54723)">
|
| 4064 |
-
<use ns4:href="#m9b8c54d372" x="83.741924" y="
|
| 4065 |
-
<use ns4:href="#m9b8c54d372" x="323.888085" y="
|
| 4066 |
-
<use ns4:href="#m9b8c54d372" x="564.034245" y="
|
| 4067 |
-
<use ns4:href="#m9b8c54d372" x="804.180406" y="
|
| 4068 |
</g>
|
| 4069 |
</g>
|
| 4070 |
<g id="patch_3">
|
|
@@ -4122,7 +4122,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 4122 |
<span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
|
| 4123 |
<span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4124 |
</span> |
|
| 4125 |
-
Cell: combine | 4.
|
| 4126 |
| <button class="run-btn" onclick="runCell('combine')">▶ run</button>
|
| 4127 |
<button class="copy-btn" onclick="copyCell('combine')">Copy</button>
|
| 4128 |
<a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -4210,10 +4210,10 @@ COMBINED BENCHMARK SUMMARY
|
|
| 4210 |
|
| 4211 |
impl wl p50(ms) ok
|
| 4212 |
hf_kernels_layer_norm LN_B16_S2048_D4096 0.83 True
|
| 4213 |
-
hf_kernels_layer_norm LN_B16_S2048_D8192 1.
|
| 4214 |
hf_kernels_layer_norm LN_B16_S4096_D4096 1.65 True
|
| 4215 |
hf_kernels_layer_norm LN_B16_S4096_D8192 3.25 True
|
| 4216 |
-
torch_layer_norm LN_B16_S2048_D4096 0.
|
| 4217 |
torch_layer_norm LN_B16_S2048_D8192 1.68 True
|
| 4218 |
torch_layer_norm LN_B16_S4096_D4096 1.61 True
|
| 4219 |
torch_layer_norm LN_B16_S4096_D8192 3.33 True
|
|
@@ -4236,7 +4236,7 @@ Implementations included:
|
|
| 4236 |
<div class="uv-install-logs" id="uv-logs-combine">
|
| 4237 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4238 |
<div class="uv-logs-content" style="display: none;">
|
| 4239 |
-
Installed 37 packages in
|
| 4240 |
</div>
|
| 4241 |
</div>
|
| 4242 |
<div class="cell-artifacts">
|
|
@@ -4249,11 +4249,11 @@ Installed 37 packages in 284ms
|
|
| 4249 |
<rdf:RDF>
|
| 4250 |
<ns2:Work>
|
| 4251 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 4252 |
-
<dc:date>2025-
|
| 4253 |
<dc:format>image/svg+xml</dc:format>
|
| 4254 |
<dc:creator>
|
| 4255 |
<ns2:Agent>
|
| 4256 |
-
<dc:title>Matplotlib v3.10.
|
| 4257 |
</ns2:Agent>
|
| 4258 |
</dc:creator>
|
| 4259 |
</ns2:Work>
|
|
@@ -4333,70 +4333,70 @@ Installed 37 packages in 284ms
|
|
| 4333 |
<g id="matplotlib.axis_2">
|
| 4334 |
<g id="ytick_1">
|
| 4335 |
<g id="grid-y--2" class="grid grid-y">
|
| 4336 |
-
<path d="M 47.72
|
| 4337 |
</g>
|
| 4338 |
<g id="line2d_5">
|
| 4339 |
<defs>
|
| 4340 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4341 |
</defs>
|
| 4342 |
<g>
|
| 4343 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="
|
| 4344 |
</g>
|
| 4345 |
</g>
|
| 4346 |
<g id="text_5">
|
| 4347 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="
|
| 4348 |
</g>
|
| 4349 |
</g>
|
| 4350 |
<g id="ytick_2">
|
| 4351 |
<g id="grid-y--3" class="grid grid-y">
|
| 4352 |
-
<path d="M 47.72
|
| 4353 |
</g>
|
| 4354 |
<g id="line2d_6">
|
| 4355 |
<g>
|
| 4356 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="
|
| 4357 |
</g>
|
| 4358 |
</g>
|
| 4359 |
<g id="text_6">
|
| 4360 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="
|
| 4361 |
</g>
|
| 4362 |
</g>
|
| 4363 |
<g id="ytick_3">
|
| 4364 |
<g id="grid-y--4" class="grid grid-y">
|
| 4365 |
-
<path d="M 47.72
|
| 4366 |
</g>
|
| 4367 |
<g id="line2d_7">
|
| 4368 |
<g>
|
| 4369 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="
|
| 4370 |
</g>
|
| 4371 |
</g>
|
| 4372 |
<g id="text_7">
|
| 4373 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="
|
| 4374 |
</g>
|
| 4375 |
</g>
|
| 4376 |
<g id="ytick_4">
|
| 4377 |
<g id="grid-y--5" class="grid grid-y">
|
| 4378 |
-
<path d="M 47.72
|
| 4379 |
</g>
|
| 4380 |
<g id="line2d_8">
|
| 4381 |
<g>
|
| 4382 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="
|
| 4383 |
</g>
|
| 4384 |
</g>
|
| 4385 |
<g id="text_8">
|
| 4386 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="
|
| 4387 |
</g>
|
| 4388 |
</g>
|
| 4389 |
<g id="ytick_5">
|
| 4390 |
<g id="grid-y--6" class="grid grid-y">
|
| 4391 |
-
<path d="M 47.72
|
| 4392 |
</g>
|
| 4393 |
<g id="line2d_9">
|
| 4394 |
<g>
|
| 4395 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="
|
| 4396 |
</g>
|
| 4397 |
</g>
|
| 4398 |
<g id="text_9">
|
| 4399 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="
|
| 4400 |
</g>
|
| 4401 |
</g>
|
| 4402 |
<g id="label--y" class="ylabel">
|
|
@@ -4404,27 +4404,27 @@ Installed 37 packages in 284ms
|
|
| 4404 |
</g>
|
| 4405 |
</g>
|
| 4406 |
<g id="series--torch-layer-norm" class="series">
|
| 4407 |
-
<path d="M 83.741924 437.689571 L 323.888085
|
| 4408 |
<defs>
|
| 4409 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4410 |
</defs>
|
| 4411 |
<g clip-path="url(#p2214f54723)">
|
| 4412 |
<use ns4:href="#md7efaf3aec" x="83.741924" y="437.689571" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4413 |
-
<use ns4:href="#md7efaf3aec" x="323.888085" y="
|
| 4414 |
-
<use ns4:href="#md7efaf3aec" x="564.034245" y="
|
| 4415 |
<use ns4:href="#md7efaf3aec" x="804.180406" y="46.442361" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4416 |
</g>
|
| 4417 |
</g>
|
| 4418 |
<g id="series--hf-kernels-layer-norm" class="series">
|
| 4419 |
-
<path d="M 83.741924
|
| 4420 |
<defs>
|
| 4421 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4422 |
</defs>
|
| 4423 |
<g clip-path="url(#p2214f54723)">
|
| 4424 |
-
<use ns4:href="#m9b8c54d372" x="83.741924" y="
|
| 4425 |
-
<use ns4:href="#m9b8c54d372" x="323.888085" y="
|
| 4426 |
-
<use ns4:href="#m9b8c54d372" x="564.034245" y="
|
| 4427 |
-
<use ns4:href="#m9b8c54d372" x="804.180406" y="
|
| 4428 |
</g>
|
| 4429 |
</g>
|
| 4430 |
<g id="patch_3">
|
|
|
|
| 3874 |
<div class="system-info">
|
| 3875 |
<div class="system-info-header">Generated on:</div>
|
| 3876 |
<div class="system-info-content">
|
| 3877 |
+
Linux x86_64 | Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35
|
| 3878 |
</div>
|
| 3879 |
</div>
|
| 3880 |
|
|
|
|
| 3889 |
<rdf:RDF>
|
| 3890 |
<ns2:Work>
|
| 3891 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 3892 |
+
<dc:date>2025-12-19T19:09:50.663153</dc:date>
|
| 3893 |
<dc:format>image/svg+xml</dc:format>
|
| 3894 |
<dc:creator>
|
| 3895 |
<ns2:Agent>
|
| 3896 |
+
<dc:title>Matplotlib v3.10.8, https://matplotlib.org/</dc:title>
|
| 3897 |
</ns2:Agent>
|
| 3898 |
</dc:creator>
|
| 3899 |
</ns2:Work>
|
|
|
|
| 3973 |
<g id="matplotlib.axis_2">
|
| 3974 |
<g id="ytick_1">
|
| 3975 |
<g id="grid-y--2" class="grid grid-y">
|
| 3976 |
+
<path d="M 47.72 408.774166 L 840.20233 408.774166 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 3977 |
</g>
|
| 3978 |
<g id="line2d_5">
|
| 3979 |
<defs>
|
| 3980 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 3981 |
</defs>
|
| 3982 |
<g>
|
| 3983 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="408.774166" style="stroke: #000000; stroke-width: 0.8" />
|
| 3984 |
</g>
|
| 3985 |
</g>
|
| 3986 |
<g id="text_5">
|
| 3987 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="412.573385" transform="rotate(-0 40.72 412.573385)">1.0</text>
|
| 3988 |
</g>
|
| 3989 |
</g>
|
| 3990 |
<g id="ytick_2">
|
| 3991 |
<g id="grid-y--3" class="grid grid-y">
|
| 3992 |
+
<path d="M 47.72 330.886714 L 840.20233 330.886714 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 3993 |
</g>
|
| 3994 |
<g id="line2d_6">
|
| 3995 |
<g>
|
| 3996 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="330.886714" style="stroke: #000000; stroke-width: 0.8" />
|
| 3997 |
</g>
|
| 3998 |
</g>
|
| 3999 |
<g id="text_6">
|
| 4000 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="334.685933" transform="rotate(-0 40.72 334.685933)">1.5</text>
|
| 4001 |
</g>
|
| 4002 |
</g>
|
| 4003 |
<g id="ytick_3">
|
| 4004 |
<g id="grid-y--4" class="grid grid-y">
|
| 4005 |
+
<path d="M 47.72 252.999261 L 840.20233 252.999261 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4006 |
</g>
|
| 4007 |
<g id="line2d_7">
|
| 4008 |
<g>
|
| 4009 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="252.999261" style="stroke: #000000; stroke-width: 0.8" />
|
| 4010 |
</g>
|
| 4011 |
</g>
|
| 4012 |
<g id="text_7">
|
| 4013 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="256.79848" transform="rotate(-0 40.72 256.79848)">2.0</text>
|
| 4014 |
</g>
|
| 4015 |
</g>
|
| 4016 |
<g id="ytick_4">
|
| 4017 |
<g id="grid-y--5" class="grid grid-y">
|
| 4018 |
+
<path d="M 47.72 175.111809 L 840.20233 175.111809 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4019 |
</g>
|
| 4020 |
<g id="line2d_8">
|
| 4021 |
<g>
|
| 4022 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="175.111809" style="stroke: #000000; stroke-width: 0.8" />
|
| 4023 |
</g>
|
| 4024 |
</g>
|
| 4025 |
<g id="text_8">
|
| 4026 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="178.911028" transform="rotate(-0 40.72 178.911028)">2.5</text>
|
| 4027 |
</g>
|
| 4028 |
</g>
|
| 4029 |
<g id="ytick_5">
|
| 4030 |
<g id="grid-y--6" class="grid grid-y">
|
| 4031 |
+
<path d="M 47.72 97.224356 L 840.20233 97.224356 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4032 |
</g>
|
| 4033 |
<g id="line2d_9">
|
| 4034 |
<g>
|
| 4035 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="97.224356" style="stroke: #000000; stroke-width: 0.8" />
|
| 4036 |
</g>
|
| 4037 |
</g>
|
| 4038 |
<g id="text_9">
|
| 4039 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="101.023575" transform="rotate(-0 40.72 101.023575)">3.0</text>
|
| 4040 |
</g>
|
| 4041 |
</g>
|
| 4042 |
<g id="label--y" class="ylabel">
|
|
|
|
| 4044 |
</g>
|
| 4045 |
</g>
|
| 4046 |
<g id="series--torch-layer-norm" class="series">
|
| 4047 |
+
<path d="M 83.741924 437.689571 L 323.888085 302.950354 L 564.034245 314.128917 L 804.180406 46.442361 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 4048 |
<defs>
|
| 4049 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4050 |
</defs>
|
| 4051 |
<g clip-path="url(#p2214f54723)">
|
| 4052 |
<use ns4:href="#md7efaf3aec" x="83.741924" y="437.689571" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4053 |
+
<use ns4:href="#md7efaf3aec" x="323.888085" y="302.950354" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4054 |
+
<use ns4:href="#md7efaf3aec" x="564.034245" y="314.128917" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4055 |
<use ns4:href="#md7efaf3aec" x="804.180406" y="46.442361" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4056 |
</g>
|
| 4057 |
</g>
|
| 4058 |
<g id="series--hf-kernels-layer-norm" class="series">
|
| 4059 |
+
<path d="M 83.741924 434.70508 L 323.888085 305.990613 L 564.034245 307.267967 L 804.180406 57.889324 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 4060 |
<defs>
|
| 4061 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4062 |
</defs>
|
| 4063 |
<g clip-path="url(#p2214f54723)">
|
| 4064 |
+
<use ns4:href="#m9b8c54d372" x="83.741924" y="434.70508" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4065 |
+
<use ns4:href="#m9b8c54d372" x="323.888085" y="305.990613" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4066 |
+
<use ns4:href="#m9b8c54d372" x="564.034245" y="307.267967" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4067 |
+
<use ns4:href="#m9b8c54d372" x="804.180406" y="57.889324" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4068 |
</g>
|
| 4069 |
</g>
|
| 4070 |
<g id="patch_3">
|
|
|
|
| 4122 |
<span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
|
| 4123 |
<span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4124 |
</span> |
|
| 4125 |
+
Cell: combine | 4.51s
|
| 4126 |
| <button class="run-btn" onclick="runCell('combine')">▶ run</button>
|
| 4127 |
<button class="copy-btn" onclick="copyCell('combine')">Copy</button>
|
| 4128 |
<a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 4210 |
|
| 4211 |
impl wl p50(ms) ok
|
| 4212 |
hf_kernels_layer_norm LN_B16_S2048_D4096 0.83 True
|
| 4213 |
+
hf_kernels_layer_norm LN_B16_S2048_D8192 1.66 True
|
| 4214 |
hf_kernels_layer_norm LN_B16_S4096_D4096 1.65 True
|
| 4215 |
hf_kernels_layer_norm LN_B16_S4096_D8192 3.25 True
|
| 4216 |
+
torch_layer_norm LN_B16_S2048_D4096 0.81 True
|
| 4217 |
torch_layer_norm LN_B16_S2048_D8192 1.68 True
|
| 4218 |
torch_layer_norm LN_B16_S4096_D4096 1.61 True
|
| 4219 |
torch_layer_norm LN_B16_S4096_D8192 3.33 True
|
|
|
|
| 4236 |
<div class="uv-install-logs" id="uv-logs-combine">
|
| 4237 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4238 |
<div class="uv-logs-content" style="display: none;">
|
| 4239 |
+
Installed 37 packages in 297ms
|
| 4240 |
</div>
|
| 4241 |
</div>
|
| 4242 |
<div class="cell-artifacts">
|
|
|
|
| 4249 |
<rdf:RDF>
|
| 4250 |
<ns2:Work>
|
| 4251 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 4252 |
+
<dc:date>2025-12-19T19:09:50.663153</dc:date>
|
| 4253 |
<dc:format>image/svg+xml</dc:format>
|
| 4254 |
<dc:creator>
|
| 4255 |
<ns2:Agent>
|
| 4256 |
+
<dc:title>Matplotlib v3.10.8, https://matplotlib.org/</dc:title>
|
| 4257 |
</ns2:Agent>
|
| 4258 |
</dc:creator>
|
| 4259 |
</ns2:Work>
|
|
|
|
| 4333 |
<g id="matplotlib.axis_2">
|
| 4334 |
<g id="ytick_1">
|
| 4335 |
<g id="grid-y--2" class="grid grid-y">
|
| 4336 |
+
<path d="M 47.72 408.774166 L 840.20233 408.774166 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4337 |
</g>
|
| 4338 |
<g id="line2d_5">
|
| 4339 |
<defs>
|
| 4340 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4341 |
</defs>
|
| 4342 |
<g>
|
| 4343 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="408.774166" style="stroke: #000000; stroke-width: 0.8" />
|
| 4344 |
</g>
|
| 4345 |
</g>
|
| 4346 |
<g id="text_5">
|
| 4347 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="412.573385" transform="rotate(-0 40.72 412.573385)">1.0</text>
|
| 4348 |
</g>
|
| 4349 |
</g>
|
| 4350 |
<g id="ytick_2">
|
| 4351 |
<g id="grid-y--3" class="grid grid-y">
|
| 4352 |
+
<path d="M 47.72 330.886714 L 840.20233 330.886714 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4353 |
</g>
|
| 4354 |
<g id="line2d_6">
|
| 4355 |
<g>
|
| 4356 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="330.886714" style="stroke: #000000; stroke-width: 0.8" />
|
| 4357 |
</g>
|
| 4358 |
</g>
|
| 4359 |
<g id="text_6">
|
| 4360 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="334.685933" transform="rotate(-0 40.72 334.685933)">1.5</text>
|
| 4361 |
</g>
|
| 4362 |
</g>
|
| 4363 |
<g id="ytick_3">
|
| 4364 |
<g id="grid-y--4" class="grid grid-y">
|
| 4365 |
+
<path d="M 47.72 252.999261 L 840.20233 252.999261 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4366 |
</g>
|
| 4367 |
<g id="line2d_7">
|
| 4368 |
<g>
|
| 4369 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="252.999261" style="stroke: #000000; stroke-width: 0.8" />
|
| 4370 |
</g>
|
| 4371 |
</g>
|
| 4372 |
<g id="text_7">
|
| 4373 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="256.79848" transform="rotate(-0 40.72 256.79848)">2.0</text>
|
| 4374 |
</g>
|
| 4375 |
</g>
|
| 4376 |
<g id="ytick_4">
|
| 4377 |
<g id="grid-y--5" class="grid grid-y">
|
| 4378 |
+
<path d="M 47.72 175.111809 L 840.20233 175.111809 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4379 |
</g>
|
| 4380 |
<g id="line2d_8">
|
| 4381 |
<g>
|
| 4382 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="175.111809" style="stroke: #000000; stroke-width: 0.8" />
|
| 4383 |
</g>
|
| 4384 |
</g>
|
| 4385 |
<g id="text_8">
|
| 4386 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="178.911028" transform="rotate(-0 40.72 178.911028)">2.5</text>
|
| 4387 |
</g>
|
| 4388 |
</g>
|
| 4389 |
<g id="ytick_5">
|
| 4390 |
<g id="grid-y--6" class="grid grid-y">
|
| 4391 |
+
<path d="M 47.72 97.224356 L 840.20233 97.224356 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4392 |
</g>
|
| 4393 |
<g id="line2d_9">
|
| 4394 |
<g>
|
| 4395 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="97.224356" style="stroke: #000000; stroke-width: 0.8" />
|
| 4396 |
</g>
|
| 4397 |
</g>
|
| 4398 |
<g id="text_9">
|
| 4399 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="101.023575" transform="rotate(-0 40.72 101.023575)">3.0</text>
|
| 4400 |
</g>
|
| 4401 |
</g>
|
| 4402 |
<g id="label--y" class="ylabel">
|
|
|
|
| 4404 |
</g>
|
| 4405 |
</g>
|
| 4406 |
<g id="series--torch-layer-norm" class="series">
|
| 4407 |
+
<path d="M 83.741924 437.689571 L 323.888085 302.950354 L 564.034245 314.128917 L 804.180406 46.442361 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 4408 |
<defs>
|
| 4409 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4410 |
</defs>
|
| 4411 |
<g clip-path="url(#p2214f54723)">
|
| 4412 |
<use ns4:href="#md7efaf3aec" x="83.741924" y="437.689571" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4413 |
+
<use ns4:href="#md7efaf3aec" x="323.888085" y="302.950354" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4414 |
+
<use ns4:href="#md7efaf3aec" x="564.034245" y="314.128917" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4415 |
<use ns4:href="#md7efaf3aec" x="804.180406" y="46.442361" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4416 |
</g>
|
| 4417 |
</g>
|
| 4418 |
<g id="series--hf-kernels-layer-norm" class="series">
|
| 4419 |
+
<path d="M 83.741924 434.70508 L 323.888085 305.990613 L 564.034245 307.267967 L 804.180406 57.889324 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 4420 |
<defs>
|
| 4421 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4422 |
</defs>
|
| 4423 |
<g clip-path="url(#p2214f54723)">
|
| 4424 |
+
<use ns4:href="#m9b8c54d372" x="83.741924" y="434.70508" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4425 |
+
<use ns4:href="#m9b8c54d372" x="323.888085" y="305.990613" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4426 |
+
<use ns4:href="#m9b8c54d372" x="564.034245" y="307.267967" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4427 |
+
<use ns4:href="#m9b8c54d372" x="804.180406" y="57.889324" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4428 |
</g>
|
| 4429 |
</g>
|
| 4430 |
<g id="patch_3">
|
openai_moe/impls/artifacts/benchmark/openai_moe.jsonl
CHANGED
|
@@ -1,8 +1,8 @@
|
|
| 1 |
-
{"ts": "2025-
|
| 2 |
-
{"ts": "2025-
|
| 3 |
-
{"ts": "2025-
|
| 4 |
-
{"ts": "2025-
|
| 5 |
-
{"ts": "2025-
|
| 6 |
-
{"ts": "2025-
|
| 7 |
-
{"ts": "2025-
|
| 8 |
-
{"ts": "2025-
|
|
|
|
| 1 |
+
{"ts": "2025-12-19T18:57:39Z", "run": "aec86efe9ed1483a979a8427ac940bd6", "impl": "binned_torch", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_E2", "batch": 1, "seq_len": 512, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 2, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 155.7981640000321, "p50": 157.7297640000097, "p90": 159.48504900001126, "mean": 158.39911260001145, "iqr": 2.223896000032255, "raw_times": [161.72143300002517, 157.261152999979, 155.7981640000321, 159.48504900001126, 157.7297640000097], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 159.10347999999885, "peak_bytes": 416866816, "ok": true, "absmax": 2.765655517578125e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 2.765655517578125e-05, "mae": 2.0696452338597737e-06, "mse": 7.332408985538663e-12, "ref": "naive_moe"}, "err": null}
|
| 2 |
+
{"ts": "2025-12-19T18:58:03Z", "run": "aec86efe9ed1483a979a8427ac940bd6", "impl": "binned_torch", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_E4", "batch": 1, "seq_len": 512, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 4, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 199.79041199997027, "p50": 204.82147100000248, "p90": 205.0451750000093, "mean": 203.32668460000605, "iqr": 3.4747309999829668, "raw_times": [205.40592100002186, 199.79041199997027, 201.57044400002633, 205.0451750000093, 204.82147100000248], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 200.72428899999295, "peak_bytes": 632035840, "ok": true, "absmax": 1.621246337890625e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 1.621246337890625e-05, "mae": 9.61917862696282e-07, "mse": 1.59423277530657e-12, "ref": "naive_moe"}, "err": null}
|
| 3 |
+
{"ts": "2025-12-19T18:58:47Z", "run": "aec86efe9ed1483a979a8427ac940bd6", "impl": "binned_torch", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S1024_E2", "batch": 1, "seq_len": 1024, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 2, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 372.8170420000083, "p50": 383.31174900002907, "p90": 392.9121939999618, "mean": 385.07766660000016, "iqr": 10.251173999961338, "raw_times": [393.68632800000114, 392.9121939999618, 382.66102000000046, 383.31174900002907, 372.8170420000083], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 393.062126000018, "peak_bytes": 643844608, "ok": true, "absmax": 2.6226043701171875e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 2.6226043701171875e-05, "mae": 2.0501920516835526e-06, "mse": 7.1848811622476916e-12, "ref": "naive_moe"}, "err": null}
|
| 4 |
+
{"ts": "2025-12-19T18:59:36Z", "run": "aec86efe9ed1483a979a8427ac940bd6", "impl": "binned_torch", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S1024_E4", "batch": 1, "seq_len": 1024, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 4, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 418.8624209999716, "p50": 421.41534400002456, "p90": 422.4395519999007, "mean": 421.30189059998884, "iqr": 1.8283119999296105, "raw_times": [423.18089600007625, 421.41534400002456, 418.8624209999716, 420.6112399999711, 422.4395519999007], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 421.8970150000132, "peak_bytes": 823386112, "ok": true, "absmax": 1.3589859008789062e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 1.3589859008789062e-05, "mae": 9.400179123986163e-07, "mse": 1.5130355735665235e-12, "ref": "naive_moe"}, "err": null}
|
| 5 |
+
{"ts": "2025-12-19T19:01:05Z", "run": "aec86efe9ed1483a979a8427ac940bd6", "impl": "binned_torch", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_S512_E2", "batch": 4, "seq_len": 512, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 2, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 766.098573000022, "p50": 773.6994500000947, "p90": 774.865274999911, "mean": 772.9573942000115, "iqr": 8.746404999897095, "raw_times": [766.1188700000139, 773.6994500000947, 766.098573000022, 774.865274999911, 784.0048030000162], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 782.9079639999463, "peak_bytes": 1036112384, "ok": true, "absmax": 3.2901763916015625e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 3.2901763916015625e-05, "mae": 2.0572656467265915e-06, "mse": 7.247809123700488e-12, "ref": "naive_moe"}, "err": null}
|
| 6 |
+
{"ts": "2025-12-19T19:02:49Z", "run": "aec86efe9ed1483a979a8427ac940bd6", "impl": "binned_torch", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_S512_E4", "batch": 4, "seq_len": 512, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 4, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 828.9166780000414, "p50": 840.0145479999992, "p90": 848.174653000001, "mean": 841.7884347999916, "iqr": 11.353517000088686, "raw_times": [855.0151590000041, 828.9166780000414, 848.174653000001, 836.8211359999123, 840.0145479999992], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 857.4785790000305, "peak_bytes": 1235263488, "ok": true, "absmax": 1.430511474609375e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 1.430511474609375e-05, "mae": 9.400343401466671e-07, "mse": 1.5107844445957919e-12, "ref": "naive_moe"}, "err": null}
|
| 7 |
+
{"ts": "2025-12-19T19:05:50Z", "run": "aec86efe9ed1483a979a8427ac940bd6", "impl": "binned_torch", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_S1024_E2", "batch": 4, "seq_len": 1024, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 2, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1492.7651169999763, "p50": 1513.7102520000099, "p90": 1522.1755649999977, "mean": 1513.4781133999923, "iqr": 10.99431700004061, "raw_times": [1492.7651169999763, 1511.1812479999571, 1522.1755649999977, 1527.5583850000203, 1513.7102520000099], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1532.0516410000664, "peak_bytes": 1861947904, "ok": true, "absmax": 2.6226043701171875e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 2.6226043701171875e-05, "mae": 2.060702854578267e-06, "mse": 7.262949790198814e-12, "ref": "naive_moe"}, "err": null}
|
| 8 |
+
{"ts": "2025-12-19T19:09:07Z", "run": "aec86efe9ed1483a979a8427ac940bd6", "impl": "binned_torch", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_S1024_E4", "batch": 4, "seq_len": 1024, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 4, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1654.5569229999728, "p50": 1658.7427389999903, "p90": 1665.0588319999997, "mean": 1660.4780848000016, "iqr": 7.11779099992782, "raw_times": [1658.7427389999903, 1665.0588319999997, 1666.0908889999746, 1657.941041000072, 1654.5569229999728], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1670.381679000002, "peak_bytes": 2062163968, "ok": true, "absmax": 1.5974044799804688e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 1.5974044799804688e-05, "mae": 9.529014732834185e-07, "mse": 1.5621694476192216e-12, "ref": "naive_moe"}, "err": null}
|
openai_moe/impls/binned_torch.html
CHANGED
|
@@ -3874,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 3874 |
<div class="system-info">
|
| 3875 |
<div class="system-info-header">Generated on:</div>
|
| 3876 |
<div class="system-info-content">
|
| 3877 |
-
Linux x86_64 | Linux-6.12.
|
| 3878 |
</div>
|
| 3879 |
</div>
|
| 3880 |
|
|
@@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 3888 |
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3889 |
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3890 |
</span> |
|
| 3891 |
-
Cell: nv | 0.
|
| 3892 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3893 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3894 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3904,16 +3904,16 @@ Cell: nv | 0.22s
|
|
| 3904 |
</div>
|
| 3905 |
</div>
|
| 3906 |
<div id="output-nv" class="cell-output">
|
| 3907 |
-
<div class="cell-stdout"><pre class="stdout-text">
|
| 3908 |
+-----------------------------------------------------------------------------------------+
|
| 3909 |
-
| NVIDIA-SMI 580.
|
| 3910 |
+-----------------------------------------+------------------------+----------------------+
|
| 3911 |
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
|
| 3912 |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
|
| 3913 |
| | | MIG M. |
|
| 3914 |
|=========================================+========================+======================|
|
| 3915 |
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
|
| 3916 |
-
| N/A
|
| 3917 |
| | | N/A |
|
| 3918 |
+-----------------------------------------+------------------------+----------------------+
|
| 3919 |
|
|
@@ -3937,7 +3937,7 @@ Cell: nv | 0.22s
|
|
| 3937 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3938 |
<span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3939 |
</span> |
|
| 3940 |
-
Cell: benchmark |
|
| 3941 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3942 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3943 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -4095,29 +4095,29 @@ PROFILE TRACE: binned_torch | cuda_B1_S512_E2
|
|
| 4095 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4096 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4097 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4098 |
-
binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4099 |
-
binned_torch
|
| 4100 |
-
aten::item 1.
|
| 4101 |
-
aten::_local_scalar_dense 5.
|
| 4102 |
-
Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 15.
|
| 4103 |
-
aten::floor_divide 5.
|
| 4104 |
-
aten::bmm 0.02%
|
| 4105 |
-
ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 7.
|
| 4106 |
-
aten::copy_ 3.
|
| 4107 |
-
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.579ms 12.
|
| 4108 |
-
aten::mul 3.
|
| 4109 |
-
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 4.
|
| 4110 |
-
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 4.
|
| 4111 |
-
aten::remainder 3.
|
| 4112 |
-
aten::add 2.
|
| 4113 |
-
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4114 |
-
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4115 |
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.023ms 3.99% 2.023ms 1.317us 1536
|
| 4116 |
-
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 1.
|
| 4117 |
-
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4118 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4119 |
-
Self CPU time total:
|
| 4120 |
-
Self CUDA time total: 50.
|
| 4121 |
|
| 4122 |
|
| 4123 |
|
|
@@ -4127,29 +4127,29 @@ PROFILE TRACE: binned_torch | cuda_B1_S512_E4
|
|
| 4127 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4128 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4129 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4130 |
-
binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4131 |
-
binned_torch 25
|
| 4132 |
-
aten::item 1.76% 16.
|
| 4133 |
-
aten::_local_scalar_dense
|
| 4134 |
-
Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 17.
|
| 4135 |
-
|
| 4136 |
-
|
| 4137 |
-
|
| 4138 |
-
aten::copy_ 3.
|
| 4139 |
-
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 4140 |
-
aten::add
|
| 4141 |
-
aten::mul
|
| 4142 |
-
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 4.
|
| 4143 |
-
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 4.
|
| 4144 |
-
aten::remainder 2.
|
| 4145 |
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 3.655ms 6.70% 3.655ms 1.190us 3072
|
| 4146 |
-
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4147 |
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.030ms 3.72% 2.030ms 1.322us 1536
|
| 4148 |
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 1.822ms 3.34% 1.822ms 1.186us 1536
|
| 4149 |
-
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 1.
|
| 4150 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4151 |
-
Self CPU time total:
|
| 4152 |
-
Self CUDA time total: 54.
|
| 4153 |
|
| 4154 |
|
| 4155 |
|
|
@@ -4159,29 +4159,29 @@ PROFILE TRACE: binned_torch | cuda_B1_S1024_E2
|
|
| 4159 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4160 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4161 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4162 |
-
binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 1.
|
| 4163 |
-
binned_torch
|
| 4164 |
-
aten::item 1.
|
| 4165 |
-
aten::_local_scalar_dense 5.
|
| 4166 |
-
Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 31.
|
| 4167 |
-
aten::floor_divide 5.
|
| 4168 |
-
aten::bmm 0.01%
|
| 4169 |
-
ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 15.
|
| 4170 |
-
aten::copy_ 3.
|
| 4171 |
-
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 13.
|
| 4172 |
-
aten::mul 3.
|
| 4173 |
-
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 9.
|
| 4174 |
-
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 8.
|
| 4175 |
-
aten::remainder 3.
|
| 4176 |
-
aten::add 2.
|
| 4177 |
-
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 7.
|
| 4178 |
-
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 4179 |
-
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 4.
|
| 4180 |
-
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4181 |
-
aten::clamp 0.00%
|
| 4182 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4183 |
-
Self CPU time total: 1.
|
| 4184 |
-
Self CUDA time total:
|
| 4185 |
|
| 4186 |
|
| 4187 |
|
|
@@ -4191,29 +4191,29 @@ PROFILE TRACE: binned_torch | cuda_B1_S1024_E4
|
|
| 4191 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4192 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4193 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4194 |
-
binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 1.
|
| 4195 |
-
binned_torch
|
| 4196 |
-
aten::item 1.
|
| 4197 |
-
aten::_local_scalar_dense 6.
|
| 4198 |
-
Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4199 |
-
aten::floor_divide 5.
|
| 4200 |
-
aten::bmm 0.01%
|
| 4201 |
-
ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 15.
|
| 4202 |
-
aten::copy_ 3.
|
| 4203 |
-
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 13.
|
| 4204 |
-
aten::mul 2.
|
| 4205 |
-
aten::add 3.
|
| 4206 |
-
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 9.
|
| 4207 |
-
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 8.
|
| 4208 |
-
aten::remainder 2.
|
| 4209 |
-
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 7.
|
| 4210 |
-
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 4211 |
-
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 4.
|
| 4212 |
-
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4213 |
-
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4214 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4215 |
-
Self CPU time total: 1.
|
| 4216 |
-
Self CUDA time total: 110.
|
| 4217 |
|
| 4218 |
|
| 4219 |
|
|
@@ -4223,29 +4223,29 @@ PROFILE TRACE: binned_torch | cuda_B4_S512_E2
|
|
| 4223 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4224 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4225 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4226 |
-
binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4227 |
-
binned_torch
|
| 4228 |
-
Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 63.
|
| 4229 |
-
aten::item 1.
|
| 4230 |
-
aten::_local_scalar_dense
|
| 4231 |
-
aten::floor_divide 5.
|
| 4232 |
-
aten::bmm 0.01%
|
| 4233 |
-
ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4234 |
-
aten::copy_ 3.
|
| 4235 |
-
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 26.
|
| 4236 |
-
aten::mul 3.
|
| 4237 |
-
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.
|
| 4238 |
-
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 17.
|
| 4239 |
-
aten::add 2.
|
| 4240 |
-
aten::remainder 3.
|
| 4241 |
-
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 14.
|
| 4242 |
-
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.
|
| 4243 |
-
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 8.
|
| 4244 |
-
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 7.
|
| 4245 |
-
|
| 4246 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4247 |
-
Self CPU time total: 3.
|
| 4248 |
-
Self CUDA time total: 212.
|
| 4249 |
|
| 4250 |
|
| 4251 |
|
|
@@ -4255,29 +4255,29 @@ PROFILE TRACE: binned_torch | cuda_B4_S512_E4
|
|
| 4255 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4256 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4257 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4258 |
-
binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4259 |
-
binned_torch 23.
|
| 4260 |
-
aten::item 1.
|
| 4261 |
-
aten::_local_scalar_dense
|
| 4262 |
-
Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4263 |
-
aten::floor_divide 5.
|
| 4264 |
-
|
| 4265 |
-
|
| 4266 |
-
aten::copy_ 3.
|
| 4267 |
-
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 26.
|
| 4268 |
-
aten::mul
|
| 4269 |
-
aten::add
|
| 4270 |
-
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.
|
| 4271 |
-
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.
|
| 4272 |
-
aten::remainder 2.
|
| 4273 |
-
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 14.
|
| 4274 |
-
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.
|
| 4275 |
-
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 8.
|
| 4276 |
-
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 7.
|
| 4277 |
-
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 4278 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4279 |
-
Self CPU time total: 3.
|
| 4280 |
-
Self CUDA time total: 225.
|
| 4281 |
|
| 4282 |
|
| 4283 |
|
|
@@ -4287,29 +4287,29 @@ PROFILE TRACE: binned_torch | cuda_B4_S1024_E2
|
|
| 4287 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4288 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4289 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4290 |
-
binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4291 |
-
binned_torch 24.10% 1.
|
| 4292 |
-
aten::item 1.
|
| 4293 |
-
aten::_local_scalar_dense
|
| 4294 |
-
Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4295 |
-
aten::floor_divide 5.
|
| 4296 |
-
aten::bmm 0.00% 234.
|
| 4297 |
-
ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 56.
|
| 4298 |
-
aten::copy_ 4.
|
| 4299 |
-
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 53.
|
| 4300 |
-
aten::mul 3.
|
| 4301 |
-
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 44.
|
| 4302 |
-
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 34.
|
| 4303 |
-
aten::add 2.
|
| 4304 |
-
aten::remainder 3.
|
| 4305 |
-
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 29.
|
| 4306 |
-
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 26.
|
| 4307 |
-
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.
|
| 4308 |
-
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 14.
|
| 4309 |
-
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 5.
|
| 4310 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4311 |
-
Self CPU time total:
|
| 4312 |
-
Self CUDA time total:
|
| 4313 |
|
| 4314 |
|
| 4315 |
|
|
@@ -4319,40 +4319,40 @@ PROFILE TRACE: binned_torch | cuda_B4_S1024_E4
|
|
| 4319 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4320 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4321 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4322 |
-
binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 7.
|
| 4323 |
-
binned_torch 23.
|
| 4324 |
-
aten::item 1.
|
| 4325 |
-
aten::_local_scalar_dense 6.
|
| 4326 |
-
Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4327 |
-
aten::floor_divide 5.
|
| 4328 |
-
|
| 4329 |
-
|
| 4330 |
-
aten::copy_ 3.
|
| 4331 |
-
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 53.
|
| 4332 |
-
aten::mul
|
| 4333 |
-
aten::add
|
| 4334 |
-
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 44.
|
| 4335 |
-
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 34.
|
| 4336 |
-
aten::remainder 2.
|
| 4337 |
-
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 29.
|
| 4338 |
-
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 26.
|
| 4339 |
-
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.
|
| 4340 |
-
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 14.
|
| 4341 |
-
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 12.
|
| 4342 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4343 |
-
Self CPU time total: 7.
|
| 4344 |
-
Self CUDA time total:
|
| 4345 |
|
| 4346 |
|
| 4347 |
impl wl p50(ms) ok
|
| 4348 |
-
binned_torch cuda_B1_S1024_E2
|
| 4349 |
-
binned_torch cuda_B1_S1024_E4
|
| 4350 |
-
binned_torch cuda_B1_S512_E2
|
| 4351 |
-
binned_torch cuda_B1_S512_E4
|
| 4352 |
-
binned_torch cuda_B4_S1024_E2
|
| 4353 |
-
binned_torch cuda_B4_S1024_E4
|
| 4354 |
-
binned_torch cuda_B4_S512_E2
|
| 4355 |
-
binned_torch cuda_B4_S512_E4
|
| 4356 |
</pre></div>
|
| 4357 |
<div class="cell-artifacts">
|
| 4358 |
<h4>Artifacts:</h4>
|
|
|
|
| 3874 |
<div class="system-info">
|
| 3875 |
<div class="system-info-header">Generated on:</div>
|
| 3876 |
<div class="system-info-content">
|
| 3877 |
+
Linux x86_64 | Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35
|
| 3878 |
</div>
|
| 3879 |
</div>
|
| 3880 |
|
|
|
|
| 3888 |
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3889 |
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3890 |
</span> |
|
| 3891 |
+
Cell: nv | 0.25s
|
| 3892 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3893 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3894 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3904 |
</div>
|
| 3905 |
</div>
|
| 3906 |
<div id="output-nv" class="cell-output">
|
| 3907 |
+
<div class="cell-stdout"><pre class="stdout-text">Fri Dec 19 18:56:28 2025
|
| 3908 |
+-----------------------------------------------------------------------------------------+
|
| 3909 |
+
| NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 |
|
| 3910 |
+-----------------------------------------+------------------------+----------------------+
|
| 3911 |
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
|
| 3912 |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
|
| 3913 |
| | | MIG M. |
|
| 3914 |
|=========================================+========================+======================|
|
| 3915 |
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
|
| 3916 |
+
| N/A 34C P0 80W / 350W | 0MiB / 46068MiB | 41% Default |
|
| 3917 |
| | | N/A |
|
| 3918 |
+-----------------------------------------+------------------------+----------------------+
|
| 3919 |
|
|
|
|
| 3937 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3938 |
<span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3939 |
</span> |
|
| 3940 |
+
Cell: benchmark | 730.34s
|
| 3941 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3942 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3943 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 4095 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4096 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4097 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4098 |
+
binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 919.007ms 1814.55% 919.007ms 919.007ms 1
|
| 4099 |
+
binned_torch 24.74% 227.809ms 100.00% 920.989ms 920.989ms 0.000us 0.00% 50.650ms 50.650ms 1
|
| 4100 |
+
aten::item 1.86% 17.169ms 26.20% 241.261ms 15.722us 0.000us 0.00% 15.873ms 1.034us 15345
|
| 4101 |
+
aten::_local_scalar_dense 5.94% 54.669ms 24.33% 224.092ms 14.604us 15.872ms 31.34% 15.873ms 1.034us 15345
|
| 4102 |
+
Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 15.872ms 31.34% 15.872ms 1.034us 15345
|
| 4103 |
+
aten::floor_divide 5.47% 50.387ms 13.12% 120.822ms 19.665us 7.812ms 15.43% 7.812ms 1.272us 6144
|
| 4104 |
+
aten::bmm 0.02% 191.383us 0.03% 231.124us 38.521us 7.592ms 14.99% 7.592ms 1.265ms 6
|
| 4105 |
+
ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 7.592ms 14.99% 7.592ms 1.265ms 6
|
| 4106 |
+
aten::copy_ 3.61% 33.260ms 9.01% 82.984ms 13.480us 6.583ms 13.00% 6.585ms 1.070us 6156
|
| 4107 |
+
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.579ms 12.99% 6.579ms 1.069us 6153
|
| 4108 |
+
aten::mul 3.25% 29.933ms 5.69% 52.377ms 17.000us 4.706ms 9.29% 4.706ms 1.527us 3081
|
| 4109 |
+
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 4.478ms 8.84% 4.478ms 1.458us 3072
|
| 4110 |
+
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 4.159ms 8.21% 4.159ms 1.354us 3072
|
| 4111 |
+
aten::remainder 3.14% 28.956ms 4.78% 44.045ms 14.337us 3.839ms 7.58% 3.839ms 1.250us 3072
|
| 4112 |
+
aten::add 2.87% 26.444ms 4.82% 44.437ms 14.651us 3.761ms 7.43% 3.761ms 1.240us 3033
|
| 4113 |
+
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 3.655ms 7.22% 3.655ms 1.190us 3072
|
| 4114 |
+
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 3.365ms 6.64% 3.365ms 1.110us 3030
|
| 4115 |
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.023ms 3.99% 2.023ms 1.317us 1536
|
| 4116 |
+
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 1.816ms 3.58% 1.816ms 1.182us 1536
|
| 4117 |
+
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 287.650us 0.57% 287.650us 47.942us 6
|
| 4118 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4119 |
+
Self CPU time total: 920.998ms
|
| 4120 |
+
Self CUDA time total: 50.647ms
|
| 4121 |
|
| 4122 |
|
| 4123 |
|
|
|
|
| 4127 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4128 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4129 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4130 |
+
binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 934.694ms 1714.22% 934.694ms 934.694ms 1
|
| 4131 |
+
binned_torch 24.25% 226.767ms 100.00% 935.247ms 935.247ms 0.000us 0.00% 54.534ms 54.534ms 1
|
| 4132 |
+
aten::item 1.76% 16.424ms 27.79% 259.914ms 15.348us 0.000us 0.00% 17.987ms 1.062us 16935
|
| 4133 |
+
aten::_local_scalar_dense 6.05% 56.595ms 26.03% 243.490ms 14.378us 17.985ms 32.98% 17.987ms 1.062us 16935
|
| 4134 |
+
Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 17.985ms 32.98% 17.985ms 1.062us 16935
|
| 4135 |
+
aten::floor_divide 5.13% 47.972ms 12.39% 115.852ms 18.856us 7.812ms 14.33% 7.813ms 1.272us 6144
|
| 4136 |
+
aten::bmm 0.02% 166.771us 0.02% 207.402us 34.567us 7.794ms 14.29% 7.794ms 1.299ms 6
|
| 4137 |
+
ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 7.794ms 14.29% 7.794ms 1.299ms 6
|
| 4138 |
+
aten::copy_ 3.47% 32.488ms 8.51% 79.554ms 12.923us 6.633ms 12.17% 6.635ms 1.078us 6156
|
| 4139 |
+
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.630ms 12.16% 6.630ms 1.078us 6153
|
| 4140 |
+
aten::add 4.14% 38.686ms 7.06% 65.992ms 14.368us 5.259ms 9.64% 5.259ms 1.145us 4593
|
| 4141 |
+
aten::mul 3.02% 28.215ms 5.35% 50.047ms 16.244us 4.701ms 8.62% 4.701ms 1.526us 3081
|
| 4142 |
+
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 4.474ms 8.21% 4.474ms 1.457us 3072
|
| 4143 |
+
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 4.157ms 7.62% 4.157ms 1.353us 3072
|
| 4144 |
+
aten::remainder 2.81% 26.265ms 4.43% 41.468ms 13.499us 3.852ms 7.06% 3.852ms 1.254us 3072
|
| 4145 |
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 3.655ms 6.70% 3.655ms 1.190us 3072
|
| 4146 |
+
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 3.270ms 6.00% 3.270ms 1.079us 3030
|
| 4147 |
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.030ms 3.72% 2.030ms 1.322us 1536
|
| 4148 |
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 1.822ms 3.34% 1.822ms 1.186us 1536
|
| 4149 |
+
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 1.584ms 2.91% 1.584ms 1.015us 1560
|
| 4150 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4151 |
+
Self CPU time total: 935.255ms
|
| 4152 |
+
Self CUDA time total: 54.526ms
|
| 4153 |
|
| 4154 |
|
| 4155 |
|
|
|
|
| 4159 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4160 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4161 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4162 |
+
binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 1.775s 1705.66% 1.775s 1.775s 1
|
| 4163 |
+
binned_torch 24.39% 432.670ms 100.00% 1.774s 1.774s 0.000us 0.00% 104.087ms 104.087ms 1
|
| 4164 |
+
aten::item 1.67% 29.627ms 26.26% 465.825ms 15.266us 0.000us 0.00% 31.856ms 1.044us 30513
|
| 4165 |
+
aten::_local_scalar_dense 5.88% 104.231ms 24.59% 436.198ms 14.295us 31.854ms 30.61% 31.856ms 1.044us 30513
|
| 4166 |
+
Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 31.854ms 30.61% 31.854ms 1.044us 30513
|
| 4167 |
+
aten::floor_divide 5.49% 97.404ms 13.46% 238.769ms 19.431us 15.611ms 15.00% 15.612ms 1.270us 12288
|
| 4168 |
+
aten::bmm 0.01% 215.332us 0.01% 258.864us 43.144us 15.009ms 14.42% 15.009ms 2.502ms 6
|
| 4169 |
+
ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 15.009ms 14.42% 15.009ms 2.502ms 6
|
| 4170 |
+
aten::copy_ 3.73% 66.187ms 9.04% 160.371ms 13.038us 13.330ms 12.81% 13.331ms 1.084us 12300
|
| 4171 |
+
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 13.326ms 12.80% 13.326ms 1.084us 12294
|
| 4172 |
+
aten::mul 3.16% 56.128ms 5.72% 101.496ms 16.495us 11.275ms 10.83% 11.277ms 1.833us 6153
|
| 4173 |
+
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 9.921ms 9.53% 9.921ms 1.615us 6144
|
| 4174 |
+
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 8.311ms 7.99% 8.311ms 1.353us 6144
|
| 4175 |
+
aten::remainder 3.23% 57.334ms 5.09% 90.371ms 14.709us 7.676ms 7.38% 7.678ms 1.250us 6144
|
| 4176 |
+
aten::add 2.88% 51.067ms 5.02% 88.987ms 15.049us 7.641ms 7.34% 7.642ms 1.292us 5913
|
| 4177 |
+
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 7.300ms 7.01% 7.300ms 1.188us 6144
|
| 4178 |
+
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 6.359ms 6.11% 6.359ms 1.076us 5910
|
| 4179 |
+
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 4.045ms 3.89% 4.045ms 1.317us 3072
|
| 4180 |
+
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 3.632ms 3.49% 3.632ms 1.182us 3072
|
| 4181 |
+
aten::clamp 0.00% 74.963us 0.01% 122.824us 20.471us 1.191ms 1.14% 1.191ms 198.444us 6
|
| 4182 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4183 |
+
Self CPU time total: 1.774s
|
| 4184 |
+
Self CUDA time total: 104.078ms
|
| 4185 |
|
| 4186 |
|
| 4187 |
|
|
|
|
| 4191 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4192 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4193 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4194 |
+
binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 1.943s 1756.79% 1.943s 1.943s 1
|
| 4195 |
+
binned_torch 24.29% 471.728ms 100.00% 1.942s 1.942s 0.000us 0.00% 110.592ms 110.592ms 1
|
| 4196 |
+
aten::item 1.62% 31.476ms 26.94% 523.166ms 15.511us 0.000us 0.00% 35.330ms 1.047us 33729
|
| 4197 |
+
aten::_local_scalar_dense 6.11% 118.659ms 25.32% 491.691ms 14.578us 35.327ms 31.95% 35.330ms 1.047us 33729
|
| 4198 |
+
Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 35.327ms 31.95% 35.327ms 1.047us 33728
|
| 4199 |
+
aten::floor_divide 5.19% 100.816ms 12.43% 241.273ms 19.635us 15.609ms 14.12% 15.611ms 1.270us 12288
|
| 4200 |
+
aten::bmm 0.01% 222.165us 0.01% 267.105us 44.517us 15.085ms 13.64% 15.085ms 2.514ms 6
|
| 4201 |
+
ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 15.085ms 13.64% 15.085ms 2.514ms 6
|
| 4202 |
+
aten::copy_ 3.60% 69.833ms 8.76% 170.090ms 13.828us 13.355ms 12.08% 13.357ms 1.086us 12300
|
| 4203 |
+
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 13.353ms 12.07% 13.353ms 1.086us 12294
|
| 4204 |
+
aten::mul 2.94% 57.042ms 5.32% 103.331ms 16.794us 10.942ms 9.89% 10.942ms 1.778us 6153
|
| 4205 |
+
aten::add 3.88% 75.326ms 6.94% 134.721ms 14.806us 10.866ms 9.83% 10.866ms 1.194us 9099
|
| 4206 |
+
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 9.591ms 8.67% 9.591ms 1.561us 6144
|
| 4207 |
+
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 8.314ms 7.52% 8.314ms 1.353us 6144
|
| 4208 |
+
aten::remainder 2.77% 53.827ms 4.45% 86.321ms 14.050us 7.697ms 6.96% 7.697ms 1.253us 6144
|
| 4209 |
+
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 7.295ms 6.60% 7.295ms 1.187us 6144
|
| 4210 |
+
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 6.370ms 5.76% 6.370ms 1.078us 5910
|
| 4211 |
+
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 4.058ms 3.67% 4.058ms 1.321us 3072
|
| 4212 |
+
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 3.639ms 3.29% 3.639ms 1.185us 3072
|
| 4213 |
+
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 3.234ms 2.92% 3.234ms 1.015us 3186
|
| 4214 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4215 |
+
Self CPU time total: 1.942s
|
| 4216 |
+
Self CUDA time total: 110.585ms
|
| 4217 |
|
| 4218 |
|
| 4219 |
|
|
|
|
| 4223 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4224 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4225 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4226 |
+
binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 3.554s 1668.92% 3.554s 3.554s 1
|
| 4227 |
+
binned_torch 24.03% 852.954ms 100.00% 3.549s 3.549s 0.000us 0.00% 212.979ms 212.979ms 1
|
| 4228 |
+
Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 63.933ms 30.02% 63.933ms 1.038us 61586
|
| 4229 |
+
aten::item 1.68% 59.518ms 26.66% 946.248ms 15.364us 0.000us 0.00% 63.933ms 1.038us 61587
|
| 4230 |
+
aten::_local_scalar_dense 6.15% 218.157ms 24.98% 886.634ms 14.396us 63.932ms 30.02% 63.933ms 1.038us 61587
|
| 4231 |
+
aten::floor_divide 5.36% 190.145ms 13.28% 471.339ms 19.179us 31.621ms 14.85% 31.623ms 1.287us 24576
|
| 4232 |
+
aten::bmm 0.01% 230.233us 0.01% 275.904us 45.984us 28.855ms 13.55% 28.855ms 4.809ms 6
|
| 4233 |
+
ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 28.855ms 13.55% 28.855ms 4.809ms 6
|
| 4234 |
+
aten::copy_ 3.84% 136.428ms 9.38% 333.073ms 13.546us 26.747ms 12.56% 26.749ms 1.088us 24588
|
| 4235 |
+
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 26.744ms 12.56% 26.744ms 1.088us 24582
|
| 4236 |
+
aten::mul 3.20% 113.415ms 5.79% 205.629ms 16.722us 25.614ms 12.03% 25.614ms 2.083us 12297
|
| 4237 |
+
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.161ms 10.41% 22.161ms 1.803us 12288
|
| 4238 |
+
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 17.018ms 7.99% 17.018ms 1.385us 12288
|
| 4239 |
+
aten::add 2.93% 103.833ms 5.19% 184.217ms 14.843us 16.665ms 7.83% 16.666ms 1.343us 12411
|
| 4240 |
+
aten::remainder 3.13% 110.979ms 5.01% 177.878ms 14.476us 15.442ms 7.25% 15.444ms 1.257us 12288
|
| 4241 |
+
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 14.607ms 6.86% 14.607ms 1.189us 12288
|
| 4242 |
+
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.543ms 6.36% 13.543ms 1.091us 12408
|
| 4243 |
+
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 8.136ms 3.82% 8.136ms 1.324us 6144
|
| 4244 |
+
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 7.305ms 3.43% 7.305ms 1.189us 6144
|
| 4245 |
+
aten::clamp 0.00% 80.604us 0.00% 131.123us 21.854us 2.608ms 1.22% 2.608ms 434.678us 6
|
| 4246 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4247 |
+
Self CPU time total: 3.549s
|
| 4248 |
+
Self CUDA time total: 212.971ms
|
| 4249 |
|
| 4250 |
|
| 4251 |
|
|
|
|
| 4255 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4256 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4257 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4258 |
+
binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 3.834s 1701.16% 3.834s 3.834s 1
|
| 4259 |
+
binned_torch 23.91% 917.039ms 100.00% 3.836s 3.836s 0.000us 0.00% 225.394ms 225.394ms 1
|
| 4260 |
+
aten::item 1.70% 65.086ms 27.21% 1.044s 15.386us 0.000us 0.00% 70.210ms 1.035us 67845
|
| 4261 |
+
aten::_local_scalar_dense 6.32% 242.356ms 25.52% 978.758ms 14.426us 70.207ms 31.15% 70.210ms 1.035us 67845
|
| 4262 |
+
Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 70.207ms 31.15% 70.207ms 1.035us 67840
|
| 4263 |
+
aten::floor_divide 5.09% 195.347ms 12.48% 478.676ms 19.477us 31.474ms 13.97% 31.481ms 1.281us 24576
|
| 4264 |
+
ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 28.832ms 12.79% 28.832ms 4.805ms 6
|
| 4265 |
+
aten::bmm 0.01% 227.473us 0.01% 274.364us 45.727us 28.832ms 12.79% 28.832ms 4.805ms 6
|
| 4266 |
+
aten::copy_ 3.61% 138.479ms 8.82% 338.314ms 13.759us 26.687ms 11.84% 26.689ms 1.085us 24588
|
| 4267 |
+
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 26.685ms 11.84% 26.685ms 1.086us 24581
|
| 4268 |
+
aten::mul 2.97% 113.735ms 5.38% 206.436ms 16.787us 25.537ms 11.33% 25.539ms 2.077us 12297
|
| 4269 |
+
aten::add 4.18% 160.247ms 7.41% 284.235ms 15.249us 23.217ms 10.30% 23.217ms 1.246us 18639
|
| 4270 |
+
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.084ms 9.80% 22.084ms 1.797us 12288
|
| 4271 |
+
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.963ms 7.53% 16.963ms 1.381us 12287
|
| 4272 |
+
aten::remainder 2.89% 110.779ms 4.66% 178.579ms 14.533us 15.327ms 6.80% 15.329ms 1.247us 12288
|
| 4273 |
+
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 14.512ms 6.44% 14.512ms 1.181us 12287
|
| 4274 |
+
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.655ms 6.06% 13.655ms 1.101us 12407
|
| 4275 |
+
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 8.083ms 3.59% 8.083ms 1.316us 6144
|
| 4276 |
+
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 7.244ms 3.21% 7.244ms 1.179us 6144
|
| 4277 |
+
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.461ms 2.87% 6.461ms 1.037us 6228
|
| 4278 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4279 |
+
Self CPU time total: 3.836s
|
| 4280 |
+
Self CUDA time total: 225.376ms
|
| 4281 |
|
| 4282 |
|
| 4283 |
|
|
|
|
| 4287 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4288 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4289 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4290 |
+
binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 7.307s 1714.16% 7.307s 7.307s 1
|
| 4291 |
+
binned_torch 24.10% 1.762s 100.00% 7.313s 7.313s 0.000us 0.00% 426.284ms 426.284ms 1
|
| 4292 |
+
aten::item 1.74% 126.959ms 26.39% 1.930s 15.721us 0.000us 0.00% 128.245ms 1.045us 122763
|
| 4293 |
+
aten::_local_scalar_dense 6.22% 454.984ms 24.65% 1.803s 14.685us 128.239ms 30.08% 128.245ms 1.045us 122763
|
| 4294 |
+
Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 128.241ms 30.08% 128.241ms 1.045us 122762
|
| 4295 |
+
aten::floor_divide 5.53% 404.463ms 13.23% 967.808ms 19.690us 63.393ms 14.87% 63.393ms 1.290us 49152
|
| 4296 |
+
aten::bmm 0.00% 234.623us 0.00% 278.223us 46.371us 56.525ms 13.26% 56.525ms 9.421ms 6
|
| 4297 |
+
ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 56.525ms 13.26% 56.525ms 9.421ms 6
|
| 4298 |
+
aten::copy_ 4.05% 295.852ms 9.44% 690.402ms 14.045us 53.639ms 12.58% 53.640ms 1.091us 49158
|
| 4299 |
+
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 53.636ms 12.58% 53.636ms 1.091us 49154
|
| 4300 |
+
aten::mul 3.24% 237.068ms 5.73% 419.319ms 17.056us 51.499ms 12.08% 51.504ms 2.095us 24585
|
| 4301 |
+
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 44.577ms 10.46% 44.577ms 1.814us 24576
|
| 4302 |
+
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 34.181ms 8.02% 34.181ms 1.391us 24576
|
| 4303 |
+
aten::add 2.92% 213.232ms 5.07% 370.760ms 15.173us 33.603ms 7.88% 33.606ms 1.375us 24435
|
| 4304 |
+
aten::remainder 3.14% 229.281ms 5.03% 367.714ms 14.962us 30.916ms 7.25% 30.921ms 1.258us 24576
|
| 4305 |
+
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 29.214ms 6.85% 29.214ms 1.189us 24576
|
| 4306 |
+
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 26.954ms 6.32% 26.954ms 1.103us 24431
|
| 4307 |
+
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.285ms 3.82% 16.285ms 1.325us 12288
|
| 4308 |
+
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 14.630ms 3.43% 14.630ms 1.191us 12288
|
| 4309 |
+
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 5.208ms 1.22% 5.208ms 868.029us 6
|
| 4310 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4311 |
+
Self CPU time total: 7.313s
|
| 4312 |
+
Self CUDA time total: 426.263ms
|
| 4313 |
|
| 4314 |
|
| 4315 |
|
|
|
|
| 4319 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4320 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4321 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4322 |
+
binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 7.520s 1665.26% 7.520s 7.520s 1
|
| 4323 |
+
binned_torch 23.83% 1.792s 100.00% 7.522s 7.522s 0.000us 0.00% 451.603ms 451.603ms 1
|
| 4324 |
+
aten::item 1.82% 136.877ms 27.31% 2.054s 15.246us 0.000us 0.00% 140.837ms 1.045us 134715
|
| 4325 |
+
aten::_local_scalar_dense 6.26% 471.062ms 25.49% 1.917s 14.230us 140.825ms 31.19% 140.837ms 1.045us 134715
|
| 4326 |
+
Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 140.826ms 31.19% 140.826ms 1.045us 134706
|
| 4327 |
+
aten::floor_divide 5.15% 387.087ms 12.45% 936.766ms 19.059us 63.494ms 14.06% 63.499ms 1.292us 49152
|
| 4328 |
+
aten::bmm 0.00% 222.563us 0.00% 265.513us 44.252us 56.696ms 12.56% 56.696ms 9.449ms 6
|
| 4329 |
+
ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 56.696ms 12.56% 56.696ms 9.449ms 6
|
| 4330 |
+
aten::copy_ 3.71% 279.306ms 8.85% 665.315ms 13.534us 53.897ms 11.94% 53.900ms 1.096us 49158
|
| 4331 |
+
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 53.894ms 11.94% 53.894ms 1.097us 49149
|
| 4332 |
+
aten::mul 3.04% 228.311ms 5.39% 405.691ms 16.502us 51.688ms 11.45% 51.695ms 2.103us 24585
|
| 4333 |
+
aten::add 4.00% 300.523ms 6.98% 525.049ms 14.441us 45.565ms 10.09% 45.568ms 1.253us 36357
|
| 4334 |
+
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 44.621ms 9.88% 44.621ms 1.816us 24576
|
| 4335 |
+
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 34.193ms 7.57% 34.193ms 1.391us 24573
|
| 4336 |
+
aten::remainder 2.86% 215.282ms 4.58% 344.226ms 14.007us 30.855ms 6.83% 30.857ms 1.256us 24576
|
| 4337 |
+
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 29.302ms 6.49% 29.302ms 1.192us 24573
|
| 4338 |
+
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 26.656ms 5.90% 26.656ms 1.091us 24431
|
| 4339 |
+
void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.266ms 3.60% 16.266ms 1.324us 12288
|
| 4340 |
+
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 14.588ms 3.23% 14.588ms 1.187us 12288
|
| 4341 |
+
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 12.278ms 2.72% 12.278ms 1.030us 11922
|
| 4342 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4343 |
+
Self CPU time total: 7.522s
|
| 4344 |
+
Self CUDA time total: 451.562ms
|
| 4345 |
|
| 4346 |
|
| 4347 |
impl wl p50(ms) ok
|
| 4348 |
+
binned_torch cuda_B1_S1024_E2 383.31 True
|
| 4349 |
+
binned_torch cuda_B1_S1024_E4 421.42 True
|
| 4350 |
+
binned_torch cuda_B1_S512_E2 157.73 True
|
| 4351 |
+
binned_torch cuda_B1_S512_E4 204.82 True
|
| 4352 |
+
binned_torch cuda_B4_S1024_E2 1513.71 True
|
| 4353 |
+
binned_torch cuda_B4_S1024_E4 1658.74 True
|
| 4354 |
+
binned_torch cuda_B4_S512_E2 773.70 True
|
| 4355 |
+
binned_torch cuda_B4_S512_E4 840.01 True
|
| 4356 |
</pre></div>
|
| 4357 |
<div class="cell-artifacts">
|
| 4358 |
<h4>Artifacts:</h4>
|
openai_moe/impls/gpt_oss_moe.html
CHANGED
|
@@ -3874,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 3874 |
<div class="system-info">
|
| 3875 |
<div class="system-info-header">Generated on:</div>
|
| 3876 |
<div class="system-info-content">
|
| 3877 |
-
Linux x86_64 | Linux-6.12.
|
| 3878 |
</div>
|
| 3879 |
</div>
|
| 3880 |
|
|
@@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 3888 |
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3889 |
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3890 |
</span> |
|
| 3891 |
-
Cell: nv | 0.
|
| 3892 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3893 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3894 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3905,16 +3905,16 @@ Cell: nv | 0.22s
|
|
| 3905 |
</div>
|
| 3906 |
</div>
|
| 3907 |
<div id="output-nv" class="cell-output">
|
| 3908 |
-
<div class="cell-stdout"><pre class="stdout-text">
|
| 3909 |
+-----------------------------------------------------------------------------------------+
|
| 3910 |
-
| NVIDIA-SMI 580.
|
| 3911 |
+-----------------------------------------+------------------------+----------------------+
|
| 3912 |
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
|
| 3913 |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
|
| 3914 |
| | | MIG M. |
|
| 3915 |
|=========================================+========================+======================|
|
| 3916 |
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
|
| 3917 |
-
| N/A
|
| 3918 |
| | | N/A |
|
| 3919 |
+-----------------------------------------+------------------------+----------------------+
|
| 3920 |
|
|
@@ -3938,7 +3938,7 @@ Cell: nv | 0.22s
|
|
| 3938 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3939 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3940 |
</span> |
|
| 3941 |
-
Cell: benchmark |
|
| 3942 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3943 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3944 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -4042,29 +4042,29 @@ PROFILE TRACE: gpt_oss_experts | cuda_B1_S512_E2
|
|
| 4042 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4043 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4044 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4045 |
-
gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 10.
|
| 4046 |
-
gpt_oss_experts 15.
|
| 4047 |
-
aten::matmul 0.
|
| 4048 |
-
aten::mm 2.
|
| 4049 |
-
ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4050 |
-
void cutlass::Kernel2<cutlass_80_simt_sgemm_128x64_8... 0.00% 0.000us 0.00% 0.000us 0.000us 1.
|
| 4051 |
-
aten::mul 1.
|
| 4052 |
-
aten::add 1.
|
| 4053 |
-
aten::index 1.
|
| 4054 |
-
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4055 |
-
aten::index_add_ 0.
|
| 4056 |
-
void at::native::indexFuncLargeIndex<float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4057 |
-
aten::nonzero 2.
|
| 4058 |
-
aten::clamp
|
| 4059 |
-
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4060 |
-
aten::where 0.06% 7.
|
| 4061 |
-
aten::nonzero_numpy 0.
|
| 4062 |
-
void at::native::vectorized_gather_kernel<16, long>(... 0.00% 0.000us 0.00% 0.000us 0.000us 60.
|
| 4063 |
-
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 56.
|
| 4064 |
-
Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4065 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4066 |
-
Self CPU time total: 12.
|
| 4067 |
-
Self CUDA time total: 5.
|
| 4068 |
|
| 4069 |
|
| 4070 |
|
|
@@ -4074,29 +4074,29 @@ PROFILE TRACE: gpt_oss_experts | cuda_B1_S512_E4
|
|
| 4074 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4075 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4076 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4077 |
-
gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 13.
|
| 4078 |
-
gpt_oss_experts
|
| 4079 |
-
aten::matmul 0.
|
| 4080 |
-
aten::mm 2.
|
| 4081 |
-
ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 5.
|
| 4082 |
-
aten::nonzero 2.
|
| 4083 |
-
aten::mul 1.
|
| 4084 |
-
aten::add 2.
|
| 4085 |
-
aten::where 0.
|
| 4086 |
-
aten::nonzero_numpy 0.13% 20.
|
| 4087 |
-
aten::index 2.
|
| 4088 |
-
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4089 |
-
Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4090 |
-
aten::clamp 1.
|
| 4091 |
-
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4092 |
-
aten::item 0.49%
|
| 4093 |
-
aten::_local_scalar_dense 1.
|
| 4094 |
-
aten::index_add_ 0.59%
|
| 4095 |
-
void at::native::indexFuncLargeIndex<float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 71.
|
| 4096 |
-
void at::native::vectorized_gather_kernel<16, long>(... 0.00% 0.000us 0.00% 0.000us 0.000us 66.
|
| 4097 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4098 |
-
Self CPU time total:
|
| 4099 |
-
Self CUDA time total: 6.
|
| 4100 |
|
| 4101 |
|
| 4102 |
|
|
@@ -4106,29 +4106,29 @@ PROFILE TRACE: gpt_oss_experts | cuda_B1_S1024_E2
|
|
| 4106 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4107 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4108 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4109 |
-
gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 12.
|
| 4110 |
-
gpt_oss_experts 11.
|
| 4111 |
-
aten::matmul 0.
|
| 4112 |
-
aten::mm 1.
|
| 4113 |
-
void cutlass::Kernel2<cutlass_80_simt_sgemm_256x128_... 0.00% 0.000us 0.00% 0.000us 0.000us 4.
|
| 4114 |
-
ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 1.
|
| 4115 |
-
void cutlass::Kernel2<cutlass_80_simt_sgemm_128x64_8... 0.00% 0.000us 0.00% 0.000us 0.000us 1.
|
| 4116 |
-
aten::mul 1.00%
|
| 4117 |
-
aten::add 1.
|
| 4118 |
-
aten::index_add_ 0.32%
|
| 4119 |
-
void at::native::indexFuncLargeIndex<float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 164.
|
| 4120 |
-
|
| 4121 |
-
|
| 4122 |
-
void at::native::vectorized_gather_kernel<16, long>(... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4123 |
-
aten::clamp 0.
|
| 4124 |
-
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4125 |
-
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4126 |
-
aten::nonzero 1.
|
| 4127 |
-
aten::where 0.04% 5.
|
| 4128 |
-
aten::nonzero_numpy 0.07% 10.
|
| 4129 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4130 |
-
Self CPU time total: 14.
|
| 4131 |
-
Self CUDA time total: 8.
|
| 4132 |
|
| 4133 |
|
| 4134 |
|
|
@@ -4138,29 +4138,29 @@ PROFILE TRACE: gpt_oss_experts | cuda_B1_S1024_E4
|
|
| 4138 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4139 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4140 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4141 |
-
gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 18.
|
| 4142 |
-
gpt_oss_experts
|
| 4143 |
-
aten::matmul 0.
|
| 4144 |
-
aten::mm 2.
|
| 4145 |
-
ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 4146 |
-
void cutlass::Kernel2<cutlass_80_simt_sgemm_128x64_8... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 4147 |
-
aten::mul
|
| 4148 |
-
aten::add 1.
|
| 4149 |
-
aten::index 1.
|
| 4150 |
-
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4151 |
-
aten::index_add_ 0.
|
| 4152 |
-
void at::native::indexFuncLargeIndex<float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4153 |
-
aten::nonzero 1.
|
| 4154 |
-
void at::native::vectorized_gather_kernel<16, long>(... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4155 |
-
aten::where 0.05% 10.
|
| 4156 |
-
aten::nonzero_numpy 0.10% 20.
|
| 4157 |
-
aten::clamp 1.
|
| 4158 |
-
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4159 |
-
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4160 |
-
Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4161 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4162 |
-
Self CPU time total: 20.
|
| 4163 |
-
Self CUDA time total: 10.
|
| 4164 |
|
| 4165 |
|
| 4166 |
|
|
@@ -4170,29 +4170,29 @@ PROFILE TRACE: gpt_oss_experts | cuda_B4_S512_E2
|
|
| 4170 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4171 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4172 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4173 |
-
gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4174 |
-
gpt_oss_experts 7.
|
| 4175 |
-
aten::matmul 0.
|
| 4176 |
-
aten::mm 1.
|
| 4177 |
-
void cutlass::Kernel2<cutlass_80_simt_sgemm_256x128_... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4178 |
-
ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 5.
|
| 4179 |
-
aten::add 0.
|
| 4180 |
-
aten::mul 0.
|
| 4181 |
-
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4182 |
-
aten::index_add_ 0.21%
|
| 4183 |
-
void at::native::indexFuncLargeIndex<float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4184 |
-
aten::clamp 0.46%
|
| 4185 |
-
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4186 |
-
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4187 |
-
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 276.
|
| 4188 |
-
aten::index 0.
|
| 4189 |
-
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4190 |
-
void at::native::vectorized_gather_kernel<16, long>(... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4191 |
-
aten::sigmoid 0.
|
| 4192 |
-
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 176.
|
| 4193 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4194 |
-
Self CPU time total: 23.
|
| 4195 |
-
Self CUDA time total: 17.
|
| 4196 |
|
| 4197 |
|
| 4198 |
|
|
@@ -4202,29 +4202,29 @@ PROFILE TRACE: gpt_oss_experts | cuda_B4_S512_E4
|
|
| 4202 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4203 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4204 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4205 |
-
gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 24.
|
| 4206 |
-
gpt_oss_experts
|
| 4207 |
-
aten::matmul 0.
|
| 4208 |
-
aten::mm
|
| 4209 |
-
void cutlass::Kernel2<cutlass_80_simt_sgemm_256x128_... 0.00% 0.000us 0.00% 0.000us 0.000us 9.
|
| 4210 |
-
ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4211 |
-
void cutlass::Kernel2<cutlass_80_simt_sgemm_128x64_8... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 4212 |
-
aten::add 1.
|
| 4213 |
-
aten::mul 1.
|
| 4214 |
-
aten::index_add_ 0.
|
| 4215 |
-
void at::native::indexFuncLargeIndex<float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4216 |
-
aten::index 1.
|
| 4217 |
-
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 337.
|
| 4218 |
-
void at::native::vectorized_gather_kernel<16, long>(... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4219 |
-
aten::clamp 0.
|
| 4220 |
-
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4221 |
-
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 218.
|
| 4222 |
-
aten::nonzero 1.
|
| 4223 |
-
aten::where 0.04%
|
| 4224 |
-
aten::nonzero_numpy 0.
|
| 4225 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4226 |
-
Self CPU time total:
|
| 4227 |
-
Self CUDA time total: 17.
|
| 4228 |
|
| 4229 |
|
| 4230 |
|
|
@@ -4234,29 +4234,29 @@ PROFILE TRACE: gpt_oss_experts | cuda_B4_S1024_E2
|
|
| 4234 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4235 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4236 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4237 |
-
gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 40.
|
| 4238 |
-
gpt_oss_experts 4.
|
| 4239 |
-
aten::matmul 0.05%
|
| 4240 |
-
aten::mm 0.
|
| 4241 |
-
void cutlass::Kernel2<cutlass_80_simt_sgemm_256x128_... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4242 |
-
aten::mul 0.37%
|
| 4243 |
-
aten::add 0.45% 185.
|
| 4244 |
-
aten::clamp 0.
|
| 4245 |
-
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 4246 |
-
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 1.
|
| 4247 |
-
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 1.
|
| 4248 |
-
aten::index_add_ 0.12% 48.
|
| 4249 |
-
void at::native::indexFuncLargeIndex<float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4250 |
-
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4251 |
-
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4252 |
-
aten::index 0.
|
| 4253 |
-
void at::native::vectorized_gather_kernel<16, long>(... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4254 |
-
aten::sigmoid 0.
|
| 4255 |
-
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4256 |
-
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4257 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4258 |
-
Self CPU time total:
|
| 4259 |
-
Self CUDA time total:
|
| 4260 |
|
| 4261 |
|
| 4262 |
|
|
@@ -4266,54 +4266,56 @@ PROFILE TRACE: gpt_oss_experts | cuda_B4_S1024_E4
|
|
| 4266 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4267 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4268 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4269 |
-
gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4270 |
-
gpt_oss_experts 6.
|
| 4271 |
-
aten::matmul 0.
|
| 4272 |
-
aten::mm 1.
|
| 4273 |
-
void cutlass::Kernel2<cutlass_80_simt_sgemm_256x128_... 0.00% 0.000us 0.00% 0.000us 0.000us 20.
|
| 4274 |
-
ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 8.
|
| 4275 |
-
aten::add 0.
|
| 4276 |
-
aten::mul 0.72%
|
| 4277 |
-
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4278 |
-
aten::index_add_ 0.23%
|
| 4279 |
-
void at::native::indexFuncLargeIndex<float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4280 |
-
aten::clamp 0.
|
| 4281 |
-
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4282 |
-
aten::index 0.
|
| 4283 |
-
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 648.
|
| 4284 |
-
void at::native::vectorized_gather_kernel<16, long>(... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4285 |
-
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4286 |
-
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4287 |
-
aten::sigmoid 0.17%
|
| 4288 |
-
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4289 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4290 |
-
Self CPU time total:
|
| 4291 |
-
Self CUDA time total:
|
| 4292 |
|
| 4293 |
|
| 4294 |
impl wl p50(ms) ok
|
| 4295 |
-
gpt_oss_experts cuda_B1_S1024_E2 3.
|
| 4296 |
-
gpt_oss_experts cuda_B1_S1024_E4 5.
|
| 4297 |
-
gpt_oss_experts cuda_B1_S512_E2 2.
|
| 4298 |
-
gpt_oss_experts cuda_B1_S512_E4 3.
|
| 4299 |
-
gpt_oss_experts cuda_B4_S1024_E2 13.
|
| 4300 |
-
gpt_oss_experts cuda_B4_S1024_E4 13.
|
| 4301 |
-
gpt_oss_experts cuda_B4_S512_E2 6.
|
| 4302 |
-
gpt_oss_experts cuda_B4_S512_E4 7.
|
| 4303 |
</pre></div>
|
| 4304 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 4305 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4306 |
<div class="uv-logs-content" style="display: none;">
|
| 4307 |
Updating https://github.com/huggingface/kernels.git (HEAD)
|
| 4308 |
-
Updated https://github.com/huggingface/kernels.git (
|
| 4309 |
-
Building kernels @ git+https://github.com/huggingface/kernels.git@
|
| 4310 |
-
Built kernels @ git+https://github.com/huggingface/kernels.git@
|
| 4311 |
-
Installed
|
| 4312 |
</div>
|
| 4313 |
</div>
|
| 4314 |
-
<div class="cell-stderr">Fetching 6 files: 0%| | 0/6 [00:00<?, ?it/s]
|
| 4315 |
-
|
| 4316 |
-
Fetching 6 files:
|
|
|
|
|
|
|
| 4317 |
<div class="cell-artifacts">
|
| 4318 |
<h4>Artifacts:</h4>
|
| 4319 |
<a href="artifacts/benchmark/openai_moe.jsonl" class="artifact" target="_blank">openai_moe.jsonl</a>
|
|
|
|
| 3874 |
<div class="system-info">
|
| 3875 |
<div class="system-info-header">Generated on:</div>
|
| 3876 |
<div class="system-info-content">
|
| 3877 |
+
Linux x86_64 | Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35
|
| 3878 |
</div>
|
| 3879 |
</div>
|
| 3880 |
|
|
|
|
| 3888 |
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3889 |
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3890 |
</span> |
|
| 3891 |
+
Cell: nv | 0.25s
|
| 3892 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3893 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3894 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3905 |
</div>
|
| 3906 |
</div>
|
| 3907 |
<div id="output-nv" class="cell-output">
|
| 3908 |
+
<div class="cell-stdout"><pre class="stdout-text">Fri Dec 19 18:56:28 2025
|
| 3909 |
+-----------------------------------------------------------------------------------------+
|
| 3910 |
+
| NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 |
|
| 3911 |
+-----------------------------------------+------------------------+----------------------+
|
| 3912 |
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
|
| 3913 |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
|
| 3914 |
| | | MIG M. |
|
| 3915 |
|=========================================+========================+======================|
|
| 3916 |
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
|
| 3917 |
+
| N/A 34C P0 80W / 350W | 0MiB / 46068MiB | 41% Default |
|
| 3918 |
| | | N/A |
|
| 3919 |
+-----------------------------------------+------------------------+----------------------+
|
| 3920 |
|
|
|
|
| 3938 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3939 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3940 |
</span> |
|
| 3941 |
+
Cell: benchmark | 24.78s
|
| 3942 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3943 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3944 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 4042 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4043 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4044 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4045 |
+
gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 10.216ms 197.55% 10.216ms 10.216ms 1
|
| 4046 |
+
gpt_oss_experts 15.91% 1.991ms 99.94% 12.506ms 12.506ms 0.000us 0.00% 5.174ms 5.174ms 1
|
| 4047 |
+
aten::matmul 0.20% 25.600us 3.83% 479.475us 39.956us 0.000us 0.00% 4.551ms 379.252us 12
|
| 4048 |
+
aten::mm 2.39% 299.076us 3.63% 453.875us 37.823us 4.551ms 88.01% 4.551ms 379.252us 12
|
| 4049 |
+
ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 3.080ms 59.56% 3.080ms 342.220us 9
|
| 4050 |
+
void cutlass::Kernel2<cutlass_80_simt_sgemm_128x64_8... 0.00% 0.000us 0.00% 0.000us 0.000us 1.465ms 28.33% 1.465ms 488.237us 3
|
| 4051 |
+
aten::mul 1.26% 158.145us 2.18% 272.217us 11.342us 108.227us 2.09% 108.227us 4.509us 24
|
| 4052 |
+
aten::add 1.55% 194.211us 3.70% 462.764us 25.709us 102.178us 1.98% 102.178us 5.677us 18
|
| 4053 |
+
aten::index 1.59% 198.973us 2.67% 334.663us 27.889us 88.354us 1.71% 88.354us 7.363us 12
|
| 4054 |
+
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 79.810us 1.54% 79.810us 6.651us 12
|
| 4055 |
+
aten::index_add_ 0.45% 56.680us 0.73% 90.740us 15.123us 79.552us 1.54% 79.552us 13.259us 6
|
| 4056 |
+
void at::native::indexFuncLargeIndex<float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 79.552us 1.54% 79.552us 13.259us 6
|
| 4057 |
+
aten::nonzero 2.18% 273.387us 6.63% 829.392us 92.155us 65.344us 1.26% 76.032us 8.448us 9
|
| 4058 |
+
aten::clamp 1.03% 129.422us 1.66% 207.823us 17.319us 62.817us 1.21% 62.817us 5.235us 12
|
| 4059 |
+
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 62.817us 1.21% 62.817us 5.235us 12
|
| 4060 |
+
aten::where 0.06% 7.719us 5.20% 651.098us 108.516us 0.000us 0.00% 61.377us 10.230us 6
|
| 4061 |
+
aten::nonzero_numpy 0.10% 11.990us 5.14% 643.379us 107.230us 0.000us 0.00% 61.377us 10.230us 6
|
| 4062 |
+
void at::native::vectorized_gather_kernel<16, long>(... 0.00% 0.000us 0.00% 0.000us 0.000us 60.705us 1.17% 60.705us 10.117us 6
|
| 4063 |
+
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 56.224us 1.09% 56.224us 4.685us 12
|
| 4064 |
+
Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 52.097us 1.01% 52.097us 1.158us 45
|
| 4065 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4066 |
+
Self CPU time total: 12.513ms
|
| 4067 |
+
Self CUDA time total: 5.171ms
|
| 4068 |
|
| 4069 |
|
| 4070 |
|
|
|
|
| 4074 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4075 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4076 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4077 |
+
gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 13.651ms 223.52% 13.651ms 13.651ms 1
|
| 4078 |
+
gpt_oss_experts 16.12% 2.545ms 99.96% 15.780ms 15.780ms 0.000us 0.00% 6.110ms 6.110ms 1
|
| 4079 |
+
aten::matmul 0.27% 42.481us 4.88% 770.802us 32.117us 0.000us 0.00% 5.294ms 220.572us 24
|
| 4080 |
+
aten::mm 2.84% 449.097us 4.61% 728.321us 30.347us 5.294ms 86.68% 5.294ms 220.572us 24
|
| 4081 |
+
ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 5.237ms 85.76% 5.237ms 218.225us 24
|
| 4082 |
+
aten::nonzero 2.39% 377.468us 7.75% 1.223ms 81.521us 114.980us 1.88% 137.541us 9.169us 15
|
| 4083 |
+
aten::mul 1.82% 287.750us 3.13% 494.205us 10.296us 131.291us 2.15% 131.291us 2.735us 48
|
| 4084 |
+
aten::add 2.12% 335.279us 3.54% 558.312us 15.509us 126.947us 2.08% 126.947us 3.526us 36
|
| 4085 |
+
aten::where 0.06% 10.192us 7.29% 1.151ms 95.886us 0.000us 0.00% 123.269us 10.272us 12
|
| 4086 |
+
aten::nonzero_numpy 0.13% 20.434us 7.22% 1.140ms 95.037us 0.000us 0.00% 123.269us 10.272us 12
|
| 4087 |
+
aten::index 2.26% 356.611us 3.79% 598.637us 24.943us 111.201us 1.82% 111.201us 4.633us 24
|
| 4088 |
+
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 100.995us 1.65% 100.995us 4.208us 24
|
| 4089 |
+
Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 92.827us 1.52% 92.827us 1.067us 87
|
| 4090 |
+
aten::clamp 1.32% 208.364us 2.23% 352.254us 14.677us 87.969us 1.44% 87.969us 3.665us 24
|
| 4091 |
+
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 87.969us 1.44% 87.969us 3.665us 24
|
| 4092 |
+
aten::item 0.49% 76.878us 38.49% 6.076ms 84.392us 0.000us 0.00% 76.474us 1.062us 72
|
| 4093 |
+
aten::_local_scalar_dense 1.91% 301.114us 38.00% 5.999ms 83.325us 76.474us 1.25% 76.474us 1.062us 72
|
| 4094 |
+
aten::index_add_ 0.59% 93.433us 0.97% 153.683us 12.807us 71.618us 1.17% 71.618us 5.968us 12
|
| 4095 |
+
void at::native::indexFuncLargeIndex<float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 71.618us 1.17% 71.618us 5.968us 12
|
| 4096 |
+
void at::native::vectorized_gather_kernel<16, long>(... 0.00% 0.000us 0.00% 0.000us 0.000us 66.305us 1.09% 66.305us 5.525us 12
|
| 4097 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4098 |
+
Self CPU time total: 15.786ms
|
| 4099 |
+
Self CUDA time total: 6.107ms
|
| 4100 |
|
| 4101 |
|
| 4102 |
|
|
|
|
| 4106 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4107 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4108 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4109 |
+
gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 12.389ms 148.13% 12.389ms 12.389ms 1
|
| 4110 |
+
gpt_oss_experts 11.41% 1.669ms 99.96% 14.621ms 14.621ms 0.000us 0.00% 8.369ms 8.369ms 1
|
| 4111 |
+
aten::matmul 0.15% 21.391us 2.94% 430.078us 35.840us 0.000us 0.00% 7.346ms 612.203us 12
|
| 4112 |
+
aten::mm 1.75% 256.389us 2.79% 408.687us 34.057us 7.346ms 87.84% 7.346ms 612.203us 12
|
| 4113 |
+
void cutlass::Kernel2<cutlass_80_simt_sgemm_256x128_... 0.00% 0.000us 0.00% 0.000us 0.000us 4.488ms 53.66% 4.488ms 748.004us 6
|
| 4114 |
+
ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 1.464ms 17.50% 1.464ms 487.982us 3
|
| 4115 |
+
void cutlass::Kernel2<cutlass_80_simt_sgemm_128x64_8... 0.00% 0.000us 0.00% 0.000us 0.000us 1.388ms 16.59% 1.388ms 462.616us 3
|
| 4116 |
+
aten::mul 1.00% 145.604us 1.75% 255.696us 10.654us 194.273us 2.32% 194.273us 8.095us 24
|
| 4117 |
+
aten::add 1.43% 208.704us 2.27% 331.465us 18.415us 186.050us 2.22% 186.050us 10.336us 18
|
| 4118 |
+
aten::index_add_ 0.32% 46.701us 0.54% 78.582us 13.097us 164.160us 1.96% 164.160us 27.360us 6
|
| 4119 |
+
void at::native::indexFuncLargeIndex<float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 164.160us 1.96% 164.160us 27.360us 6
|
| 4120 |
+
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 147.425us 1.76% 147.425us 12.285us 12
|
| 4121 |
+
aten::index 1.21% 177.253us 2.08% 304.936us 25.411us 145.886us 1.74% 145.886us 12.157us 12
|
| 4122 |
+
void at::native::vectorized_gather_kernel<16, long>(... 0.00% 0.000us 0.00% 0.000us 0.000us 115.777us 1.38% 115.777us 19.296us 6
|
| 4123 |
+
aten::clamp 0.73% 106.215us 1.25% 183.083us 15.257us 109.858us 1.31% 109.858us 9.155us 12
|
| 4124 |
+
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 109.858us 1.31% 109.858us 9.155us 12
|
| 4125 |
+
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 103.393us 1.24% 103.393us 8.616us 12
|
| 4126 |
+
aten::nonzero 1.57% 229.936us 5.04% 737.613us 81.957us 69.954us 0.84% 81.378us 9.042us 9
|
| 4127 |
+
aten::where 0.04% 5.651us 4.11% 600.652us 100.109us 0.000us 0.00% 66.625us 11.104us 6
|
| 4128 |
+
aten::nonzero_numpy 0.07% 10.392us 4.07% 595.001us 99.167us 0.000us 0.00% 66.625us 11.104us 6
|
| 4129 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4130 |
+
Self CPU time total: 14.627ms
|
| 4131 |
+
Self CUDA time total: 8.364ms
|
| 4132 |
|
| 4133 |
|
| 4134 |
|
|
|
|
| 4138 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4139 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4140 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4141 |
+
gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 18.030ms 173.64% 18.030ms 18.030ms 1
|
| 4142 |
+
gpt_oss_experts 13.01% 2.655ms 99.97% 20.395ms 20.395ms 0.000us 0.00% 10.389ms 10.389ms 1
|
| 4143 |
+
aten::matmul 0.22% 44.301us 3.96% 808.849us 33.702us 0.000us 0.00% 9.112ms 379.676us 24
|
| 4144 |
+
aten::mm 2.30% 469.031us 3.75% 764.548us 31.856us 9.112ms 87.75% 9.112ms 379.676us 24
|
| 4145 |
+
ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 6.210ms 59.81% 6.210ms 345.012us 18
|
| 4146 |
+
void cutlass::Kernel2<cutlass_80_simt_sgemm_128x64_8... 0.00% 0.000us 0.00% 0.000us 0.000us 2.889ms 27.82% 2.889ms 481.470us 6
|
| 4147 |
+
aten::mul 1.42% 289.963us 2.49% 508.435us 10.592us 229.763us 2.21% 229.763us 4.787us 48
|
| 4148 |
+
aten::add 1.72% 350.925us 2.89% 589.949us 16.387us 210.624us 2.03% 210.624us 5.851us 36
|
| 4149 |
+
aten::index 1.71% 348.756us 3.02% 616.348us 25.681us 206.625us 1.99% 206.625us 8.609us 24
|
| 4150 |
+
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 164.800us 1.59% 164.800us 6.867us 24
|
| 4151 |
+
aten::index_add_ 0.46% 94.741us 0.78% 158.583us 13.215us 154.948us 1.49% 154.948us 12.912us 12
|
| 4152 |
+
void at::native::indexFuncLargeIndex<float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 154.948us 1.49% 154.948us 12.912us 12
|
| 4153 |
+
aten::nonzero 1.87% 380.973us 6.27% 1.279ms 85.299us 123.616us 1.19% 148.097us 9.873us 15
|
| 4154 |
+
void at::native::vectorized_gather_kernel<16, long>(... 0.00% 0.000us 0.00% 0.000us 0.000us 147.008us 1.42% 147.008us 12.251us 12
|
| 4155 |
+
aten::where 0.05% 10.520us 5.90% 1.205ms 100.384us 0.000us 0.00% 133.153us 11.096us 12
|
| 4156 |
+
aten::nonzero_numpy 0.10% 20.862us 5.85% 1.194ms 99.507us 0.000us 0.00% 133.153us 11.096us 12
|
| 4157 |
+
aten::clamp 1.12% 227.601us 1.88% 383.872us 15.995us 131.553us 1.27% 131.553us 5.481us 24
|
| 4158 |
+
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 131.553us 1.27% 131.553us 5.481us 24
|
| 4159 |
+
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 117.823us 1.13% 117.823us 4.909us 24
|
| 4160 |
+
Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 108.771us 1.05% 108.771us 1.250us 87
|
| 4161 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4162 |
+
Self CPU time total: 20.401ms
|
| 4163 |
+
Self CUDA time total: 10.384ms
|
| 4164 |
|
| 4165 |
|
| 4166 |
|
|
|
|
| 4170 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4171 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4172 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4173 |
+
gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 20.818ms 119.74% 20.818ms 20.818ms 1
|
| 4174 |
+
gpt_oss_experts 7.44% 1.725ms 99.97% 23.178ms 23.178ms 0.000us 0.00% 17.396ms 17.396ms 1
|
| 4175 |
+
aten::matmul 0.10% 22.710us 1.92% 444.608us 37.051us 0.000us 0.00% 14.530ms 1.211ms 12
|
| 4176 |
+
aten::mm 1.15% 265.607us 1.82% 421.898us 35.158us 14.530ms 83.57% 14.530ms 1.211ms 12
|
| 4177 |
+
void cutlass::Kernel2<cutlass_80_simt_sgemm_256x128_... 0.00% 0.000us 0.00% 0.000us 0.000us 8.913ms 51.26% 8.913ms 1.485ms 6
|
| 4178 |
+
ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 5.608ms 32.26% 5.608ms 934.678us 6
|
| 4179 |
+
aten::add 0.78% 180.710us 1.31% 303.585us 16.866us 773.156us 4.45% 773.156us 42.953us 18
|
| 4180 |
+
aten::mul 0.65% 149.642us 1.11% 257.853us 10.744us 660.963us 3.80% 660.963us 27.540us 24
|
| 4181 |
+
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 496.548us 2.86% 496.548us 41.379us 12
|
| 4182 |
+
aten::index_add_ 0.21% 47.690us 0.35% 80.102us 13.350us 447.875us 2.58% 447.875us 74.646us 6
|
| 4183 |
+
void at::native::indexFuncLargeIndex<float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 447.875us 2.58% 447.875us 74.646us 6
|
| 4184 |
+
aten::clamp 0.46% 106.452us 0.78% 180.843us 15.070us 330.692us 1.90% 330.692us 27.558us 12
|
| 4185 |
+
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 330.692us 1.90% 330.692us 27.558us 12
|
| 4186 |
+
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 303.202us 1.74% 303.202us 50.534us 6
|
| 4187 |
+
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 276.608us 1.59% 276.608us 46.101us 6
|
| 4188 |
+
aten::index 0.79% 182.360us 1.33% 307.754us 25.646us 264.037us 1.52% 264.037us 22.003us 12
|
| 4189 |
+
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 255.650us 1.47% 255.650us 21.304us 12
|
| 4190 |
+
void at::native::vectorized_gather_kernel<16, long>(... 0.00% 0.000us 0.00% 0.000us 0.000us 230.532us 1.33% 230.532us 38.422us 6
|
| 4191 |
+
aten::sigmoid 0.15% 34.019us 0.26% 59.750us 9.958us 176.897us 1.02% 176.897us 29.483us 6
|
| 4192 |
+
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 176.897us 1.02% 176.897us 29.483us 6
|
| 4193 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4194 |
+
Self CPU time total: 23.184ms
|
| 4195 |
+
Self CUDA time total: 17.386ms
|
| 4196 |
|
| 4197 |
|
| 4198 |
|
|
|
|
| 4202 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4203 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4204 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4205 |
+
gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 24.095ms 139.71% 24.095ms 24.095ms 1
|
| 4206 |
+
gpt_oss_experts 10.26% 2.566ms 99.98% 25.007ms 25.007ms 0.000us 0.00% 17.256ms 17.256ms 1
|
| 4207 |
+
aten::matmul 0.18% 46.022us 3.50% 875.333us 36.472us 0.000us 0.00% 15.047ms 626.957us 24
|
| 4208 |
+
aten::mm 2.10% 524.786us 3.32% 829.311us 34.555us 15.047ms 87.25% 15.047ms 626.957us 24
|
| 4209 |
+
void cutlass::Kernel2<cutlass_80_simt_sgemm_256x128_... 0.00% 0.000us 0.00% 0.000us 0.000us 9.083ms 52.67% 9.083ms 756.906us 12
|
| 4210 |
+
ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 3.100ms 17.97% 3.100ms 516.616us 6
|
| 4211 |
+
void cutlass::Kernel2<cutlass_80_simt_sgemm_128x64_8... 0.00% 0.000us 0.00% 0.000us 0.000us 2.851ms 16.53% 2.851ms 475.118us 6
|
| 4212 |
+
aten::add 1.39% 348.094us 2.36% 591.130us 16.420us 420.966us 2.44% 420.966us 11.694us 36
|
| 4213 |
+
aten::mul 1.18% 295.904us 2.08% 520.297us 10.840us 412.933us 2.39% 412.933us 8.603us 48
|
| 4214 |
+
aten::index_add_ 0.37% 93.743us 0.64% 158.984us 13.249us 378.655us 2.20% 378.655us 31.555us 12
|
| 4215 |
+
void at::native::indexFuncLargeIndex<float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 378.655us 2.20% 378.655us 31.555us 12
|
| 4216 |
+
aten::index 1.44% 360.181us 2.46% 616.468us 25.686us 341.602us 1.98% 341.602us 14.233us 24
|
| 4217 |
+
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 337.478us 1.96% 337.478us 14.062us 24
|
| 4218 |
+
void at::native::vectorized_gather_kernel<16, long>(... 0.00% 0.000us 0.00% 0.000us 0.000us 277.186us 1.61% 277.186us 23.099us 12
|
| 4219 |
+
aten::clamp 0.86% 215.346us 1.46% 365.788us 15.241us 227.201us 1.32% 227.201us 9.467us 24
|
| 4220 |
+
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 227.201us 1.32% 227.201us 9.467us 24
|
| 4221 |
+
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 218.148us 1.26% 218.148us 9.090us 24
|
| 4222 |
+
aten::nonzero 1.58% 395.427us 5.08% 1.271ms 84.763us 129.407us 0.75% 155.998us 10.400us 15
|
| 4223 |
+
aten::where 0.04% 10.161us 4.81% 1.203ms 100.233us 0.000us 0.00% 140.318us 11.693us 12
|
| 4224 |
+
aten::nonzero_numpy 0.09% 22.657us 4.77% 1.193ms 99.386us 0.000us 0.00% 140.318us 11.693us 12
|
| 4225 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4226 |
+
Self CPU time total: 25.012ms
|
| 4227 |
+
Self CUDA time total: 17.246ms
|
| 4228 |
|
| 4229 |
|
| 4230 |
|
|
|
|
| 4234 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4235 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4236 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4237 |
+
gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 40.042ms 109.28% 40.042ms 40.042ms 1
|
| 4238 |
+
gpt_oss_experts 4.23% 1.729ms 99.82% 40.817ms 40.817ms 0.000us 0.00% 36.674ms 36.674ms 1
|
| 4239 |
+
aten::matmul 0.05% 21.410us 1.03% 421.330us 35.111us 0.000us 0.00% 26.675ms 2.223ms 12
|
| 4240 |
+
aten::mm 0.68% 276.698us 0.98% 399.920us 33.327us 26.675ms 72.80% 26.675ms 2.223ms 12
|
| 4241 |
+
void cutlass::Kernel2<cutlass_80_simt_sgemm_256x128_... 0.00% 0.000us 0.00% 0.000us 0.000us 26.671ms 72.79% 26.671ms 2.223ms 12
|
| 4242 |
+
aten::mul 0.37% 150.524us 0.64% 261.025us 10.876us 2.978ms 8.13% 2.978ms 124.096us 24
|
| 4243 |
+
aten::add 0.45% 185.051us 1.06% 431.657us 23.981us 2.397ms 6.54% 2.397ms 133.144us 18
|
| 4244 |
+
aten::clamp 0.27% 109.540us 0.45% 185.742us 15.479us 2.388ms 6.52% 2.388ms 199.031us 12
|
| 4245 |
+
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 2.388ms 6.52% 2.388ms 199.031us 12
|
| 4246 |
+
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 1.988ms 5.43% 1.988ms 165.705us 12
|
| 4247 |
+
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 1.624ms 4.43% 1.624ms 135.337us 12
|
| 4248 |
+
aten::index_add_ 0.12% 48.010us 0.20% 82.940us 13.823us 919.238us 2.51% 919.238us 153.206us 6
|
| 4249 |
+
void at::native::indexFuncLargeIndex<float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 919.238us 2.51% 919.238us 153.206us 6
|
| 4250 |
+
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 772.550us 2.11% 772.550us 128.758us 6
|
| 4251 |
+
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 739.366us 2.02% 739.366us 123.228us 6
|
| 4252 |
+
aten::index 0.45% 182.853us 0.76% 309.646us 25.804us 710.532us 1.94% 710.532us 59.211us 12
|
| 4253 |
+
void at::native::vectorized_gather_kernel<16, long>(... 0.00% 0.000us 0.00% 0.000us 0.000us 676.741us 1.85% 676.741us 112.790us 6
|
| 4254 |
+
aten::sigmoid 0.10% 42.329us 0.17% 69.270us 11.545us 319.457us 0.87% 319.457us 53.243us 6
|
| 4255 |
+
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 319.457us 0.87% 319.457us 53.243us 6
|
| 4256 |
+
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 250.467us 0.68% 250.467us 41.744us 6
|
| 4257 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4258 |
+
Self CPU time total: 40.890ms
|
| 4259 |
+
Self CUDA time total: 36.640ms
|
| 4260 |
|
| 4261 |
|
| 4262 |
|
|
|
|
| 4266 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4267 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4268 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4269 |
+
gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 40.661ms 117.09% 40.661ms 40.661ms 1
|
| 4270 |
+
gpt_oss_experts 6.16% 2.556ms 99.99% 41.476ms 41.476ms 0.000us 0.00% 34.747ms 34.747ms 1
|
| 4271 |
+
aten::matmul 0.11% 44.399us 2.11% 876.925us 36.539us 0.000us 0.00% 28.768ms 1.199ms 24
|
| 4272 |
+
aten::mm 1.26% 521.881us 2.01% 832.526us 34.689us 28.768ms 82.84% 28.768ms 1.199ms 24
|
| 4273 |
+
void cutlass::Kernel2<cutlass_80_simt_sgemm_256x128_... 0.00% 0.000us 0.00% 0.000us 0.000us 20.394ms 58.72% 20.394ms 1.360ms 15
|
| 4274 |
+
ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 8.357ms 24.06% 8.357ms 928.569us 9
|
| 4275 |
+
aten::add 0.86% 357.079us 1.47% 609.793us 16.939us 1.481ms 4.26% 1.481ms 41.126us 36
|
| 4276 |
+
aten::mul 0.72% 298.967us 1.26% 524.144us 10.920us 1.380ms 3.97% 1.380ms 28.743us 48
|
| 4277 |
+
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 929.416us 2.68% 929.416us 38.726us 24
|
| 4278 |
+
aten::index_add_ 0.23% 94.554us 0.39% 161.804us 13.484us 921.702us 2.65% 921.702us 76.809us 12
|
| 4279 |
+
void at::native::indexFuncLargeIndex<float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 921.702us 2.65% 921.702us 76.809us 12
|
| 4280 |
+
aten::clamp 0.53% 218.042us 0.91% 375.616us 15.651us 772.487us 2.22% 772.487us 32.187us 24
|
| 4281 |
+
void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 772.487us 2.22% 772.487us 32.187us 24
|
| 4282 |
+
aten::index 0.86% 357.217us 1.47% 607.740us 25.323us 652.838us 1.88% 652.838us 27.202us 24
|
| 4283 |
+
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 648.162us 1.87% 648.162us 54.013us 12
|
| 4284 |
+
void at::native::vectorized_gather_kernel<16, long>(... 0.00% 0.000us 0.00% 0.000us 0.000us 580.997us 1.67% 580.997us 48.416us 12
|
| 4285 |
+
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 551.108us 1.59% 551.108us 45.926us 12
|
| 4286 |
+
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 524.097us 1.51% 524.097us 21.837us 24
|
| 4287 |
+
aten::sigmoid 0.17% 69.444us 0.30% 123.064us 10.255us 357.924us 1.03% 357.924us 29.827us 12
|
| 4288 |
+
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 357.924us 1.03% 357.924us 29.827us 12
|
| 4289 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4290 |
+
Self CPU time total: 41.482ms
|
| 4291 |
+
Self CUDA time total: 34.727ms
|
| 4292 |
|
| 4293 |
|
| 4294 |
impl wl p50(ms) ok
|
| 4295 |
+
gpt_oss_experts cuda_B1_S1024_E2 3.77 True
|
| 4296 |
+
gpt_oss_experts cuda_B1_S1024_E4 5.20 True
|
| 4297 |
+
gpt_oss_experts cuda_B1_S512_E2 2.61 True
|
| 4298 |
+
gpt_oss_experts cuda_B1_S512_E4 3.85 True
|
| 4299 |
+
gpt_oss_experts cuda_B4_S1024_E2 13.12 True
|
| 4300 |
+
gpt_oss_experts cuda_B4_S1024_E4 13.22 True
|
| 4301 |
+
gpt_oss_experts cuda_B4_S512_E2 6.64 True
|
| 4302 |
+
gpt_oss_experts cuda_B4_S512_E4 7.30 True
|
| 4303 |
</pre></div>
|
| 4304 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 4305 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4306 |
<div class="uv-logs-content" style="display: none;">
|
| 4307 |
Updating https://github.com/huggingface/kernels.git (HEAD)
|
| 4308 |
+
Updated https://github.com/huggingface/kernels.git (55b7c980e96bf5f747f0e4136be61c0b089ab76c)
|
| 4309 |
+
Building kernels @ git+https://github.com/huggingface/kernels.git@55b7c980e96bf5f747f0e4136be61c0b089ab76c
|
| 4310 |
+
Built kernels @ git+https://github.com/huggingface/kernels.git@55b7c980e96bf5f747f0e4136be61c0b089ab76c
|
| 4311 |
+
Installed 51 packages in 279ms
|
| 4312 |
</div>
|
| 4313 |
</div>
|
| 4314 |
+
<div class="cell-stderr">Fetching 6 files: 0%| | 0/6 [00:00<?, ?it/s]Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.
|
| 4315 |
+
|
| 4316 |
+
Fetching 6 files: 17%|█▋ | 1/6 [00:00<00:00, 5.68it/s]
|
| 4317 |
+
Fetching 6 files: 50%|█████ | 3/6 [00:00<00:00, 5.21it/s]
|
| 4318 |
+
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 10.50it/s]</div>
|
| 4319 |
<div class="cell-artifacts">
|
| 4320 |
<h4>Artifacts:</h4>
|
| 4321 |
<a href="artifacts/benchmark/openai_moe.jsonl" class="artifact" target="_blank">openai_moe.jsonl</a>
|
openai_moe/results/artifacts/combine/latency.svg
CHANGED
|
|
Git LFS Details
|
|
|
Git LFS Details
|
openai_moe/results/combined_results.html
CHANGED
|
@@ -3874,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 3874 |
<div class="system-info">
|
| 3875 |
<div class="system-info-header">Generated on:</div>
|
| 3876 |
<div class="system-info-content">
|
| 3877 |
-
Linux x86_64 | Linux-6.12.
|
| 3878 |
</div>
|
| 3879 |
</div>
|
| 3880 |
|
|
@@ -3889,11 +3889,11 @@ body[data-tool="eraser"] .main-content {
|
|
| 3889 |
<rdf:RDF>
|
| 3890 |
<ns2:Work>
|
| 3891 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 3892 |
-
<dc:date>2025-
|
| 3893 |
<dc:format>image/svg+xml</dc:format>
|
| 3894 |
<dc:creator>
|
| 3895 |
<ns2:Agent>
|
| 3896 |
-
<dc:title>Matplotlib v3.10.
|
| 3897 |
</ns2:Agent>
|
| 3898 |
</dc:creator>
|
| 3899 |
</ns2:Work>
|
|
@@ -3908,320 +3908,294 @@ body[data-tool="eraser"] .main-content {
|
|
| 3908 |
</g>
|
| 3909 |
<g id="axes--1" class="axes">
|
| 3910 |
<g id="patch_2">
|
| 3911 |
-
<path d="M 57.
|
| 3912 |
</g>
|
| 3913 |
<g id="matplotlib.axis_1">
|
| 3914 |
<g id="xtick_1">
|
| 3915 |
<g id="grid-x--1" class="grid grid-x">
|
| 3916 |
-
<path d="M 93.
|
| 3917 |
</g>
|
| 3918 |
<g id="line2d_1">
|
| 3919 |
<defs>
|
| 3920 |
<path id="mafb3703e5b" d="M 0 0 L 0 3.5 " style="stroke: #000000; stroke-width: 0.8" />
|
| 3921 |
</defs>
|
| 3922 |
<g>
|
| 3923 |
-
<use ns4:href="#mafb3703e5b" x="93.
|
| 3924 |
</g>
|
| 3925 |
</g>
|
| 3926 |
<g id="text_1">
|
| 3927 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(62.
|
| 3928 |
</g>
|
| 3929 |
</g>
|
| 3930 |
<g id="xtick_2">
|
| 3931 |
<g id="grid-x--2" class="grid grid-x">
|
| 3932 |
-
<path d="M 195.
|
| 3933 |
</g>
|
| 3934 |
<g id="line2d_2">
|
| 3935 |
<g>
|
| 3936 |
-
<use ns4:href="#mafb3703e5b" x="195.
|
| 3937 |
</g>
|
| 3938 |
</g>
|
| 3939 |
<g id="text_2">
|
| 3940 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(165.
|
| 3941 |
</g>
|
| 3942 |
</g>
|
| 3943 |
<g id="xtick_3">
|
| 3944 |
<g id="grid-x--3" class="grid grid-x">
|
| 3945 |
-
<path d="M 297.
|
| 3946 |
</g>
|
| 3947 |
<g id="line2d_3">
|
| 3948 |
<g>
|
| 3949 |
-
<use ns4:href="#mafb3703e5b" x="297.
|
| 3950 |
</g>
|
| 3951 |
</g>
|
| 3952 |
<g id="text_3">
|
| 3953 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(265.
|
| 3954 |
</g>
|
| 3955 |
</g>
|
| 3956 |
<g id="xtick_4">
|
| 3957 |
<g id="grid-x--4" class="grid grid-x">
|
| 3958 |
-
<path d="M 400.
|
| 3959 |
</g>
|
| 3960 |
<g id="line2d_4">
|
| 3961 |
<g>
|
| 3962 |
-
<use ns4:href="#mafb3703e5b" x="400.
|
| 3963 |
</g>
|
| 3964 |
</g>
|
| 3965 |
<g id="text_4">
|
| 3966 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(367.
|
| 3967 |
</g>
|
| 3968 |
</g>
|
| 3969 |
<g id="xtick_5">
|
| 3970 |
<g id="grid-x--5" class="grid grid-x">
|
| 3971 |
-
<path d="M 502.
|
| 3972 |
</g>
|
| 3973 |
<g id="line2d_5">
|
| 3974 |
<g>
|
| 3975 |
-
<use ns4:href="#mafb3703e5b" x="502.
|
| 3976 |
</g>
|
| 3977 |
</g>
|
| 3978 |
<g id="text_5">
|
| 3979 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(472.
|
| 3980 |
</g>
|
| 3981 |
</g>
|
| 3982 |
<g id="xtick_6">
|
| 3983 |
<g id="grid-x--6" class="grid grid-x">
|
| 3984 |
-
<path d="M 605.
|
| 3985 |
</g>
|
| 3986 |
<g id="line2d_6">
|
| 3987 |
<g>
|
| 3988 |
-
<use ns4:href="#mafb3703e5b" x="605.
|
| 3989 |
</g>
|
| 3990 |
</g>
|
| 3991 |
<g id="text_6">
|
| 3992 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(574.
|
| 3993 |
</g>
|
| 3994 |
</g>
|
| 3995 |
<g id="xtick_7">
|
| 3996 |
<g id="grid-x--7" class="grid grid-x">
|
| 3997 |
-
<path d="M 707.
|
| 3998 |
</g>
|
| 3999 |
<g id="line2d_7">
|
| 4000 |
<g>
|
| 4001 |
-
<use ns4:href="#mafb3703e5b" x="707.
|
| 4002 |
</g>
|
| 4003 |
</g>
|
| 4004 |
<g id="text_7">
|
| 4005 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(675.
|
| 4006 |
</g>
|
| 4007 |
</g>
|
| 4008 |
<g id="xtick_8">
|
| 4009 |
<g id="grid-x--8" class="grid grid-x">
|
| 4010 |
-
<path d="M 809.
|
| 4011 |
</g>
|
| 4012 |
<g id="line2d_8">
|
| 4013 |
<g>
|
| 4014 |
-
<use ns4:href="#mafb3703e5b" x="809.
|
| 4015 |
</g>
|
| 4016 |
</g>
|
| 4017 |
<g id="text_8">
|
| 4018 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(777.
|
| 4019 |
</g>
|
| 4020 |
</g>
|
| 4021 |
<g id="label--x" class="xlabel">
|
| 4022 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="451.
|
| 4023 |
</g>
|
| 4024 |
</g>
|
| 4025 |
<g id="matplotlib.axis_2">
|
| 4026 |
<g id="ytick_1">
|
| 4027 |
<g id="grid-y--2" class="grid grid-y">
|
| 4028 |
-
<path d="M 57.
|
| 4029 |
</g>
|
| 4030 |
<g id="line2d_9">
|
| 4031 |
<defs>
|
| 4032 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4033 |
</defs>
|
| 4034 |
<g>
|
| 4035 |
-
<use ns4:href="#m0fca2865ba" x="57.
|
| 4036 |
</g>
|
| 4037 |
</g>
|
| 4038 |
<g id="text_9">
|
| 4039 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.
|
| 4040 |
</g>
|
| 4041 |
</g>
|
| 4042 |
<g id="ytick_2">
|
| 4043 |
<g id="grid-y--3" class="grid grid-y">
|
| 4044 |
-
<path d="M 57.
|
| 4045 |
</g>
|
| 4046 |
<g id="line2d_10">
|
| 4047 |
<g>
|
| 4048 |
-
<use ns4:href="#m0fca2865ba" x="57.
|
| 4049 |
</g>
|
| 4050 |
</g>
|
| 4051 |
<g id="text_10">
|
| 4052 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.
|
| 4053 |
</g>
|
| 4054 |
</g>
|
| 4055 |
<g id="ytick_3">
|
| 4056 |
<g id="grid-y--4" class="grid grid-y">
|
| 4057 |
-
<path d="M 57.
|
| 4058 |
</g>
|
| 4059 |
<g id="line2d_11">
|
| 4060 |
<g>
|
| 4061 |
-
<use ns4:href="#m0fca2865ba" x="57.
|
| 4062 |
</g>
|
| 4063 |
</g>
|
| 4064 |
<g id="text_11">
|
| 4065 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.
|
| 4066 |
</g>
|
| 4067 |
</g>
|
| 4068 |
<g id="ytick_4">
|
| 4069 |
<g id="grid-y--5" class="grid grid-y">
|
| 4070 |
-
<path d="M 57.
|
| 4071 |
</g>
|
| 4072 |
<g id="line2d_12">
|
| 4073 |
<g>
|
| 4074 |
-
<use ns4:href="#m0fca2865ba" x="57.
|
| 4075 |
</g>
|
| 4076 |
</g>
|
| 4077 |
<g id="text_12">
|
| 4078 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.
|
| 4079 |
</g>
|
| 4080 |
</g>
|
| 4081 |
<g id="ytick_5">
|
| 4082 |
<g id="grid-y--6" class="grid grid-y">
|
| 4083 |
-
<path d="M 57.
|
| 4084 |
</g>
|
| 4085 |
<g id="line2d_13">
|
| 4086 |
<g>
|
| 4087 |
-
<use ns4:href="#m0fca2865ba" x="57.
|
| 4088 |
</g>
|
| 4089 |
</g>
|
| 4090 |
<g id="text_13">
|
| 4091 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.
|
| 4092 |
</g>
|
| 4093 |
</g>
|
| 4094 |
<g id="ytick_6">
|
| 4095 |
<g id="grid-y--7" class="grid grid-y">
|
| 4096 |
-
<path d="M 57.
|
| 4097 |
</g>
|
| 4098 |
<g id="line2d_14">
|
| 4099 |
<g>
|
| 4100 |
-
<use ns4:href="#m0fca2865ba" x="57.
|
| 4101 |
</g>
|
| 4102 |
</g>
|
| 4103 |
<g id="text_14">
|
| 4104 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.
|
| 4105 |
</g>
|
| 4106 |
</g>
|
| 4107 |
<g id="ytick_7">
|
| 4108 |
<g id="grid-y--8" class="grid grid-y">
|
| 4109 |
-
<path d="M 57.
|
| 4110 |
</g>
|
| 4111 |
<g id="line2d_15">
|
| 4112 |
<g>
|
| 4113 |
-
<use ns4:href="#m0fca2865ba" x="57.
|
| 4114 |
</g>
|
| 4115 |
</g>
|
| 4116 |
<g id="text_15">
|
| 4117 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.
|
| 4118 |
-
</g>
|
| 4119 |
-
</g>
|
| 4120 |
-
<g id="ytick_8">
|
| 4121 |
-
<g id="grid-y--9" class="grid grid-y">
|
| 4122 |
-
<path d="M 57.26 97.610458 L 845.766818 97.610458 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4123 |
-
</g>
|
| 4124 |
-
<g id="line2d_16">
|
| 4125 |
-
<g>
|
| 4126 |
-
<use ns4:href="#m0fca2865ba" x="57.26" y="97.610458" style="stroke: #000000; stroke-width: 0.8" />
|
| 4127 |
-
</g>
|
| 4128 |
-
</g>
|
| 4129 |
-
<g id="text_16">
|
| 4130 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="101.409677" transform="rotate(-0 50.26 101.409677)">1400</text>
|
| 4131 |
-
</g>
|
| 4132 |
-
</g>
|
| 4133 |
-
<g id="ytick_9">
|
| 4134 |
-
<g id="grid-y--10" class="grid grid-y">
|
| 4135 |
-
<path d="M 57.26 47.422681 L 845.766818 47.422681 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4136 |
-
</g>
|
| 4137 |
-
<g id="line2d_17">
|
| 4138 |
-
<g>
|
| 4139 |
-
<use ns4:href="#m0fca2865ba" x="57.26" y="47.422681" style="stroke: #000000; stroke-width: 0.8" />
|
| 4140 |
-
</g>
|
| 4141 |
-
</g>
|
| 4142 |
-
<g id="text_17">
|
| 4143 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="51.2219" transform="rotate(-0 50.26 51.2219)">1600</text>
|
| 4144 |
</g>
|
| 4145 |
</g>
|
| 4146 |
<g id="label--y" class="ylabel">
|
| 4147 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="18.
|
| 4148 |
</g>
|
| 4149 |
</g>
|
| 4150 |
<g id="series--binned-torch" class="series">
|
| 4151 |
-
<path d="M 93.
|
| 4152 |
<defs>
|
| 4153 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4154 |
</defs>
|
| 4155 |
-
<g clip-path="url(#
|
| 4156 |
-
<use ns4:href="#md7efaf3aec" x="93.
|
| 4157 |
-
<use ns4:href="#md7efaf3aec" x="195.
|
| 4158 |
-
<use ns4:href="#md7efaf3aec" x="297.
|
| 4159 |
-
<use ns4:href="#md7efaf3aec" x="400.
|
| 4160 |
-
<use ns4:href="#md7efaf3aec" x="502.
|
| 4161 |
-
<use ns4:href="#md7efaf3aec" x="605.
|
| 4162 |
-
<use ns4:href="#md7efaf3aec" x="707.
|
| 4163 |
-
<use ns4:href="#md7efaf3aec" x="809.
|
| 4164 |
</g>
|
| 4165 |
</g>
|
| 4166 |
<g id="series--gpt-oss-experts" class="series">
|
| 4167 |
-
<path d="M 93.
|
| 4168 |
<defs>
|
| 4169 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4170 |
</defs>
|
| 4171 |
-
<g clip-path="url(#
|
| 4172 |
-
<use ns4:href="#m9b8c54d372" x="93.
|
| 4173 |
-
<use ns4:href="#m9b8c54d372" x="195.
|
| 4174 |
-
<use ns4:href="#m9b8c54d372" x="297.
|
| 4175 |
-
<use ns4:href="#m9b8c54d372" x="400.
|
| 4176 |
-
<use ns4:href="#m9b8c54d372" x="502.
|
| 4177 |
-
<use ns4:href="#m9b8c54d372" x="605.
|
| 4178 |
-
<use ns4:href="#m9b8c54d372" x="707.
|
| 4179 |
-
<use ns4:href="#m9b8c54d372" x="809.
|
| 4180 |
</g>
|
| 4181 |
</g>
|
| 4182 |
<g id="patch_3">
|
| 4183 |
-
<path d="M 57.
|
| 4184 |
</g>
|
| 4185 |
<g id="patch_4">
|
| 4186 |
<path d="M 845.766818 468.317269 L 845.766818 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
|
| 4187 |
</g>
|
| 4188 |
<g id="patch_5">
|
| 4189 |
-
<path d="M 57.
|
| 4190 |
</g>
|
| 4191 |
<g id="patch_6">
|
| 4192 |
-
<path d="M 57.
|
| 4193 |
</g>
|
| 4194 |
-
<g id="
|
| 4195 |
-
<text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="451.
|
| 4196 |
</g>
|
| 4197 |
<g id="legend" class="legend">
|
| 4198 |
<g id="patch_7">
|
| 4199 |
-
<path d="M 64.
|
| 4200 |
</g>
|
| 4201 |
-
<g id="
|
| 4202 |
-
<path d="M 66.
|
| 4203 |
<g>
|
| 4204 |
-
<use ns4:href="#md7efaf3aec" x="76.
|
| 4205 |
</g>
|
| 4206 |
</g>
|
| 4207 |
<g id="legend-label--binned-torch" class="legend">
|
| 4208 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="94.
|
| 4209 |
</g>
|
| 4210 |
-
<g id="
|
| 4211 |
-
<path d="M 66.
|
| 4212 |
<g>
|
| 4213 |
-
<use ns4:href="#m9b8c54d372" x="76.
|
| 4214 |
</g>
|
| 4215 |
</g>
|
| 4216 |
<g id="legend-label--gpt-oss-experts" class="legend">
|
| 4217 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="94.
|
| 4218 |
</g>
|
| 4219 |
</g>
|
| 4220 |
</g>
|
| 4221 |
</g>
|
| 4222 |
<defs>
|
| 4223 |
-
<clipPath id="
|
| 4224 |
-
<rect x="57.
|
| 4225 |
</clipPath>
|
| 4226 |
</defs>
|
| 4227 |
</svg>
|
|
@@ -4234,7 +4208,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 4234 |
<span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
|
| 4235 |
<span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4236 |
</span> |
|
| 4237 |
-
Cell: combine | 4.
|
| 4238 |
| <button class="run-btn" onclick="runCell('combine')">▶ run</button>
|
| 4239 |
<button class="copy-btn" onclick="copyCell('combine')">Copy</button>
|
| 4240 |
<a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -4323,22 +4297,22 @@ Summary: 2 found, 0 skipped, 0 missing
|
|
| 4323 |
COMBINED BENCHMARK SUMMARY
|
| 4324 |
|
| 4325 |
impl wl p50(ms) ok
|
| 4326 |
-
binned_torch cuda_B1_S1024_E2
|
| 4327 |
-
binned_torch cuda_B1_S1024_E4
|
| 4328 |
-
binned_torch cuda_B1_S512_E2
|
| 4329 |
-
binned_torch cuda_B1_S512_E4
|
| 4330 |
-
binned_torch cuda_B4_S1024_E2
|
| 4331 |
-
binned_torch cuda_B4_S1024_E4
|
| 4332 |
-
binned_torch cuda_B4_S512_E2
|
| 4333 |
-
binned_torch cuda_B4_S512_E4
|
| 4334 |
-
gpt_oss_experts cuda_B1_S1024_E2 3.
|
| 4335 |
-
gpt_oss_experts cuda_B1_S1024_E4 5.
|
| 4336 |
-
gpt_oss_experts cuda_B1_S512_E2 2.
|
| 4337 |
-
gpt_oss_experts cuda_B1_S512_E4 3.
|
| 4338 |
-
gpt_oss_experts cuda_B4_S1024_E2 13.
|
| 4339 |
-
gpt_oss_experts cuda_B4_S1024_E4 13.
|
| 4340 |
-
gpt_oss_experts cuda_B4_S512_E2 6.
|
| 4341 |
-
gpt_oss_experts cuda_B4_S512_E4 7.
|
| 4342 |
|
| 4343 |
GENERATING COMBINED VISUALIZATION
|
| 4344 |
|
|
@@ -4358,7 +4332,7 @@ Implementations included:
|
|
| 4358 |
<div class="uv-install-logs" id="uv-logs-combine">
|
| 4359 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4360 |
<div class="uv-logs-content" style="display: none;">
|
| 4361 |
-
Installed 37 packages in
|
| 4362 |
</div>
|
| 4363 |
</div>
|
| 4364 |
<div class="cell-artifacts">
|
|
@@ -4371,11 +4345,11 @@ Installed 37 packages in 287ms
|
|
| 4371 |
<rdf:RDF>
|
| 4372 |
<ns2:Work>
|
| 4373 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 4374 |
-
<dc:date>2025-
|
| 4375 |
<dc:format>image/svg+xml</dc:format>
|
| 4376 |
<dc:creator>
|
| 4377 |
<ns2:Agent>
|
| 4378 |
-
<dc:title>Matplotlib v3.10.
|
| 4379 |
</ns2:Agent>
|
| 4380 |
</dc:creator>
|
| 4381 |
</ns2:Work>
|
|
@@ -4390,320 +4364,294 @@ Installed 37 packages in 287ms
|
|
| 4390 |
</g>
|
| 4391 |
<g id="axes--1" class="axes">
|
| 4392 |
<g id="patch_2">
|
| 4393 |
-
<path d="M 57.
|
| 4394 |
</g>
|
| 4395 |
<g id="matplotlib.axis_1">
|
| 4396 |
<g id="xtick_1">
|
| 4397 |
<g id="grid-x--1" class="grid grid-x">
|
| 4398 |
-
<path d="M 93.
|
| 4399 |
</g>
|
| 4400 |
<g id="line2d_1">
|
| 4401 |
<defs>
|
| 4402 |
<path id="mafb3703e5b" d="M 0 0 L 0 3.5 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4403 |
</defs>
|
| 4404 |
<g>
|
| 4405 |
-
<use ns4:href="#mafb3703e5b" x="93.
|
| 4406 |
</g>
|
| 4407 |
</g>
|
| 4408 |
<g id="text_1">
|
| 4409 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(62.
|
| 4410 |
</g>
|
| 4411 |
</g>
|
| 4412 |
<g id="xtick_2">
|
| 4413 |
<g id="grid-x--2" class="grid grid-x">
|
| 4414 |
-
<path d="M 195.
|
| 4415 |
</g>
|
| 4416 |
<g id="line2d_2">
|
| 4417 |
<g>
|
| 4418 |
-
<use ns4:href="#mafb3703e5b" x="195.
|
| 4419 |
</g>
|
| 4420 |
</g>
|
| 4421 |
<g id="text_2">
|
| 4422 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(165.
|
| 4423 |
</g>
|
| 4424 |
</g>
|
| 4425 |
<g id="xtick_3">
|
| 4426 |
<g id="grid-x--3" class="grid grid-x">
|
| 4427 |
-
<path d="M 297.
|
| 4428 |
</g>
|
| 4429 |
<g id="line2d_3">
|
| 4430 |
<g>
|
| 4431 |
-
<use ns4:href="#mafb3703e5b" x="297.
|
| 4432 |
</g>
|
| 4433 |
</g>
|
| 4434 |
<g id="text_3">
|
| 4435 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(265.
|
| 4436 |
</g>
|
| 4437 |
</g>
|
| 4438 |
<g id="xtick_4">
|
| 4439 |
<g id="grid-x--4" class="grid grid-x">
|
| 4440 |
-
<path d="M 400.
|
| 4441 |
</g>
|
| 4442 |
<g id="line2d_4">
|
| 4443 |
<g>
|
| 4444 |
-
<use ns4:href="#mafb3703e5b" x="400.
|
| 4445 |
</g>
|
| 4446 |
</g>
|
| 4447 |
<g id="text_4">
|
| 4448 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(367.
|
| 4449 |
</g>
|
| 4450 |
</g>
|
| 4451 |
<g id="xtick_5">
|
| 4452 |
<g id="grid-x--5" class="grid grid-x">
|
| 4453 |
-
<path d="M 502.
|
| 4454 |
</g>
|
| 4455 |
<g id="line2d_5">
|
| 4456 |
<g>
|
| 4457 |
-
<use ns4:href="#mafb3703e5b" x="502.
|
| 4458 |
</g>
|
| 4459 |
</g>
|
| 4460 |
<g id="text_5">
|
| 4461 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(472.
|
| 4462 |
</g>
|
| 4463 |
</g>
|
| 4464 |
<g id="xtick_6">
|
| 4465 |
<g id="grid-x--6" class="grid grid-x">
|
| 4466 |
-
<path d="M 605.
|
| 4467 |
</g>
|
| 4468 |
<g id="line2d_6">
|
| 4469 |
<g>
|
| 4470 |
-
<use ns4:href="#mafb3703e5b" x="605.
|
| 4471 |
</g>
|
| 4472 |
</g>
|
| 4473 |
<g id="text_6">
|
| 4474 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(574.
|
| 4475 |
</g>
|
| 4476 |
</g>
|
| 4477 |
<g id="xtick_7">
|
| 4478 |
<g id="grid-x--7" class="grid grid-x">
|
| 4479 |
-
<path d="M 707.
|
| 4480 |
</g>
|
| 4481 |
<g id="line2d_7">
|
| 4482 |
<g>
|
| 4483 |
-
<use ns4:href="#mafb3703e5b" x="707.
|
| 4484 |
</g>
|
| 4485 |
</g>
|
| 4486 |
<g id="text_7">
|
| 4487 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(675.
|
| 4488 |
</g>
|
| 4489 |
</g>
|
| 4490 |
<g id="xtick_8">
|
| 4491 |
<g id="grid-x--8" class="grid grid-x">
|
| 4492 |
-
<path d="M 809.
|
| 4493 |
</g>
|
| 4494 |
<g id="line2d_8">
|
| 4495 |
<g>
|
| 4496 |
-
<use ns4:href="#mafb3703e5b" x="809.
|
| 4497 |
</g>
|
| 4498 |
</g>
|
| 4499 |
<g id="text_8">
|
| 4500 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(777.
|
| 4501 |
</g>
|
| 4502 |
</g>
|
| 4503 |
<g id="label--x" class="xlabel">
|
| 4504 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="451.
|
| 4505 |
</g>
|
| 4506 |
</g>
|
| 4507 |
<g id="matplotlib.axis_2">
|
| 4508 |
<g id="ytick_1">
|
| 4509 |
<g id="grid-y--2" class="grid grid-y">
|
| 4510 |
-
<path d="M 57.
|
| 4511 |
</g>
|
| 4512 |
<g id="line2d_9">
|
| 4513 |
<defs>
|
| 4514 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4515 |
</defs>
|
| 4516 |
<g>
|
| 4517 |
-
<use ns4:href="#m0fca2865ba" x="57.
|
| 4518 |
</g>
|
| 4519 |
</g>
|
| 4520 |
<g id="text_9">
|
| 4521 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.
|
| 4522 |
</g>
|
| 4523 |
</g>
|
| 4524 |
<g id="ytick_2">
|
| 4525 |
<g id="grid-y--3" class="grid grid-y">
|
| 4526 |
-
<path d="M 57.
|
| 4527 |
</g>
|
| 4528 |
<g id="line2d_10">
|
| 4529 |
<g>
|
| 4530 |
-
<use ns4:href="#m0fca2865ba" x="57.
|
| 4531 |
</g>
|
| 4532 |
</g>
|
| 4533 |
<g id="text_10">
|
| 4534 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.
|
| 4535 |
</g>
|
| 4536 |
</g>
|
| 4537 |
<g id="ytick_3">
|
| 4538 |
<g id="grid-y--4" class="grid grid-y">
|
| 4539 |
-
<path d="M 57.
|
| 4540 |
</g>
|
| 4541 |
<g id="line2d_11">
|
| 4542 |
<g>
|
| 4543 |
-
<use ns4:href="#m0fca2865ba" x="57.
|
| 4544 |
</g>
|
| 4545 |
</g>
|
| 4546 |
<g id="text_11">
|
| 4547 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.
|
| 4548 |
</g>
|
| 4549 |
</g>
|
| 4550 |
<g id="ytick_4">
|
| 4551 |
<g id="grid-y--5" class="grid grid-y">
|
| 4552 |
-
<path d="M 57.
|
| 4553 |
</g>
|
| 4554 |
<g id="line2d_12">
|
| 4555 |
<g>
|
| 4556 |
-
<use ns4:href="#m0fca2865ba" x="57.
|
| 4557 |
</g>
|
| 4558 |
</g>
|
| 4559 |
<g id="text_12">
|
| 4560 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.
|
| 4561 |
</g>
|
| 4562 |
</g>
|
| 4563 |
<g id="ytick_5">
|
| 4564 |
<g id="grid-y--6" class="grid grid-y">
|
| 4565 |
-
<path d="M 57.
|
| 4566 |
</g>
|
| 4567 |
<g id="line2d_13">
|
| 4568 |
<g>
|
| 4569 |
-
<use ns4:href="#m0fca2865ba" x="57.
|
| 4570 |
</g>
|
| 4571 |
</g>
|
| 4572 |
<g id="text_13">
|
| 4573 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.
|
| 4574 |
</g>
|
| 4575 |
</g>
|
| 4576 |
<g id="ytick_6">
|
| 4577 |
<g id="grid-y--7" class="grid grid-y">
|
| 4578 |
-
<path d="M 57.
|
| 4579 |
</g>
|
| 4580 |
<g id="line2d_14">
|
| 4581 |
<g>
|
| 4582 |
-
<use ns4:href="#m0fca2865ba" x="57.
|
| 4583 |
</g>
|
| 4584 |
</g>
|
| 4585 |
<g id="text_14">
|
| 4586 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.
|
| 4587 |
</g>
|
| 4588 |
</g>
|
| 4589 |
<g id="ytick_7">
|
| 4590 |
<g id="grid-y--8" class="grid grid-y">
|
| 4591 |
-
<path d="M 57.
|
| 4592 |
</g>
|
| 4593 |
<g id="line2d_15">
|
| 4594 |
<g>
|
| 4595 |
-
<use ns4:href="#m0fca2865ba" x="57.
|
| 4596 |
</g>
|
| 4597 |
</g>
|
| 4598 |
<g id="text_15">
|
| 4599 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.
|
| 4600 |
-
</g>
|
| 4601 |
-
</g>
|
| 4602 |
-
<g id="ytick_8">
|
| 4603 |
-
<g id="grid-y--9" class="grid grid-y">
|
| 4604 |
-
<path d="M 57.26 97.610458 L 845.766818 97.610458 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4605 |
-
</g>
|
| 4606 |
-
<g id="line2d_16">
|
| 4607 |
-
<g>
|
| 4608 |
-
<use ns4:href="#m0fca2865ba" x="57.26" y="97.610458" style="stroke: #000000; stroke-width: 0.8" />
|
| 4609 |
-
</g>
|
| 4610 |
-
</g>
|
| 4611 |
-
<g id="text_16">
|
| 4612 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="101.409677" transform="rotate(-0 50.26 101.409677)">1400</text>
|
| 4613 |
-
</g>
|
| 4614 |
-
</g>
|
| 4615 |
-
<g id="ytick_9">
|
| 4616 |
-
<g id="grid-y--10" class="grid grid-y">
|
| 4617 |
-
<path d="M 57.26 47.422681 L 845.766818 47.422681 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4618 |
-
</g>
|
| 4619 |
-
<g id="line2d_17">
|
| 4620 |
-
<g>
|
| 4621 |
-
<use ns4:href="#m0fca2865ba" x="57.26" y="47.422681" style="stroke: #000000; stroke-width: 0.8" />
|
| 4622 |
-
</g>
|
| 4623 |
-
</g>
|
| 4624 |
-
<g id="text_17">
|
| 4625 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="51.2219" transform="rotate(-0 50.26 51.2219)">1600</text>
|
| 4626 |
</g>
|
| 4627 |
</g>
|
| 4628 |
<g id="label--y" class="ylabel">
|
| 4629 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="18.
|
| 4630 |
</g>
|
| 4631 |
</g>
|
| 4632 |
<g id="series--binned-torch" class="series">
|
| 4633 |
-
<path d="M 93.
|
| 4634 |
<defs>
|
| 4635 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4636 |
</defs>
|
| 4637 |
-
<g clip-path="url(#
|
| 4638 |
-
<use ns4:href="#md7efaf3aec" x="93.
|
| 4639 |
-
<use ns4:href="#md7efaf3aec" x="195.
|
| 4640 |
-
<use ns4:href="#md7efaf3aec" x="297.
|
| 4641 |
-
<use ns4:href="#md7efaf3aec" x="400.
|
| 4642 |
-
<use ns4:href="#md7efaf3aec" x="502.
|
| 4643 |
-
<use ns4:href="#md7efaf3aec" x="605.
|
| 4644 |
-
<use ns4:href="#md7efaf3aec" x="707.
|
| 4645 |
-
<use ns4:href="#md7efaf3aec" x="809.
|
| 4646 |
</g>
|
| 4647 |
</g>
|
| 4648 |
<g id="series--gpt-oss-experts" class="series">
|
| 4649 |
-
<path d="M 93.
|
| 4650 |
<defs>
|
| 4651 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4652 |
</defs>
|
| 4653 |
-
<g clip-path="url(#
|
| 4654 |
-
<use ns4:href="#m9b8c54d372" x="93.
|
| 4655 |
-
<use ns4:href="#m9b8c54d372" x="195.
|
| 4656 |
-
<use ns4:href="#m9b8c54d372" x="297.
|
| 4657 |
-
<use ns4:href="#m9b8c54d372" x="400.
|
| 4658 |
-
<use ns4:href="#m9b8c54d372" x="502.
|
| 4659 |
-
<use ns4:href="#m9b8c54d372" x="605.
|
| 4660 |
-
<use ns4:href="#m9b8c54d372" x="707.
|
| 4661 |
-
<use ns4:href="#m9b8c54d372" x="809.
|
| 4662 |
</g>
|
| 4663 |
</g>
|
| 4664 |
<g id="patch_3">
|
| 4665 |
-
<path d="M 57.
|
| 4666 |
</g>
|
| 4667 |
<g id="patch_4">
|
| 4668 |
<path d="M 845.766818 468.317269 L 845.766818 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
|
| 4669 |
</g>
|
| 4670 |
<g id="patch_5">
|
| 4671 |
-
<path d="M 57.
|
| 4672 |
</g>
|
| 4673 |
<g id="patch_6">
|
| 4674 |
-
<path d="M 57.
|
| 4675 |
</g>
|
| 4676 |
-
<g id="
|
| 4677 |
-
<text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="451.
|
| 4678 |
</g>
|
| 4679 |
<g id="legend" class="legend">
|
| 4680 |
<g id="patch_7">
|
| 4681 |
-
<path d="M 64.
|
| 4682 |
</g>
|
| 4683 |
-
<g id="
|
| 4684 |
-
<path d="M 66.
|
| 4685 |
<g>
|
| 4686 |
-
<use ns4:href="#md7efaf3aec" x="76.
|
| 4687 |
</g>
|
| 4688 |
</g>
|
| 4689 |
<g id="legend-label--binned-torch" class="legend">
|
| 4690 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="94.
|
| 4691 |
</g>
|
| 4692 |
-
<g id="
|
| 4693 |
-
<path d="M 66.
|
| 4694 |
<g>
|
| 4695 |
-
<use ns4:href="#m9b8c54d372" x="76.
|
| 4696 |
</g>
|
| 4697 |
</g>
|
| 4698 |
<g id="legend-label--gpt-oss-experts" class="legend">
|
| 4699 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="94.
|
| 4700 |
</g>
|
| 4701 |
</g>
|
| 4702 |
</g>
|
| 4703 |
</g>
|
| 4704 |
<defs>
|
| 4705 |
-
<clipPath id="
|
| 4706 |
-
<rect x="57.
|
| 4707 |
</clipPath>
|
| 4708 |
</defs>
|
| 4709 |
</svg>
|
|
|
|
| 3874 |
<div class="system-info">
|
| 3875 |
<div class="system-info-header">Generated on:</div>
|
| 3876 |
<div class="system-info-content">
|
| 3877 |
+
Linux x86_64 | Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35
|
| 3878 |
</div>
|
| 3879 |
</div>
|
| 3880 |
|
|
|
|
| 3889 |
<rdf:RDF>
|
| 3890 |
<ns2:Work>
|
| 3891 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 3892 |
+
<dc:date>2025-12-19T19:10:00.094905</dc:date>
|
| 3893 |
<dc:format>image/svg+xml</dc:format>
|
| 3894 |
<dc:creator>
|
| 3895 |
<ns2:Agent>
|
| 3896 |
+
<dc:title>Matplotlib v3.10.8, https://matplotlib.org/</dc:title>
|
| 3897 |
</ns2:Agent>
|
| 3898 |
</dc:creator>
|
| 3899 |
</ns2:Work>
|
|
|
|
| 3908 |
</g>
|
| 3909 |
<g id="axes--1" class="axes">
|
| 3910 |
<g id="patch_2">
|
| 3911 |
+
<path d="M 57.17 468.317269 L 845.766818 468.317269 L 845.766818 26.88 L 57.17 26.88 L 57.17 468.317269 z " style="fill: none" />
|
| 3912 |
</g>
|
| 3913 |
<g id="matplotlib.axis_1">
|
| 3914 |
<g id="xtick_1">
|
| 3915 |
<g id="grid-x--1" class="grid grid-x">
|
| 3916 |
+
<path d="M 93.01531 468.317269 L 93.01531 26.88 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 3917 |
</g>
|
| 3918 |
<g id="line2d_1">
|
| 3919 |
<defs>
|
| 3920 |
<path id="mafb3703e5b" d="M 0 0 L 0 3.5 " style="stroke: #000000; stroke-width: 0.8" />
|
| 3921 |
</defs>
|
| 3922 |
<g>
|
| 3923 |
+
<use ns4:href="#mafb3703e5b" x="93.01531" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
|
| 3924 |
</g>
|
| 3925 |
</g>
|
| 3926 |
<g id="text_1">
|
| 3927 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(62.817431 544.791615) rotate(-45)">cuda_B1_S512_E2</text>
|
| 3928 |
</g>
|
| 3929 |
</g>
|
| 3930 |
<g id="xtick_2">
|
| 3931 |
<g id="grid-x--2" class="grid grid-x">
|
| 3932 |
+
<path d="M 195.430481 468.317269 L 195.430481 26.88 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 3933 |
</g>
|
| 3934 |
<g id="line2d_2">
|
| 3935 |
<g>
|
| 3936 |
+
<use ns4:href="#mafb3703e5b" x="195.430481" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
|
| 3937 |
</g>
|
| 3938 |
</g>
|
| 3939 |
<g id="text_2">
|
| 3940 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(165.232602 544.791615) rotate(-45)">cuda_B1_S512_E4</text>
|
| 3941 |
</g>
|
| 3942 |
</g>
|
| 3943 |
<g id="xtick_3">
|
| 3944 |
<g id="grid-x--3" class="grid grid-x">
|
| 3945 |
+
<path d="M 297.845652 468.317269 L 297.845652 26.88 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 3946 |
</g>
|
| 3947 |
<g id="line2d_3">
|
| 3948 |
<g>
|
| 3949 |
+
<use ns4:href="#mafb3703e5b" x="297.845652" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
|
| 3950 |
</g>
|
| 3951 |
</g>
|
| 3952 |
<g id="text_3">
|
| 3953 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(265.39829 549.290582) rotate(-45)">cuda_B1_S1024_E2</text>
|
| 3954 |
</g>
|
| 3955 |
</g>
|
| 3956 |
<g id="xtick_4">
|
| 3957 |
<g id="grid-x--4" class="grid grid-x">
|
| 3958 |
+
<path d="M 400.260823 468.317269 L 400.260823 26.88 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 3959 |
</g>
|
| 3960 |
<g id="line2d_4">
|
| 3961 |
<g>
|
| 3962 |
+
<use ns4:href="#mafb3703e5b" x="400.260823" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
|
| 3963 |
</g>
|
| 3964 |
</g>
|
| 3965 |
<g id="text_4">
|
| 3966 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(367.813461 549.290582) rotate(-45)">cuda_B1_S1024_E4</text>
|
| 3967 |
</g>
|
| 3968 |
</g>
|
| 3969 |
<g id="xtick_5">
|
| 3970 |
<g id="grid-x--5" class="grid grid-x">
|
| 3971 |
+
<path d="M 502.675995 468.317269 L 502.675995 26.88 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 3972 |
</g>
|
| 3973 |
<g id="line2d_5">
|
| 3974 |
<g>
|
| 3975 |
+
<use ns4:href="#mafb3703e5b" x="502.675995" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
|
| 3976 |
</g>
|
| 3977 |
</g>
|
| 3978 |
<g id="text_5">
|
| 3979 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(472.478116 544.791615) rotate(-45)">cuda_B4_S512_E2</text>
|
| 3980 |
</g>
|
| 3981 |
</g>
|
| 3982 |
<g id="xtick_6">
|
| 3983 |
<g id="grid-x--6" class="grid grid-x">
|
| 3984 |
+
<path d="M 605.091166 468.317269 L 605.091166 26.88 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 3985 |
</g>
|
| 3986 |
<g id="line2d_6">
|
| 3987 |
<g>
|
| 3988 |
+
<use ns4:href="#mafb3703e5b" x="605.091166" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
|
| 3989 |
</g>
|
| 3990 |
</g>
|
| 3991 |
<g id="text_6">
|
| 3992 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(574.893287 544.791615) rotate(-45)">cuda_B4_S512_E4</text>
|
| 3993 |
</g>
|
| 3994 |
</g>
|
| 3995 |
<g id="xtick_7">
|
| 3996 |
<g id="grid-x--7" class="grid grid-x">
|
| 3997 |
+
<path d="M 707.506337 468.317269 L 707.506337 26.88 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 3998 |
</g>
|
| 3999 |
<g id="line2d_7">
|
| 4000 |
<g>
|
| 4001 |
+
<use ns4:href="#mafb3703e5b" x="707.506337" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
|
| 4002 |
</g>
|
| 4003 |
</g>
|
| 4004 |
<g id="text_7">
|
| 4005 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(675.058975 549.290582) rotate(-45)">cuda_B4_S1024_E2</text>
|
| 4006 |
</g>
|
| 4007 |
</g>
|
| 4008 |
<g id="xtick_8">
|
| 4009 |
<g id="grid-x--8" class="grid grid-x">
|
| 4010 |
+
<path d="M 809.921508 468.317269 L 809.921508 26.88 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4011 |
</g>
|
| 4012 |
<g id="line2d_8">
|
| 4013 |
<g>
|
| 4014 |
+
<use ns4:href="#mafb3703e5b" x="809.921508" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
|
| 4015 |
</g>
|
| 4016 |
</g>
|
| 4017 |
<g id="text_8">
|
| 4018 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(777.474146 549.290582) rotate(-45)">cuda_B4_S1024_E4</text>
|
| 4019 |
</g>
|
| 4020 |
</g>
|
| 4021 |
<g id="label--x" class="xlabel">
|
| 4022 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="451.468409" y="562.556245" transform="rotate(-0 451.468409 562.556245)">Workload</text>
|
| 4023 |
</g>
|
| 4024 |
</g>
|
| 4025 |
<g id="matplotlib.axis_2">
|
| 4026 |
<g id="ytick_1">
|
| 4027 |
<g id="grid-y--2" class="grid grid-y">
|
| 4028 |
+
<path d="M 57.17 448.88374 L 845.766818 448.88374 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4029 |
</g>
|
| 4030 |
<g id="line2d_9">
|
| 4031 |
<defs>
|
| 4032 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4033 |
</defs>
|
| 4034 |
<g>
|
| 4035 |
+
<use ns4:href="#m0fca2865ba" x="57.17" y="448.88374" style="stroke: #000000; stroke-width: 0.8" />
|
| 4036 |
</g>
|
| 4037 |
</g>
|
| 4038 |
<g id="text_9">
|
| 4039 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="452.682959" transform="rotate(-0 50.17 452.682959)">0</text>
|
| 4040 |
</g>
|
| 4041 |
</g>
|
| 4042 |
<g id="ytick_2">
|
| 4043 |
<g id="grid-y--3" class="grid grid-y">
|
| 4044 |
+
<path d="M 57.17 388.304965 L 845.766818 388.304965 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4045 |
</g>
|
| 4046 |
<g id="line2d_10">
|
| 4047 |
<g>
|
| 4048 |
+
<use ns4:href="#m0fca2865ba" x="57.17" y="388.304965" style="stroke: #000000; stroke-width: 0.8" />
|
| 4049 |
</g>
|
| 4050 |
</g>
|
| 4051 |
<g id="text_10">
|
| 4052 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="392.104184" transform="rotate(-0 50.17 392.104184)">250</text>
|
| 4053 |
</g>
|
| 4054 |
</g>
|
| 4055 |
<g id="ytick_3">
|
| 4056 |
<g id="grid-y--4" class="grid grid-y">
|
| 4057 |
+
<path d="M 57.17 327.726191 L 845.766818 327.726191 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4058 |
</g>
|
| 4059 |
<g id="line2d_11">
|
| 4060 |
<g>
|
| 4061 |
+
<use ns4:href="#m0fca2865ba" x="57.17" y="327.726191" style="stroke: #000000; stroke-width: 0.8" />
|
| 4062 |
</g>
|
| 4063 |
</g>
|
| 4064 |
<g id="text_11">
|
| 4065 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="331.52541" transform="rotate(-0 50.17 331.52541)">500</text>
|
| 4066 |
</g>
|
| 4067 |
</g>
|
| 4068 |
<g id="ytick_4">
|
| 4069 |
<g id="grid-y--5" class="grid grid-y">
|
| 4070 |
+
<path d="M 57.17 267.147416 L 845.766818 267.147416 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4071 |
</g>
|
| 4072 |
<g id="line2d_12">
|
| 4073 |
<g>
|
| 4074 |
+
<use ns4:href="#m0fca2865ba" x="57.17" y="267.147416" style="stroke: #000000; stroke-width: 0.8" />
|
| 4075 |
</g>
|
| 4076 |
</g>
|
| 4077 |
<g id="text_12">
|
| 4078 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="270.946635" transform="rotate(-0 50.17 270.946635)">750</text>
|
| 4079 |
</g>
|
| 4080 |
</g>
|
| 4081 |
<g id="ytick_5">
|
| 4082 |
<g id="grid-y--6" class="grid grid-y">
|
| 4083 |
+
<path d="M 57.17 206.568642 L 845.766818 206.568642 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4084 |
</g>
|
| 4085 |
<g id="line2d_13">
|
| 4086 |
<g>
|
| 4087 |
+
<use ns4:href="#m0fca2865ba" x="57.17" y="206.568642" style="stroke: #000000; stroke-width: 0.8" />
|
| 4088 |
</g>
|
| 4089 |
</g>
|
| 4090 |
<g id="text_13">
|
| 4091 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="210.367861" transform="rotate(-0 50.17 210.367861)">1000</text>
|
| 4092 |
</g>
|
| 4093 |
</g>
|
| 4094 |
<g id="ytick_6">
|
| 4095 |
<g id="grid-y--7" class="grid grid-y">
|
| 4096 |
+
<path d="M 57.17 145.989867 L 845.766818 145.989867 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4097 |
</g>
|
| 4098 |
<g id="line2d_14">
|
| 4099 |
<g>
|
| 4100 |
+
<use ns4:href="#m0fca2865ba" x="57.17" y="145.989867" style="stroke: #000000; stroke-width: 0.8" />
|
| 4101 |
</g>
|
| 4102 |
</g>
|
| 4103 |
<g id="text_14">
|
| 4104 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="149.789086" transform="rotate(-0 50.17 149.789086)">1250</text>
|
| 4105 |
</g>
|
| 4106 |
</g>
|
| 4107 |
<g id="ytick_7">
|
| 4108 |
<g id="grid-y--8" class="grid grid-y">
|
| 4109 |
+
<path d="M 57.17 85.411093 L 845.766818 85.411093 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4110 |
</g>
|
| 4111 |
<g id="line2d_15">
|
| 4112 |
<g>
|
| 4113 |
+
<use ns4:href="#m0fca2865ba" x="57.17" y="85.411093" style="stroke: #000000; stroke-width: 0.8" />
|
| 4114 |
</g>
|
| 4115 |
</g>
|
| 4116 |
<g id="text_15">
|
| 4117 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="89.210312" transform="rotate(-0 50.17 89.210312)">1500</text>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4118 |
</g>
|
| 4119 |
</g>
|
| 4120 |
<g id="label--y" class="ylabel">
|
| 4121 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="18.640312" y="247.598635" transform="rotate(-90 18.640312 247.598635)">Latency P50 (ms)</text>
|
| 4122 |
</g>
|
| 4123 |
</g>
|
| 4124 |
<g id="series--binned-torch" class="series">
|
| 4125 |
+
<path d="M 93.01531 410.663437 L 195.430481 399.252405 L 297.845652 356.001516 L 400.260823 346.76844 L 502.675995 261.404682 L 605.091166 245.335532 L 707.506337 82.088892 L 809.921508 46.94533 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 4126 |
<defs>
|
| 4127 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4128 |
</defs>
|
| 4129 |
+
<g clip-path="url(#p5307ca50d8)">
|
| 4130 |
+
<use ns4:href="#md7efaf3aec" x="93.01531" y="410.663437" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4131 |
+
<use ns4:href="#md7efaf3aec" x="195.430481" y="399.252405" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4132 |
+
<use ns4:href="#md7efaf3aec" x="297.845652" y="356.001516" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4133 |
+
<use ns4:href="#md7efaf3aec" x="400.260823" y="346.76844" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4134 |
+
<use ns4:href="#md7efaf3aec" x="502.675995" y="261.404682" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4135 |
+
<use ns4:href="#md7efaf3aec" x="605.091166" y="245.335532" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4136 |
+
<use ns4:href="#md7efaf3aec" x="707.506337" y="82.088892" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4137 |
+
<use ns4:href="#md7efaf3aec" x="809.921508" y="46.94533" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4138 |
</g>
|
| 4139 |
</g>
|
| 4140 |
<g id="series--gpt-oss-experts" class="series">
|
| 4141 |
+
<path d="M 93.01531 448.251939 L 195.430481 447.950554 L 297.845652 447.969402 L 400.260823 447.623067 L 502.675995 447.274263 L 605.091166 447.115423 L 707.506337 445.70441 L 809.921508 445.680583 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 4142 |
<defs>
|
| 4143 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4144 |
</defs>
|
| 4145 |
+
<g clip-path="url(#p5307ca50d8)">
|
| 4146 |
+
<use ns4:href="#m9b8c54d372" x="93.01531" y="448.251939" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4147 |
+
<use ns4:href="#m9b8c54d372" x="195.430481" y="447.950554" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4148 |
+
<use ns4:href="#m9b8c54d372" x="297.845652" y="447.969402" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4149 |
+
<use ns4:href="#m9b8c54d372" x="400.260823" y="447.623067" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4150 |
+
<use ns4:href="#m9b8c54d372" x="502.675995" y="447.274263" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4151 |
+
<use ns4:href="#m9b8c54d372" x="605.091166" y="447.115423" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4152 |
+
<use ns4:href="#m9b8c54d372" x="707.506337" y="445.70441" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4153 |
+
<use ns4:href="#m9b8c54d372" x="809.921508" y="445.680583" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4154 |
</g>
|
| 4155 |
</g>
|
| 4156 |
<g id="patch_3">
|
| 4157 |
+
<path d="M 57.17 468.317269 L 57.17 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
|
| 4158 |
</g>
|
| 4159 |
<g id="patch_4">
|
| 4160 |
<path d="M 845.766818 468.317269 L 845.766818 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
|
| 4161 |
</g>
|
| 4162 |
<g id="patch_5">
|
| 4163 |
+
<path d="M 57.17 468.317269 L 845.766818 468.317269 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
|
| 4164 |
</g>
|
| 4165 |
<g id="patch_6">
|
| 4166 |
+
<path d="M 57.17 26.88 L 845.766818 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
|
| 4167 |
</g>
|
| 4168 |
+
<g id="text_16">
|
| 4169 |
+
<text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="451.468409" y="20.88" transform="rotate(-0 451.468409 20.88)">Attention Implementation Latency</text>
|
| 4170 |
</g>
|
| 4171 |
<g id="legend" class="legend">
|
| 4172 |
<g id="patch_7">
|
| 4173 |
+
<path d="M 64.17 64.7925 L 176.96375 64.7925 Q 178.96375 64.7925 178.96375 62.7925 L 178.96375 33.88 Q 178.96375 31.88 176.96375 31.88 L 64.17 31.88 Q 62.17 31.88 62.17 33.88 L 62.17 62.7925 Q 62.17 64.7925 64.17 64.7925 L 64.17 64.7925 z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
|
| 4174 |
</g>
|
| 4175 |
+
<g id="line2d_16">
|
| 4176 |
+
<path d="M 66.17 39.978438 L 76.17 39.978438 L 86.17 39.978438 " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 4177 |
<g>
|
| 4178 |
+
<use ns4:href="#md7efaf3aec" x="76.17" y="39.978438" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4179 |
</g>
|
| 4180 |
</g>
|
| 4181 |
<g id="legend-label--binned-torch" class="legend">
|
| 4182 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="94.17" y="43.478438" transform="rotate(-0 94.17 43.478438)">binned_torch</text>
|
| 4183 |
</g>
|
| 4184 |
+
<g id="line2d_17">
|
| 4185 |
+
<path d="M 66.17 54.934687 L 76.17 54.934687 L 86.17 54.934687 " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 4186 |
<g>
|
| 4187 |
+
<use ns4:href="#m9b8c54d372" x="76.17" y="54.934687" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4188 |
</g>
|
| 4189 |
</g>
|
| 4190 |
<g id="legend-label--gpt-oss-experts" class="legend">
|
| 4191 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="94.17" y="58.434687" transform="rotate(-0 94.17 58.434687)">gpt_oss_experts</text>
|
| 4192 |
</g>
|
| 4193 |
</g>
|
| 4194 |
</g>
|
| 4195 |
</g>
|
| 4196 |
<defs>
|
| 4197 |
+
<clipPath id="p5307ca50d8">
|
| 4198 |
+
<rect x="57.17" y="26.88" width="788.596818" height="441.437269" />
|
| 4199 |
</clipPath>
|
| 4200 |
</defs>
|
| 4201 |
</svg>
|
|
|
|
| 4208 |
<span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
|
| 4209 |
<span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4210 |
</span> |
|
| 4211 |
+
Cell: combine | 4.53s
|
| 4212 |
| <button class="run-btn" onclick="runCell('combine')">▶ run</button>
|
| 4213 |
<button class="copy-btn" onclick="copyCell('combine')">Copy</button>
|
| 4214 |
<a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 4297 |
COMBINED BENCHMARK SUMMARY
|
| 4298 |
|
| 4299 |
impl wl p50(ms) ok
|
| 4300 |
+
binned_torch cuda_B1_S1024_E2 383.31 True
|
| 4301 |
+
binned_torch cuda_B1_S1024_E4 421.42 True
|
| 4302 |
+
binned_torch cuda_B1_S512_E2 157.73 True
|
| 4303 |
+
binned_torch cuda_B1_S512_E4 204.82 True
|
| 4304 |
+
binned_torch cuda_B4_S1024_E2 1513.71 True
|
| 4305 |
+
binned_torch cuda_B4_S1024_E4 1658.74 True
|
| 4306 |
+
binned_torch cuda_B4_S512_E2 773.70 True
|
| 4307 |
+
binned_torch cuda_B4_S512_E4 840.01 True
|
| 4308 |
+
gpt_oss_experts cuda_B1_S1024_E2 3.77 True
|
| 4309 |
+
gpt_oss_experts cuda_B1_S1024_E4 5.20 True
|
| 4310 |
+
gpt_oss_experts cuda_B1_S512_E2 2.61 True
|
| 4311 |
+
gpt_oss_experts cuda_B1_S512_E4 3.85 True
|
| 4312 |
+
gpt_oss_experts cuda_B4_S1024_E2 13.12 True
|
| 4313 |
+
gpt_oss_experts cuda_B4_S1024_E4 13.22 True
|
| 4314 |
+
gpt_oss_experts cuda_B4_S512_E2 6.64 True
|
| 4315 |
+
gpt_oss_experts cuda_B4_S512_E4 7.30 True
|
| 4316 |
|
| 4317 |
GENERATING COMBINED VISUALIZATION
|
| 4318 |
|
|
|
|
| 4332 |
<div class="uv-install-logs" id="uv-logs-combine">
|
| 4333 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4334 |
<div class="uv-logs-content" style="display: none;">
|
| 4335 |
+
Installed 37 packages in 270ms
|
| 4336 |
</div>
|
| 4337 |
</div>
|
| 4338 |
<div class="cell-artifacts">
|
|
|
|
| 4345 |
<rdf:RDF>
|
| 4346 |
<ns2:Work>
|
| 4347 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 4348 |
+
<dc:date>2025-12-19T19:10:00.094905</dc:date>
|
| 4349 |
<dc:format>image/svg+xml</dc:format>
|
| 4350 |
<dc:creator>
|
| 4351 |
<ns2:Agent>
|
| 4352 |
+
<dc:title>Matplotlib v3.10.8, https://matplotlib.org/</dc:title>
|
| 4353 |
</ns2:Agent>
|
| 4354 |
</dc:creator>
|
| 4355 |
</ns2:Work>
|
|
|
|
| 4364 |
</g>
|
| 4365 |
<g id="axes--1" class="axes">
|
| 4366 |
<g id="patch_2">
|
| 4367 |
+
<path d="M 57.17 468.317269 L 845.766818 468.317269 L 845.766818 26.88 L 57.17 26.88 L 57.17 468.317269 z " style="fill: none" />
|
| 4368 |
</g>
|
| 4369 |
<g id="matplotlib.axis_1">
|
| 4370 |
<g id="xtick_1">
|
| 4371 |
<g id="grid-x--1" class="grid grid-x">
|
| 4372 |
+
<path d="M 93.01531 468.317269 L 93.01531 26.88 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4373 |
</g>
|
| 4374 |
<g id="line2d_1">
|
| 4375 |
<defs>
|
| 4376 |
<path id="mafb3703e5b" d="M 0 0 L 0 3.5 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4377 |
</defs>
|
| 4378 |
<g>
|
| 4379 |
+
<use ns4:href="#mafb3703e5b" x="93.01531" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
|
| 4380 |
</g>
|
| 4381 |
</g>
|
| 4382 |
<g id="text_1">
|
| 4383 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(62.817431 544.791615) rotate(-45)">cuda_B1_S512_E2</text>
|
| 4384 |
</g>
|
| 4385 |
</g>
|
| 4386 |
<g id="xtick_2">
|
| 4387 |
<g id="grid-x--2" class="grid grid-x">
|
| 4388 |
+
<path d="M 195.430481 468.317269 L 195.430481 26.88 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4389 |
</g>
|
| 4390 |
<g id="line2d_2">
|
| 4391 |
<g>
|
| 4392 |
+
<use ns4:href="#mafb3703e5b" x="195.430481" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
|
| 4393 |
</g>
|
| 4394 |
</g>
|
| 4395 |
<g id="text_2">
|
| 4396 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(165.232602 544.791615) rotate(-45)">cuda_B1_S512_E4</text>
|
| 4397 |
</g>
|
| 4398 |
</g>
|
| 4399 |
<g id="xtick_3">
|
| 4400 |
<g id="grid-x--3" class="grid grid-x">
|
| 4401 |
+
<path d="M 297.845652 468.317269 L 297.845652 26.88 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4402 |
</g>
|
| 4403 |
<g id="line2d_3">
|
| 4404 |
<g>
|
| 4405 |
+
<use ns4:href="#mafb3703e5b" x="297.845652" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
|
| 4406 |
</g>
|
| 4407 |
</g>
|
| 4408 |
<g id="text_3">
|
| 4409 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(265.39829 549.290582) rotate(-45)">cuda_B1_S1024_E2</text>
|
| 4410 |
</g>
|
| 4411 |
</g>
|
| 4412 |
<g id="xtick_4">
|
| 4413 |
<g id="grid-x--4" class="grid grid-x">
|
| 4414 |
+
<path d="M 400.260823 468.317269 L 400.260823 26.88 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4415 |
</g>
|
| 4416 |
<g id="line2d_4">
|
| 4417 |
<g>
|
| 4418 |
+
<use ns4:href="#mafb3703e5b" x="400.260823" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
|
| 4419 |
</g>
|
| 4420 |
</g>
|
| 4421 |
<g id="text_4">
|
| 4422 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(367.813461 549.290582) rotate(-45)">cuda_B1_S1024_E4</text>
|
| 4423 |
</g>
|
| 4424 |
</g>
|
| 4425 |
<g id="xtick_5">
|
| 4426 |
<g id="grid-x--5" class="grid grid-x">
|
| 4427 |
+
<path d="M 502.675995 468.317269 L 502.675995 26.88 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4428 |
</g>
|
| 4429 |
<g id="line2d_5">
|
| 4430 |
<g>
|
| 4431 |
+
<use ns4:href="#mafb3703e5b" x="502.675995" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
|
| 4432 |
</g>
|
| 4433 |
</g>
|
| 4434 |
<g id="text_5">
|
| 4435 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(472.478116 544.791615) rotate(-45)">cuda_B4_S512_E2</text>
|
| 4436 |
</g>
|
| 4437 |
</g>
|
| 4438 |
<g id="xtick_6">
|
| 4439 |
<g id="grid-x--6" class="grid grid-x">
|
| 4440 |
+
<path d="M 605.091166 468.317269 L 605.091166 26.88 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4441 |
</g>
|
| 4442 |
<g id="line2d_6">
|
| 4443 |
<g>
|
| 4444 |
+
<use ns4:href="#mafb3703e5b" x="605.091166" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
|
| 4445 |
</g>
|
| 4446 |
</g>
|
| 4447 |
<g id="text_6">
|
| 4448 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(574.893287 544.791615) rotate(-45)">cuda_B4_S512_E4</text>
|
| 4449 |
</g>
|
| 4450 |
</g>
|
| 4451 |
<g id="xtick_7">
|
| 4452 |
<g id="grid-x--7" class="grid grid-x">
|
| 4453 |
+
<path d="M 707.506337 468.317269 L 707.506337 26.88 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4454 |
</g>
|
| 4455 |
<g id="line2d_7">
|
| 4456 |
<g>
|
| 4457 |
+
<use ns4:href="#mafb3703e5b" x="707.506337" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
|
| 4458 |
</g>
|
| 4459 |
</g>
|
| 4460 |
<g id="text_7">
|
| 4461 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(675.058975 549.290582) rotate(-45)">cuda_B4_S1024_E2</text>
|
| 4462 |
</g>
|
| 4463 |
</g>
|
| 4464 |
<g id="xtick_8">
|
| 4465 |
<g id="grid-x--8" class="grid grid-x">
|
| 4466 |
+
<path d="M 809.921508 468.317269 L 809.921508 26.88 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4467 |
</g>
|
| 4468 |
<g id="line2d_8">
|
| 4469 |
<g>
|
| 4470 |
+
<use ns4:href="#mafb3703e5b" x="809.921508" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
|
| 4471 |
</g>
|
| 4472 |
</g>
|
| 4473 |
<g id="text_8">
|
| 4474 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(777.474146 549.290582) rotate(-45)">cuda_B4_S1024_E4</text>
|
| 4475 |
</g>
|
| 4476 |
</g>
|
| 4477 |
<g id="label--x" class="xlabel">
|
| 4478 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="451.468409" y="562.556245" transform="rotate(-0 451.468409 562.556245)">Workload</text>
|
| 4479 |
</g>
|
| 4480 |
</g>
|
| 4481 |
<g id="matplotlib.axis_2">
|
| 4482 |
<g id="ytick_1">
|
| 4483 |
<g id="grid-y--2" class="grid grid-y">
|
| 4484 |
+
<path d="M 57.17 448.88374 L 845.766818 448.88374 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4485 |
</g>
|
| 4486 |
<g id="line2d_9">
|
| 4487 |
<defs>
|
| 4488 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4489 |
</defs>
|
| 4490 |
<g>
|
| 4491 |
+
<use ns4:href="#m0fca2865ba" x="57.17" y="448.88374" style="stroke: #000000; stroke-width: 0.8" />
|
| 4492 |
</g>
|
| 4493 |
</g>
|
| 4494 |
<g id="text_9">
|
| 4495 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="452.682959" transform="rotate(-0 50.17 452.682959)">0</text>
|
| 4496 |
</g>
|
| 4497 |
</g>
|
| 4498 |
<g id="ytick_2">
|
| 4499 |
<g id="grid-y--3" class="grid grid-y">
|
| 4500 |
+
<path d="M 57.17 388.304965 L 845.766818 388.304965 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4501 |
</g>
|
| 4502 |
<g id="line2d_10">
|
| 4503 |
<g>
|
| 4504 |
+
<use ns4:href="#m0fca2865ba" x="57.17" y="388.304965" style="stroke: #000000; stroke-width: 0.8" />
|
| 4505 |
</g>
|
| 4506 |
</g>
|
| 4507 |
<g id="text_10">
|
| 4508 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="392.104184" transform="rotate(-0 50.17 392.104184)">250</text>
|
| 4509 |
</g>
|
| 4510 |
</g>
|
| 4511 |
<g id="ytick_3">
|
| 4512 |
<g id="grid-y--4" class="grid grid-y">
|
| 4513 |
+
<path d="M 57.17 327.726191 L 845.766818 327.726191 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4514 |
</g>
|
| 4515 |
<g id="line2d_11">
|
| 4516 |
<g>
|
| 4517 |
+
<use ns4:href="#m0fca2865ba" x="57.17" y="327.726191" style="stroke: #000000; stroke-width: 0.8" />
|
| 4518 |
</g>
|
| 4519 |
</g>
|
| 4520 |
<g id="text_11">
|
| 4521 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="331.52541" transform="rotate(-0 50.17 331.52541)">500</text>
|
| 4522 |
</g>
|
| 4523 |
</g>
|
| 4524 |
<g id="ytick_4">
|
| 4525 |
<g id="grid-y--5" class="grid grid-y">
|
| 4526 |
+
<path d="M 57.17 267.147416 L 845.766818 267.147416 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4527 |
</g>
|
| 4528 |
<g id="line2d_12">
|
| 4529 |
<g>
|
| 4530 |
+
<use ns4:href="#m0fca2865ba" x="57.17" y="267.147416" style="stroke: #000000; stroke-width: 0.8" />
|
| 4531 |
</g>
|
| 4532 |
</g>
|
| 4533 |
<g id="text_12">
|
| 4534 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="270.946635" transform="rotate(-0 50.17 270.946635)">750</text>
|
| 4535 |
</g>
|
| 4536 |
</g>
|
| 4537 |
<g id="ytick_5">
|
| 4538 |
<g id="grid-y--6" class="grid grid-y">
|
| 4539 |
+
<path d="M 57.17 206.568642 L 845.766818 206.568642 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4540 |
</g>
|
| 4541 |
<g id="line2d_13">
|
| 4542 |
<g>
|
| 4543 |
+
<use ns4:href="#m0fca2865ba" x="57.17" y="206.568642" style="stroke: #000000; stroke-width: 0.8" />
|
| 4544 |
</g>
|
| 4545 |
</g>
|
| 4546 |
<g id="text_13">
|
| 4547 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="210.367861" transform="rotate(-0 50.17 210.367861)">1000</text>
|
| 4548 |
</g>
|
| 4549 |
</g>
|
| 4550 |
<g id="ytick_6">
|
| 4551 |
<g id="grid-y--7" class="grid grid-y">
|
| 4552 |
+
<path d="M 57.17 145.989867 L 845.766818 145.989867 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4553 |
</g>
|
| 4554 |
<g id="line2d_14">
|
| 4555 |
<g>
|
| 4556 |
+
<use ns4:href="#m0fca2865ba" x="57.17" y="145.989867" style="stroke: #000000; stroke-width: 0.8" />
|
| 4557 |
</g>
|
| 4558 |
</g>
|
| 4559 |
<g id="text_14">
|
| 4560 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="149.789086" transform="rotate(-0 50.17 149.789086)">1250</text>
|
| 4561 |
</g>
|
| 4562 |
</g>
|
| 4563 |
<g id="ytick_7">
|
| 4564 |
<g id="grid-y--8" class="grid grid-y">
|
| 4565 |
+
<path d="M 57.17 85.411093 L 845.766818 85.411093 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4566 |
</g>
|
| 4567 |
<g id="line2d_15">
|
| 4568 |
<g>
|
| 4569 |
+
<use ns4:href="#m0fca2865ba" x="57.17" y="85.411093" style="stroke: #000000; stroke-width: 0.8" />
|
| 4570 |
</g>
|
| 4571 |
</g>
|
| 4572 |
<g id="text_15">
|
| 4573 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="89.210312" transform="rotate(-0 50.17 89.210312)">1500</text>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4574 |
</g>
|
| 4575 |
</g>
|
| 4576 |
<g id="label--y" class="ylabel">
|
| 4577 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="18.640312" y="247.598635" transform="rotate(-90 18.640312 247.598635)">Latency P50 (ms)</text>
|
| 4578 |
</g>
|
| 4579 |
</g>
|
| 4580 |
<g id="series--binned-torch" class="series">
|
| 4581 |
+
<path d="M 93.01531 410.663437 L 195.430481 399.252405 L 297.845652 356.001516 L 400.260823 346.76844 L 502.675995 261.404682 L 605.091166 245.335532 L 707.506337 82.088892 L 809.921508 46.94533 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 4582 |
<defs>
|
| 4583 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4584 |
</defs>
|
| 4585 |
+
<g clip-path="url(#p5307ca50d8)">
|
| 4586 |
+
<use ns4:href="#md7efaf3aec" x="93.01531" y="410.663437" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4587 |
+
<use ns4:href="#md7efaf3aec" x="195.430481" y="399.252405" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4588 |
+
<use ns4:href="#md7efaf3aec" x="297.845652" y="356.001516" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4589 |
+
<use ns4:href="#md7efaf3aec" x="400.260823" y="346.76844" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4590 |
+
<use ns4:href="#md7efaf3aec" x="502.675995" y="261.404682" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4591 |
+
<use ns4:href="#md7efaf3aec" x="605.091166" y="245.335532" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4592 |
+
<use ns4:href="#md7efaf3aec" x="707.506337" y="82.088892" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4593 |
+
<use ns4:href="#md7efaf3aec" x="809.921508" y="46.94533" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4594 |
</g>
|
| 4595 |
</g>
|
| 4596 |
<g id="series--gpt-oss-experts" class="series">
|
| 4597 |
+
<path d="M 93.01531 448.251939 L 195.430481 447.950554 L 297.845652 447.969402 L 400.260823 447.623067 L 502.675995 447.274263 L 605.091166 447.115423 L 707.506337 445.70441 L 809.921508 445.680583 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 4598 |
<defs>
|
| 4599 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4600 |
</defs>
|
| 4601 |
+
<g clip-path="url(#p5307ca50d8)">
|
| 4602 |
+
<use ns4:href="#m9b8c54d372" x="93.01531" y="448.251939" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4603 |
+
<use ns4:href="#m9b8c54d372" x="195.430481" y="447.950554" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4604 |
+
<use ns4:href="#m9b8c54d372" x="297.845652" y="447.969402" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4605 |
+
<use ns4:href="#m9b8c54d372" x="400.260823" y="447.623067" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4606 |
+
<use ns4:href="#m9b8c54d372" x="502.675995" y="447.274263" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4607 |
+
<use ns4:href="#m9b8c54d372" x="605.091166" y="447.115423" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4608 |
+
<use ns4:href="#m9b8c54d372" x="707.506337" y="445.70441" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4609 |
+
<use ns4:href="#m9b8c54d372" x="809.921508" y="445.680583" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4610 |
</g>
|
| 4611 |
</g>
|
| 4612 |
<g id="patch_3">
|
| 4613 |
+
<path d="M 57.17 468.317269 L 57.17 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
|
| 4614 |
</g>
|
| 4615 |
<g id="patch_4">
|
| 4616 |
<path d="M 845.766818 468.317269 L 845.766818 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
|
| 4617 |
</g>
|
| 4618 |
<g id="patch_5">
|
| 4619 |
+
<path d="M 57.17 468.317269 L 845.766818 468.317269 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
|
| 4620 |
</g>
|
| 4621 |
<g id="patch_6">
|
| 4622 |
+
<path d="M 57.17 26.88 L 845.766818 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
|
| 4623 |
</g>
|
| 4624 |
+
<g id="text_16">
|
| 4625 |
+
<text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="451.468409" y="20.88" transform="rotate(-0 451.468409 20.88)">Attention Implementation Latency</text>
|
| 4626 |
</g>
|
| 4627 |
<g id="legend" class="legend">
|
| 4628 |
<g id="patch_7">
|
| 4629 |
+
<path d="M 64.17 64.7925 L 176.96375 64.7925 Q 178.96375 64.7925 178.96375 62.7925 L 178.96375 33.88 Q 178.96375 31.88 176.96375 31.88 L 64.17 31.88 Q 62.17 31.88 62.17 33.88 L 62.17 62.7925 Q 62.17 64.7925 64.17 64.7925 L 64.17 64.7925 z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
|
| 4630 |
</g>
|
| 4631 |
+
<g id="line2d_16">
|
| 4632 |
+
<path d="M 66.17 39.978438 L 76.17 39.978438 L 86.17 39.978438 " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 4633 |
<g>
|
| 4634 |
+
<use ns4:href="#md7efaf3aec" x="76.17" y="39.978438" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4635 |
</g>
|
| 4636 |
</g>
|
| 4637 |
<g id="legend-label--binned-torch" class="legend">
|
| 4638 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="94.17" y="43.478438" transform="rotate(-0 94.17 43.478438)">binned_torch</text>
|
| 4639 |
</g>
|
| 4640 |
+
<g id="line2d_17">
|
| 4641 |
+
<path d="M 66.17 54.934687 L 76.17 54.934687 L 86.17 54.934687 " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 4642 |
<g>
|
| 4643 |
+
<use ns4:href="#m9b8c54d372" x="76.17" y="54.934687" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4644 |
</g>
|
| 4645 |
</g>
|
| 4646 |
<g id="legend-label--gpt-oss-experts" class="legend">
|
| 4647 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="94.17" y="58.434687" transform="rotate(-0 94.17 58.434687)">gpt_oss_experts</text>
|
| 4648 |
</g>
|
| 4649 |
</g>
|
| 4650 |
</g>
|
| 4651 |
</g>
|
| 4652 |
<defs>
|
| 4653 |
+
<clipPath id="p5307ca50d8">
|
| 4654 |
+
<rect x="57.17" y="26.88" width="788.596818" height="441.437269" />
|
| 4655 |
</clipPath>
|
| 4656 |
</defs>
|
| 4657 |
</svg>
|
rotary/impls/artifacts/benchmark/rotary.jsonl
CHANGED
|
@@ -1,24 +1,24 @@
|
|
| 1 |
-
{"ts": "2025-
|
| 2 |
-
{"ts": "2025-
|
| 3 |
-
{"ts": "2025-
|
| 4 |
-
{"ts": "2025-
|
| 5 |
-
{"ts": "2025-
|
| 6 |
-
{"ts": "2025-
|
| 7 |
-
{"ts": "2025-
|
| 8 |
-
{"ts": "2025-
|
| 9 |
-
{"ts": "2025-
|
| 10 |
-
{"ts": "2025-
|
| 11 |
-
{"ts": "2025-
|
| 12 |
-
{"ts": "2025-
|
| 13 |
-
{"ts": "2025-
|
| 14 |
-
{"ts": "2025-
|
| 15 |
-
{"ts": "2025-
|
| 16 |
-
{"ts": "2025-
|
| 17 |
-
{"ts": "2025-
|
| 18 |
-
{"ts": "2025-
|
| 19 |
-
{"ts": "2025-
|
| 20 |
-
{"ts": "2025-
|
| 21 |
-
{"ts": "2025-
|
| 22 |
-
{"ts": "2025-
|
| 23 |
-
{"ts": "2025-
|
| 24 |
-
{"ts": "2025-
|
|
|
|
| 1 |
+
{"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H8_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.17331300000478223, "p50": 0.17603300000246236, "p90": 0.1797429999896849, "mean": 0.1784169999950791, "iqr": 0.0038800000083938357, "raw_times": [0.17603300000246236, 0.17586299998129107, 0.1797429999896849, 0.18713299999717492, 0.17331300000478223], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.18657300000768373, "peak_bytes": 1720320, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
|
| 2 |
+
{"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H8_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21556299998337636, "p50": 0.2165239999953883, "p90": 0.21698299997297, "mean": 0.21635159998822928, "iqr": 0.0013189999776841432, "raw_times": [0.2165239999953883, 0.21698299997297, 0.21566399999528585, 0.21556299998337636, 0.21702399999412592], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21905399995603148, "peak_bytes": 3440640, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
|
| 3 |
+
{"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H32_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21391299998185787, "p50": 0.21503299996084024, "p90": 0.21681300000864212, "mean": 0.21537540000053923, "iqr": 0.0027289999593449465, "raw_times": [0.21503299996084024, 0.21408400004929717, 0.21681300000864212, 0.21703400000205875, 0.21391299998185787], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2214840000078766, "peak_bytes": 6832128, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
|
| 4 |
+
{"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H32_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21356299998842587, "p50": 0.2151840000124139, "p90": 0.2162740000244412, "mean": 0.21532140000317668, "iqr": 0.0011410000411160581, "raw_times": [0.2162740000244412, 0.21513299998332513, 0.2151840000124139, 0.2164530000072773, 0.21356299998842587], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2165939999940747, "peak_bytes": 13664256, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
|
| 5 |
+
{"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H8_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21375400001488742, "p50": 0.21507399998199617, "p90": 0.21535299998731716, "mean": 0.21505959999785773, "iqr": 0.0006099999723119254, "raw_times": [0.21474300001500524, 0.21375400001488742, 0.21535299998731716, 0.21507399998199617, 0.21637399999008267], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2174030000219318, "peak_bytes": 6881280, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
|
| 6 |
+
{"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H8_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2112430000238419, "p50": 0.21400400004267794, "p90": 0.21425299996735703, "mean": 0.21312160000661606, "iqr": 0.002878999964650575, "raw_times": [0.21137400000270645, 0.21425299996735703, 0.21400400004267794, 0.214733999996497, 0.2112430000238419], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22874399996908323, "peak_bytes": 13762560, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
|
| 7 |
+
{"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H32_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21246400001473376, "p50": 0.2133630000002995, "p90": 0.21390399996334963, "mean": 0.2133594000042649, "iqr": 0.0008009999419300584, "raw_times": [0.21396300002152202, 0.21246400001473376, 0.21390399996334963, 0.21310300002141958, 0.2133630000002995], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2195139999798812, "peak_bytes": 27328512, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
|
| 8 |
+
{"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H32_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21415399999114015, "p50": 0.21443299999646115, "p90": 0.2147029999832739, "mean": 0.2148253999962435, "iqr": 0.000368999963029637, "raw_times": [0.21650299999009803, 0.21415399999114015, 0.21443299999646115, 0.21433400002024428, 0.2147029999832739], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2173330000232454, "peak_bytes": 54657024, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
|
| 9 |
+
{"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H8_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21359300001222437, "p50": 0.2138830000149028, "p90": 0.21400299999640993, "mean": 0.21457699999700708, "iqr": 0.00012000003835055395, "raw_times": [0.21400299999640993, 0.2138830000149028, 0.21359300001222437, 0.21752300000343894, 0.21388299995805937], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2191329999732261, "peak_bytes": 27525120, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
|
| 10 |
+
{"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H8_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21296400001347138, "p50": 0.21389400001226022, "p90": 0.21517300001505646, "mean": 0.21466560000362733, "iqr": 0.0013790000252811296, "raw_times": [0.21296400001347138, 0.21517300001505646, 0.21750299998757328, 0.21379399998977533, 0.21389400001226022], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21542299998600356, "peak_bytes": 55050240, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
|
| 11 |
+
{"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H32_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2098030000183826, "p50": 0.21347300003071723, "p90": 0.21457399998325855, "mean": 0.21505920001345658, "iqr": 0.0023309999619414157, "raw_times": [0.2098030000183826, 0.21457399998325855, 0.21347300003071723, 0.21224300002131713, 0.22520300001360738], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21741399996244581, "peak_bytes": 109314048, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
|
| 12 |
+
{"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H32_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.22297399999615664, "p50": 0.22381400003723684, "p90": 0.22385300002270014, "mean": 0.2239618000203336, "iqr": 0.0007890000119914475, "raw_times": [0.2230640000107087, 0.22385300002270014, 0.22381400003723684, 0.22610400003486575, 0.22297399999615664], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22540300000173374, "peak_bytes": 218628096, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
|
| 13 |
+
{"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H8_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21275400001741218, "p50": 0.21372400004793235, "p90": 0.21630299994512825, "mean": 0.22107159999222858, "iqr": 0.0030399999673136335, "raw_times": [0.21372400004793235, 0.24931399997285553, 0.21630299994512825, 0.21326299997781462, 0.21275400001741218], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21886299998641334, "peak_bytes": 68698112, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
|
| 14 |
+
{"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H8_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21458399999119138, "p50": 0.21627299997817317, "p90": 0.21634299997685957, "mean": 0.21600339998713025, "iqr": 0.0007099999947968172, "raw_times": [0.21627299997817317, 0.21718400000736438, 0.21563299998206276, 0.21458399999119138, 0.21634299997685957], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2226130000053672, "peak_bytes": 6848512, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
|
| 15 |
+
{"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H32_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2140940000003866, "p50": 0.215932999992674, "p90": 0.21619400001782196, "mean": 0.21597160000510485, "iqr": 0.0015699999948992627, "raw_times": [0.2140940000003866, 0.2146240000229227, 0.215932999992674, 0.21619400001782196, 0.21901299999171897], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2184540000484958, "peak_bytes": 13647872, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
|
| 16 |
+
{"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H32_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21190300003581797, "p50": 0.21745400005102056, "p90": 0.21756400002459486, "mean": 0.21624960003236993, "iqr": 0.0009400000067216752, "raw_times": [0.21190300003581797, 0.21745400005102056, 0.21756400002459486, 0.2166240000178732, 0.21770300003254306], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.25062399998887486, "peak_bytes": 27295744, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
|
| 17 |
+
{"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H8_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21696300001394775, "p50": 0.21815399998104112, "p90": 0.21820400002070528, "mean": 0.21879360000411907, "iqr": 0.0004510000053414842, "raw_times": [0.2177530000153638, 0.21815399998104112, 0.2228939999895374, 0.21696300001394775, 0.21820400002070528], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2236640000319312, "peak_bytes": 13697024, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
|
| 18 |
+
{"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H8_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21379300000035073, "p50": 0.21643299999141163, "p90": 0.21674399999938032, "mean": 0.21709340001052624, "iqr": 0.00039999997625272954, "raw_times": [0.21379300000035073, 0.21674399999938032, 0.21643299999141163, 0.2163440000231276, 0.22215300003836091], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21868300001415264, "peak_bytes": 27394048, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
|
| 19 |
+
{"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H32_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2133630000002995, "p50": 0.21632300001783733, "p90": 0.21671399997558183, "mean": 0.21582319999424726, "iqr": 0.0009309999882134434, "raw_times": [0.21693299999014926, 0.2133630000002995, 0.21632300001783733, 0.21671399997558183, 0.21578299998736838], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2180729999849973, "peak_bytes": 54591488, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
|
| 20 |
+
{"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H32_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21294399999760572, "p50": 0.21446299996341622, "p90": 0.21984300002486634, "mean": 0.21647359999406035, "iqr": 0.006489000043075066, "raw_times": [0.21335399998179128, 0.2217640000026222, 0.21984300002486634, 0.21294399999760572, 0.21446299996341622], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21826299996519083, "peak_bytes": 109182976, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
|
| 21 |
+
{"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H8_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21578299998736838, "p50": 0.21700399997826025, "p90": 0.2204729999562005, "mean": 0.21918559997402554, "iqr": 0.004118999981983507, "raw_times": [0.21700399997826025, 0.22631399997408153, 0.2204729999562005, 0.216353999974217, 0.21578299998736838], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22091399995360916, "peak_bytes": 54788096, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
|
| 22 |
+
{"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H8_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2135729999963587, "p50": 0.2144540000017514, "p90": 0.2173039999888715, "mean": 0.21536960000503313, "iqr": 0.003270999968663091, "raw_times": [0.21403300002020842, 0.2144540000017514, 0.2135729999963587, 0.2173039999888715, 0.21748400001797563], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22203300000001036, "peak_bytes": 109576192, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
|
| 23 |
+
{"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H32_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.22941399998899215, "p50": 0.23028300000760282, "p90": 0.23160400002097958, "mean": 0.23061779999125065, "iqr": 0.0017800000478018774, "raw_times": [0.231963999965501, 0.22941399998899215, 0.2298239999731777, 0.23160400002097958, 0.23028300000760282], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.23190299998532282, "peak_bytes": 218365952, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
|
| 24 |
+
{"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H32_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.6428210000422041, "p50": 0.6484909999926458, "p90": 0.6486400000085268, "mean": 0.6472164000115299, "iqr": 0.0035200000070290116, "raw_times": [0.651010000012775, 0.6428210000422041, 0.6486400000085268, 0.6451200000014978, 0.6484909999926458], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.6451109999829896, "peak_bytes": 436731904, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
|
rotary/impls/cells/benchmark.py
CHANGED
|
@@ -4,7 +4,6 @@
|
|
| 4 |
# "numpy",
|
| 5 |
# "torch==2.8.0",
|
| 6 |
# "kernels-benchmark-tools",
|
| 7 |
-
# "kernels",
|
| 8 |
# ]
|
| 9 |
#
|
| 10 |
# [tool.uv.sources]
|
|
@@ -13,36 +12,46 @@
|
|
| 13 |
import torch
|
| 14 |
import sys
|
| 15 |
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
|
| 16 |
-
from kernels import get_kernel
|
| 17 |
|
| 18 |
-
# Load the rotary kernel
|
| 19 |
-
rotary = get_kernel("kernels-community/rotary")
|
| 20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
|
| 22 |
-
|
|
|
|
| 23 |
rotary_dim = cos.shape[-1]
|
| 24 |
|
| 25 |
-
# Clone to avoid modifying
|
| 26 |
q_out = query.clone()
|
| 27 |
k_out = key.clone()
|
| 28 |
|
| 29 |
# Apply rotation to query
|
| 30 |
q1 = q_out[..., :rotary_dim]
|
| 31 |
q2 = q_out[..., rotary_dim : 2 * rotary_dim]
|
| 32 |
-
|
|
|
|
|
|
|
| 33 |
|
| 34 |
# Apply rotation to key
|
| 35 |
k1 = k_out[..., :rotary_dim]
|
| 36 |
k2 = k_out[..., rotary_dim : 2 * rotary_dim]
|
| 37 |
-
|
|
|
|
|
|
|
| 38 |
|
| 39 |
return q_out, k_out
|
| 40 |
|
| 41 |
|
| 42 |
run_benchmark(
|
| 43 |
kernel_type=KernelTypeEnum.ROTARY,
|
| 44 |
-
impl_name="
|
| 45 |
-
impl_tags={"family": "
|
| 46 |
-
impl_func=
|
| 47 |
-
dtype="float32",
|
| 48 |
)
|
|
|
|
| 4 |
# "numpy",
|
| 5 |
# "torch==2.8.0",
|
| 6 |
# "kernels-benchmark-tools",
|
|
|
|
| 7 |
# ]
|
| 8 |
#
|
| 9 |
# [tool.uv.sources]
|
|
|
|
| 12 |
import torch
|
| 13 |
import sys
|
| 14 |
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
|
|
|
|
| 15 |
|
|
|
|
|
|
|
| 16 |
|
| 17 |
+
def apply_rotary_torch(x1, x2, cos, sin, conj=False):
|
| 18 |
+
"""Reference rotary implementation."""
|
| 19 |
+
if not conj:
|
| 20 |
+
out1 = x1 * cos - x2 * sin
|
| 21 |
+
out2 = x1 * sin + x2 * cos
|
| 22 |
+
else:
|
| 23 |
+
out1 = x1 * cos + x2 * sin
|
| 24 |
+
out2 = -x1 * sin + x2 * cos
|
| 25 |
+
return out1, out2
|
| 26 |
|
| 27 |
+
|
| 28 |
+
def torch_rotary(query, key, cos, sin, conj=False):
|
| 29 |
rotary_dim = cos.shape[-1]
|
| 30 |
|
| 31 |
+
# Clone inputs to avoid modifying them
|
| 32 |
q_out = query.clone()
|
| 33 |
k_out = key.clone()
|
| 34 |
|
| 35 |
# Apply rotation to query
|
| 36 |
q1 = q_out[..., :rotary_dim]
|
| 37 |
q2 = q_out[..., rotary_dim : 2 * rotary_dim]
|
| 38 |
+
q_out_1, q_out_2 = apply_rotary_torch(q1, q2, cos, sin, conj)
|
| 39 |
+
q_out[..., :rotary_dim] = q_out_1
|
| 40 |
+
q_out[..., rotary_dim : 2 * rotary_dim] = q_out_2
|
| 41 |
|
| 42 |
# Apply rotation to key
|
| 43 |
k1 = k_out[..., :rotary_dim]
|
| 44 |
k2 = k_out[..., rotary_dim : 2 * rotary_dim]
|
| 45 |
+
k_out_1, k_out_2 = apply_rotary_torch(k1, k2, cos, sin, conj)
|
| 46 |
+
k_out[..., :rotary_dim] = k_out_1
|
| 47 |
+
k_out[..., rotary_dim : 2 * rotary_dim] = k_out_2
|
| 48 |
|
| 49 |
return q_out, k_out
|
| 50 |
|
| 51 |
|
| 52 |
run_benchmark(
|
| 53 |
kernel_type=KernelTypeEnum.ROTARY,
|
| 54 |
+
impl_name="torch_eager",
|
| 55 |
+
impl_tags={"family": "pytorch", "backend": "eager"},
|
| 56 |
+
impl_func=torch_rotary,
|
|
|
|
| 57 |
)
|
rotary/impls/hf_kernels_rotary.html
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
rotary/impls/torch_rotary.html
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
rotary/index.html
CHANGED
|
@@ -3874,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 3874 |
<div class="system-info">
|
| 3875 |
<div class="system-info-header">Generated on:</div>
|
| 3876 |
<div class="system-info-content">
|
| 3877 |
-
Linux x86_64 | Linux-6.12.
|
| 3878 |
</div>
|
| 3879 |
</div>
|
| 3880 |
|
|
|
|
| 3874 |
<div class="system-info">
|
| 3875 |
<div class="system-info-header">Generated on:</div>
|
| 3876 |
<div class="system-info-content">
|
| 3877 |
+
Linux x86_64 | Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35
|
| 3878 |
</div>
|
| 3879 |
</div>
|
| 3880 |
|
rotary/results/artifacts/combine/latency.svg
CHANGED
|
|
Git LFS Details
|
|
|
Git LFS Details
|
rotary/results/combined_results.html
CHANGED
|
@@ -3874,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 3874 |
<div class="system-info">
|
| 3875 |
<div class="system-info-header">Generated on:</div>
|
| 3876 |
<div class="system-info-content">
|
| 3877 |
-
Linux x86_64 | Linux-6.12.
|
| 3878 |
</div>
|
| 3879 |
</div>
|
| 3880 |
|
|
@@ -3889,11 +3889,11 @@ body[data-tool="eraser"] .main-content {
|
|
| 3889 |
<rdf:RDF>
|
| 3890 |
<ns2:Work>
|
| 3891 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 3892 |
-
<dc:date>2025-
|
| 3893 |
<dc:format>image/svg+xml</dc:format>
|
| 3894 |
<dc:creator>
|
| 3895 |
<ns2:Agent>
|
| 3896 |
-
<dc:title>Matplotlib v3.10.
|
| 3897 |
</ns2:Agent>
|
| 3898 |
</dc:creator>
|
| 3899 |
</ns2:Work>
|
|
@@ -4233,109 +4233,109 @@ body[data-tool="eraser"] .main-content {
|
|
| 4233 |
<g id="matplotlib.axis_2">
|
| 4234 |
<g id="ytick_1">
|
| 4235 |
<g id="grid-y--2" class="grid grid-y">
|
| 4236 |
-
<path d="M 47.72
|
| 4237 |
</g>
|
| 4238 |
<g id="line2d_25">
|
| 4239 |
<defs>
|
| 4240 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4241 |
</defs>
|
| 4242 |
<g>
|
| 4243 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="
|
| 4244 |
</g>
|
| 4245 |
</g>
|
| 4246 |
<g id="text_25">
|
| 4247 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="
|
| 4248 |
</g>
|
| 4249 |
</g>
|
| 4250 |
<g id="ytick_2">
|
| 4251 |
<g id="grid-y--3" class="grid grid-y">
|
| 4252 |
-
<path d="M 47.72
|
| 4253 |
</g>
|
| 4254 |
<g id="line2d_26">
|
| 4255 |
<g>
|
| 4256 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="
|
| 4257 |
</g>
|
| 4258 |
</g>
|
| 4259 |
<g id="text_26">
|
| 4260 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="
|
| 4261 |
</g>
|
| 4262 |
</g>
|
| 4263 |
<g id="ytick_3">
|
| 4264 |
<g id="grid-y--4" class="grid grid-y">
|
| 4265 |
-
<path d="M 47.72
|
| 4266 |
</g>
|
| 4267 |
<g id="line2d_27">
|
| 4268 |
<g>
|
| 4269 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="
|
| 4270 |
</g>
|
| 4271 |
</g>
|
| 4272 |
<g id="text_27">
|
| 4273 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="
|
| 4274 |
</g>
|
| 4275 |
</g>
|
| 4276 |
<g id="ytick_4">
|
| 4277 |
<g id="grid-y--5" class="grid grid-y">
|
| 4278 |
-
<path d="M 47.72 253.
|
| 4279 |
</g>
|
| 4280 |
<g id="line2d_28">
|
| 4281 |
<g>
|
| 4282 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="253.
|
| 4283 |
</g>
|
| 4284 |
</g>
|
| 4285 |
<g id="text_28">
|
| 4286 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="
|
| 4287 |
</g>
|
| 4288 |
</g>
|
| 4289 |
<g id="ytick_5">
|
| 4290 |
<g id="grid-y--6" class="grid grid-y">
|
| 4291 |
-
<path d="M 47.72 206.
|
| 4292 |
</g>
|
| 4293 |
<g id="line2d_29">
|
| 4294 |
<g>
|
| 4295 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="206.
|
| 4296 |
</g>
|
| 4297 |
</g>
|
| 4298 |
<g id="text_29">
|
| 4299 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="210.
|
| 4300 |
</g>
|
| 4301 |
</g>
|
| 4302 |
<g id="ytick_6">
|
| 4303 |
<g id="grid-y--7" class="grid grid-y">
|
| 4304 |
-
<path d="M 47.72 159.
|
| 4305 |
</g>
|
| 4306 |
<g id="line2d_30">
|
| 4307 |
<g>
|
| 4308 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="159.
|
| 4309 |
</g>
|
| 4310 |
</g>
|
| 4311 |
<g id="text_30">
|
| 4312 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="163.
|
| 4313 |
</g>
|
| 4314 |
</g>
|
| 4315 |
<g id="ytick_7">
|
| 4316 |
<g id="grid-y--8" class="grid grid-y">
|
| 4317 |
-
<path d="M 47.72
|
| 4318 |
</g>
|
| 4319 |
<g id="line2d_31">
|
| 4320 |
<g>
|
| 4321 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="
|
| 4322 |
</g>
|
| 4323 |
</g>
|
| 4324 |
<g id="text_31">
|
| 4325 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="
|
| 4326 |
</g>
|
| 4327 |
</g>
|
| 4328 |
<g id="ytick_8">
|
| 4329 |
<g id="grid-y--9" class="grid grid-y">
|
| 4330 |
-
<path d="M 47.72 66.
|
| 4331 |
</g>
|
| 4332 |
<g id="line2d_32">
|
| 4333 |
<g>
|
| 4334 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="66.
|
| 4335 |
</g>
|
| 4336 |
</g>
|
| 4337 |
<g id="text_32">
|
| 4338 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="
|
| 4339 |
</g>
|
| 4340 |
</g>
|
| 4341 |
<g id="label--y" class="ylabel">
|
|
@@ -4343,67 +4343,67 @@ body[data-tool="eraser"] .main-content {
|
|
| 4343 |
</g>
|
| 4344 |
</g>
|
| 4345 |
<g id="series--hf-kernels-rotary" class="series">
|
| 4346 |
-
<path d="M 82.966497 405.060892 L 113.615625
|
| 4347 |
<defs>
|
| 4348 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4349 |
</defs>
|
| 4350 |
<g clip-path="url(#p088c925177)">
|
| 4351 |
<use ns4:href="#md7efaf3aec" x="82.966497" y="405.060892" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4352 |
-
<use ns4:href="#md7efaf3aec" x="113.615625" y="
|
| 4353 |
-
<use ns4:href="#md7efaf3aec" x="144.264753" y="
|
| 4354 |
-
<use ns4:href="#md7efaf3aec" x="174.913881" y="
|
| 4355 |
-
<use ns4:href="#md7efaf3aec" x="205.563009" y="
|
| 4356 |
-
<use ns4:href="#md7efaf3aec" x="236.212137" y="
|
| 4357 |
-
<use ns4:href="#md7efaf3aec" x="266.861265" y="
|
| 4358 |
-
<use ns4:href="#md7efaf3aec" x="297.510393" y="
|
| 4359 |
-
<use ns4:href="#md7efaf3aec" x="328.159521" y="
|
| 4360 |
-
<use ns4:href="#md7efaf3aec" x="358.808648" y="
|
| 4361 |
-
<use ns4:href="#md7efaf3aec" x="389.457776" y="
|
| 4362 |
-
<use ns4:href="#md7efaf3aec" x="420.106904" y="
|
| 4363 |
-
<use ns4:href="#md7efaf3aec" x="450.756032" y="
|
| 4364 |
-
<use ns4:href="#md7efaf3aec" x="481.40516" y="
|
| 4365 |
-
<use ns4:href="#md7efaf3aec" x="512.054288" y="
|
| 4366 |
-
<use ns4:href="#md7efaf3aec" x="542.703416" y="
|
| 4367 |
-
<use ns4:href="#md7efaf3aec" x="573.352544" y="
|
| 4368 |
-
<use ns4:href="#md7efaf3aec" x="604.001672" y="
|
| 4369 |
-
<use ns4:href="#md7efaf3aec" x="634.6508" y="
|
| 4370 |
-
<use ns4:href="#md7efaf3aec" x="665.299928" y="
|
| 4371 |
-
<use ns4:href="#md7efaf3aec" x="695.949056" y="
|
| 4372 |
-
<use ns4:href="#md7efaf3aec" x="726.598184" y="
|
| 4373 |
-
<use ns4:href="#md7efaf3aec" x="757.247312" y="
|
| 4374 |
<use ns4:href="#md7efaf3aec" x="787.896439" y="44.888614" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4375 |
</g>
|
| 4376 |
</g>
|
| 4377 |
<g id="series--torch-eager" class="series">
|
| 4378 |
-
<path d="M 82.966497
|
| 4379 |
<defs>
|
| 4380 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4381 |
</defs>
|
| 4382 |
<g clip-path="url(#p088c925177)">
|
| 4383 |
-
<use ns4:href="#m9b8c54d372" x="82.966497" y="
|
| 4384 |
-
<use ns4:href="#m9b8c54d372" x="113.615625" y="
|
| 4385 |
-
<use ns4:href="#m9b8c54d372" x="144.264753" y="
|
| 4386 |
-
<use ns4:href="#m9b8c54d372" x="174.913881" y="
|
| 4387 |
-
<use ns4:href="#m9b8c54d372" x="205.563009" y="
|
| 4388 |
-
<use ns4:href="#m9b8c54d372" x="236.212137" y="
|
| 4389 |
-
<use ns4:href="#m9b8c54d372" x="266.861265" y="
|
| 4390 |
-
<use ns4:href="#m9b8c54d372" x="297.510393" y="
|
| 4391 |
-
<use ns4:href="#m9b8c54d372" x="328.159521" y="
|
| 4392 |
-
<use ns4:href="#m9b8c54d372" x="358.808648" y="
|
| 4393 |
-
<use ns4:href="#m9b8c54d372" x="389.457776" y="
|
| 4394 |
-
<use ns4:href="#m9b8c54d372" x="420.106904" y="
|
| 4395 |
-
<use ns4:href="#m9b8c54d372" x="450.756032" y="
|
| 4396 |
-
<use ns4:href="#m9b8c54d372" x="481.40516" y="
|
| 4397 |
-
<use ns4:href="#m9b8c54d372" x="512.054288" y="
|
| 4398 |
-
<use ns4:href="#m9b8c54d372" x="542.703416" y="
|
| 4399 |
-
<use ns4:href="#m9b8c54d372" x="573.352544" y="
|
| 4400 |
-
<use ns4:href="#m9b8c54d372" x="604.001672" y="
|
| 4401 |
-
<use ns4:href="#m9b8c54d372" x="634.6508" y="
|
| 4402 |
-
<use ns4:href="#m9b8c54d372" x="665.299928" y="
|
| 4403 |
-
<use ns4:href="#m9b8c54d372" x="695.949056" y="
|
| 4404 |
-
<use ns4:href="#m9b8c54d372" x="726.598184" y="
|
| 4405 |
-
<use ns4:href="#m9b8c54d372" x="757.247312" y="
|
| 4406 |
-
<use ns4:href="#m9b8c54d372" x="787.896439" y="
|
| 4407 |
</g>
|
| 4408 |
</g>
|
| 4409 |
<g id="patch_3">
|
|
@@ -4461,7 +4461,7 @@ body[data-tool="eraser"] .main-content {
|
|
| 4461 |
<span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
|
| 4462 |
<span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4463 |
</span> |
|
| 4464 |
-
Cell: combine | 4.
|
| 4465 |
| <button class="run-btn" onclick="runCell('combine')">▶ run</button>
|
| 4466 |
<button class="copy-btn" onclick="copyCell('combine')">Copy</button>
|
| 4467 |
<a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -4551,7 +4551,7 @@ impl wl p50(ms) ok
|
|
| 4551 |
hf_kernels_rotary cuda_B1_S128_H32_D128_R64 0.09 True
|
| 4552 |
hf_kernels_rotary cuda_B1_S128_H32_D64_R32 0.09 True
|
| 4553 |
hf_kernels_rotary cuda_B1_S128_H8_D128_R64 0.09 True
|
| 4554 |
-
hf_kernels_rotary cuda_B1_S128_H8_D64_R32 0.
|
| 4555 |
hf_kernels_rotary cuda_B1_S2048_H32_D128_R64 0.26 True
|
| 4556 |
hf_kernels_rotary cuda_B1_S2048_H32_D64_R32 0.09 True
|
| 4557 |
hf_kernels_rotary cuda_B1_S2048_H8_D128_R64 0.09 True
|
|
@@ -4573,29 +4573,29 @@ hf_kernels_rotary cuda_B2_S512_H32_D64_R32 0.09 True
|
|
| 4573 |
hf_kernels_rotary cuda_B2_S512_H8_D128_R64 0.09 True
|
| 4574 |
hf_kernels_rotary cuda_B2_S512_H8_D64_R32 0.09 True
|
| 4575 |
torch_eager cuda_B1_S128_H32_D128_R64 0.22 True
|
| 4576 |
-
torch_eager cuda_B1_S128_H32_D64_R32 0.
|
| 4577 |
-
torch_eager cuda_B1_S128_H8_D128_R64 0.
|
| 4578 |
torch_eager cuda_B1_S128_H8_D64_R32 0.18 True
|
| 4579 |
-
torch_eager cuda_B1_S2048_H32_D128_R64 0.
|
| 4580 |
-
torch_eager cuda_B1_S2048_H32_D64_R32 0.
|
| 4581 |
-
torch_eager cuda_B1_S2048_H8_D128_R64 0.
|
| 4582 |
-
torch_eager cuda_B1_S2048_H8_D64_R32 0.
|
| 4583 |
-
torch_eager cuda_B1_S512_H32_D128_R64 0.
|
| 4584 |
-
torch_eager cuda_B1_S512_H32_D64_R32 0.
|
| 4585 |
-
torch_eager cuda_B1_S512_H8_D128_R64 0.
|
| 4586 |
-
torch_eager cuda_B1_S512_H8_D64_R32 0.
|
| 4587 |
torch_eager cuda_B2_S128_H32_D128_R64 0.22 True
|
| 4588 |
torch_eager cuda_B2_S128_H32_D64_R32 0.22 True
|
| 4589 |
torch_eager cuda_B2_S128_H8_D128_R64 0.22 True
|
| 4590 |
-
torch_eager cuda_B2_S128_H8_D64_R32 0.
|
| 4591 |
torch_eager cuda_B2_S2048_H32_D128_R64 0.65 True
|
| 4592 |
torch_eager cuda_B2_S2048_H32_D64_R32 0.23 True
|
| 4593 |
-
torch_eager cuda_B2_S2048_H8_D128_R64 0.
|
| 4594 |
torch_eager cuda_B2_S2048_H8_D64_R32 0.22 True
|
| 4595 |
-
torch_eager cuda_B2_S512_H32_D128_R64 0.
|
| 4596 |
-
torch_eager cuda_B2_S512_H32_D64_R32 0.
|
| 4597 |
torch_eager cuda_B2_S512_H8_D128_R64 0.22 True
|
| 4598 |
-
torch_eager cuda_B2_S512_H8_D64_R32 0.
|
| 4599 |
|
| 4600 |
GENERATING COMBINED VISUALIZATION
|
| 4601 |
|
|
@@ -4615,7 +4615,7 @@ Implementations included:
|
|
| 4615 |
<div class="uv-install-logs" id="uv-logs-combine">
|
| 4616 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4617 |
<div class="uv-logs-content" style="display: none;">
|
| 4618 |
-
Installed 37 packages in
|
| 4619 |
</div>
|
| 4620 |
</div>
|
| 4621 |
<div class="cell-artifacts">
|
|
@@ -4628,11 +4628,11 @@ Installed 37 packages in 282ms
|
|
| 4628 |
<rdf:RDF>
|
| 4629 |
<ns2:Work>
|
| 4630 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 4631 |
-
<dc:date>2025-
|
| 4632 |
<dc:format>image/svg+xml</dc:format>
|
| 4633 |
<dc:creator>
|
| 4634 |
<ns2:Agent>
|
| 4635 |
-
<dc:title>Matplotlib v3.10.
|
| 4636 |
</ns2:Agent>
|
| 4637 |
</dc:creator>
|
| 4638 |
</ns2:Work>
|
|
@@ -4972,109 +4972,109 @@ Installed 37 packages in 282ms
|
|
| 4972 |
<g id="matplotlib.axis_2">
|
| 4973 |
<g id="ytick_1">
|
| 4974 |
<g id="grid-y--2" class="grid grid-y">
|
| 4975 |
-
<path d="M 47.72
|
| 4976 |
</g>
|
| 4977 |
<g id="line2d_25">
|
| 4978 |
<defs>
|
| 4979 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4980 |
</defs>
|
| 4981 |
<g>
|
| 4982 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="
|
| 4983 |
</g>
|
| 4984 |
</g>
|
| 4985 |
<g id="text_25">
|
| 4986 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="
|
| 4987 |
</g>
|
| 4988 |
</g>
|
| 4989 |
<g id="ytick_2">
|
| 4990 |
<g id="grid-y--3" class="grid grid-y">
|
| 4991 |
-
<path d="M 47.72
|
| 4992 |
</g>
|
| 4993 |
<g id="line2d_26">
|
| 4994 |
<g>
|
| 4995 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="
|
| 4996 |
</g>
|
| 4997 |
</g>
|
| 4998 |
<g id="text_26">
|
| 4999 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="
|
| 5000 |
</g>
|
| 5001 |
</g>
|
| 5002 |
<g id="ytick_3">
|
| 5003 |
<g id="grid-y--4" class="grid grid-y">
|
| 5004 |
-
<path d="M 47.72
|
| 5005 |
</g>
|
| 5006 |
<g id="line2d_27">
|
| 5007 |
<g>
|
| 5008 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="
|
| 5009 |
</g>
|
| 5010 |
</g>
|
| 5011 |
<g id="text_27">
|
| 5012 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="
|
| 5013 |
</g>
|
| 5014 |
</g>
|
| 5015 |
<g id="ytick_4">
|
| 5016 |
<g id="grid-y--5" class="grid grid-y">
|
| 5017 |
-
<path d="M 47.72 253.
|
| 5018 |
</g>
|
| 5019 |
<g id="line2d_28">
|
| 5020 |
<g>
|
| 5021 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="253.
|
| 5022 |
</g>
|
| 5023 |
</g>
|
| 5024 |
<g id="text_28">
|
| 5025 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="
|
| 5026 |
</g>
|
| 5027 |
</g>
|
| 5028 |
<g id="ytick_5">
|
| 5029 |
<g id="grid-y--6" class="grid grid-y">
|
| 5030 |
-
<path d="M 47.72 206.
|
| 5031 |
</g>
|
| 5032 |
<g id="line2d_29">
|
| 5033 |
<g>
|
| 5034 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="206.
|
| 5035 |
</g>
|
| 5036 |
</g>
|
| 5037 |
<g id="text_29">
|
| 5038 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="210.
|
| 5039 |
</g>
|
| 5040 |
</g>
|
| 5041 |
<g id="ytick_6">
|
| 5042 |
<g id="grid-y--7" class="grid grid-y">
|
| 5043 |
-
<path d="M 47.72 159.
|
| 5044 |
</g>
|
| 5045 |
<g id="line2d_30">
|
| 5046 |
<g>
|
| 5047 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="159.
|
| 5048 |
</g>
|
| 5049 |
</g>
|
| 5050 |
<g id="text_30">
|
| 5051 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="163.
|
| 5052 |
</g>
|
| 5053 |
</g>
|
| 5054 |
<g id="ytick_7">
|
| 5055 |
<g id="grid-y--8" class="grid grid-y">
|
| 5056 |
-
<path d="M 47.72
|
| 5057 |
</g>
|
| 5058 |
<g id="line2d_31">
|
| 5059 |
<g>
|
| 5060 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="
|
| 5061 |
</g>
|
| 5062 |
</g>
|
| 5063 |
<g id="text_31">
|
| 5064 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="
|
| 5065 |
</g>
|
| 5066 |
</g>
|
| 5067 |
<g id="ytick_8">
|
| 5068 |
<g id="grid-y--9" class="grid grid-y">
|
| 5069 |
-
<path d="M 47.72 66.
|
| 5070 |
</g>
|
| 5071 |
<g id="line2d_32">
|
| 5072 |
<g>
|
| 5073 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="66.
|
| 5074 |
</g>
|
| 5075 |
</g>
|
| 5076 |
<g id="text_32">
|
| 5077 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="
|
| 5078 |
</g>
|
| 5079 |
</g>
|
| 5080 |
<g id="label--y" class="ylabel">
|
|
@@ -5082,67 +5082,67 @@ Installed 37 packages in 282ms
|
|
| 5082 |
</g>
|
| 5083 |
</g>
|
| 5084 |
<g id="series--hf-kernels-rotary" class="series">
|
| 5085 |
-
<path d="M 82.966497 405.060892 L 113.615625
|
| 5086 |
<defs>
|
| 5087 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 5088 |
</defs>
|
| 5089 |
<g clip-path="url(#p088c925177)">
|
| 5090 |
<use ns4:href="#md7efaf3aec" x="82.966497" y="405.060892" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5091 |
-
<use ns4:href="#md7efaf3aec" x="113.615625" y="
|
| 5092 |
-
<use ns4:href="#md7efaf3aec" x="144.264753" y="
|
| 5093 |
-
<use ns4:href="#md7efaf3aec" x="174.913881" y="
|
| 5094 |
-
<use ns4:href="#md7efaf3aec" x="205.563009" y="
|
| 5095 |
-
<use ns4:href="#md7efaf3aec" x="236.212137" y="
|
| 5096 |
-
<use ns4:href="#md7efaf3aec" x="266.861265" y="
|
| 5097 |
-
<use ns4:href="#md7efaf3aec" x="297.510393" y="
|
| 5098 |
-
<use ns4:href="#md7efaf3aec" x="328.159521" y="
|
| 5099 |
-
<use ns4:href="#md7efaf3aec" x="358.808648" y="
|
| 5100 |
-
<use ns4:href="#md7efaf3aec" x="389.457776" y="
|
| 5101 |
-
<use ns4:href="#md7efaf3aec" x="420.106904" y="
|
| 5102 |
-
<use ns4:href="#md7efaf3aec" x="450.756032" y="
|
| 5103 |
-
<use ns4:href="#md7efaf3aec" x="481.40516" y="
|
| 5104 |
-
<use ns4:href="#md7efaf3aec" x="512.054288" y="
|
| 5105 |
-
<use ns4:href="#md7efaf3aec" x="542.703416" y="
|
| 5106 |
-
<use ns4:href="#md7efaf3aec" x="573.352544" y="
|
| 5107 |
-
<use ns4:href="#md7efaf3aec" x="604.001672" y="
|
| 5108 |
-
<use ns4:href="#md7efaf3aec" x="634.6508" y="
|
| 5109 |
-
<use ns4:href="#md7efaf3aec" x="665.299928" y="
|
| 5110 |
-
<use ns4:href="#md7efaf3aec" x="695.949056" y="
|
| 5111 |
-
<use ns4:href="#md7efaf3aec" x="726.598184" y="
|
| 5112 |
-
<use ns4:href="#md7efaf3aec" x="757.247312" y="
|
| 5113 |
<use ns4:href="#md7efaf3aec" x="787.896439" y="44.888614" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5114 |
</g>
|
| 5115 |
</g>
|
| 5116 |
<g id="series--torch-eager" class="series">
|
| 5117 |
-
<path d="M 82.966497
|
| 5118 |
<defs>
|
| 5119 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 5120 |
</defs>
|
| 5121 |
<g clip-path="url(#p088c925177)">
|
| 5122 |
-
<use ns4:href="#m9b8c54d372" x="82.966497" y="
|
| 5123 |
-
<use ns4:href="#m9b8c54d372" x="113.615625" y="
|
| 5124 |
-
<use ns4:href="#m9b8c54d372" x="144.264753" y="
|
| 5125 |
-
<use ns4:href="#m9b8c54d372" x="174.913881" y="
|
| 5126 |
-
<use ns4:href="#m9b8c54d372" x="205.563009" y="
|
| 5127 |
-
<use ns4:href="#m9b8c54d372" x="236.212137" y="
|
| 5128 |
-
<use ns4:href="#m9b8c54d372" x="266.861265" y="
|
| 5129 |
-
<use ns4:href="#m9b8c54d372" x="297.510393" y="
|
| 5130 |
-
<use ns4:href="#m9b8c54d372" x="328.159521" y="
|
| 5131 |
-
<use ns4:href="#m9b8c54d372" x="358.808648" y="
|
| 5132 |
-
<use ns4:href="#m9b8c54d372" x="389.457776" y="
|
| 5133 |
-
<use ns4:href="#m9b8c54d372" x="420.106904" y="
|
| 5134 |
-
<use ns4:href="#m9b8c54d372" x="450.756032" y="
|
| 5135 |
-
<use ns4:href="#m9b8c54d372" x="481.40516" y="
|
| 5136 |
-
<use ns4:href="#m9b8c54d372" x="512.054288" y="
|
| 5137 |
-
<use ns4:href="#m9b8c54d372" x="542.703416" y="
|
| 5138 |
-
<use ns4:href="#m9b8c54d372" x="573.352544" y="
|
| 5139 |
-
<use ns4:href="#m9b8c54d372" x="604.001672" y="
|
| 5140 |
-
<use ns4:href="#m9b8c54d372" x="634.6508" y="
|
| 5141 |
-
<use ns4:href="#m9b8c54d372" x="665.299928" y="
|
| 5142 |
-
<use ns4:href="#m9b8c54d372" x="695.949056" y="
|
| 5143 |
-
<use ns4:href="#m9b8c54d372" x="726.598184" y="
|
| 5144 |
-
<use ns4:href="#m9b8c54d372" x="757.247312" y="
|
| 5145 |
-
<use ns4:href="#m9b8c54d372" x="787.896439" y="
|
| 5146 |
</g>
|
| 5147 |
</g>
|
| 5148 |
<g id="patch_3">
|
|
|
|
| 3874 |
<div class="system-info">
|
| 3875 |
<div class="system-info-header">Generated on:</div>
|
| 3876 |
<div class="system-info-content">
|
| 3877 |
+
Linux x86_64 | Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35
|
| 3878 |
</div>
|
| 3879 |
</div>
|
| 3880 |
|
|
|
|
| 3889 |
<rdf:RDF>
|
| 3890 |
<ns2:Work>
|
| 3891 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 3892 |
+
<dc:date>2025-12-19T19:09:41.164726</dc:date>
|
| 3893 |
<dc:format>image/svg+xml</dc:format>
|
| 3894 |
<dc:creator>
|
| 3895 |
<ns2:Agent>
|
| 3896 |
+
<dc:title>Matplotlib v3.10.8, https://matplotlib.org/</dc:title>
|
| 3897 |
</ns2:Agent>
|
| 3898 |
</dc:creator>
|
| 3899 |
</ns2:Work>
|
|
|
|
| 4233 |
<g id="matplotlib.axis_2">
|
| 4234 |
<g id="ytick_1">
|
| 4235 |
<g id="grid-y--2" class="grid grid-y">
|
| 4236 |
+
<path d="M 47.72 394.065769 L 823.142937 394.065769 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4237 |
</g>
|
| 4238 |
<g id="line2d_25">
|
| 4239 |
<defs>
|
| 4240 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4241 |
</defs>
|
| 4242 |
<g>
|
| 4243 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="394.065769" style="stroke: #000000; stroke-width: 0.8" />
|
| 4244 |
</g>
|
| 4245 |
</g>
|
| 4246 |
<g id="text_25">
|
| 4247 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="397.864988" transform="rotate(-0 40.72 397.864988)">0.1</text>
|
| 4248 |
</g>
|
| 4249 |
</g>
|
| 4250 |
<g id="ytick_2">
|
| 4251 |
<g id="grid-y--3" class="grid grid-y">
|
| 4252 |
+
<path d="M 47.72 347.214212 L 823.142937 347.214212 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4253 |
</g>
|
| 4254 |
<g id="line2d_26">
|
| 4255 |
<g>
|
| 4256 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="347.214212" style="stroke: #000000; stroke-width: 0.8" />
|
| 4257 |
</g>
|
| 4258 |
</g>
|
| 4259 |
<g id="text_26">
|
| 4260 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="351.013431" transform="rotate(-0 40.72 351.013431)">0.2</text>
|
| 4261 |
</g>
|
| 4262 |
</g>
|
| 4263 |
<g id="ytick_3">
|
| 4264 |
<g id="grid-y--4" class="grid grid-y">
|
| 4265 |
+
<path d="M 47.72 300.362656 L 823.142937 300.362656 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4266 |
</g>
|
| 4267 |
<g id="line2d_27">
|
| 4268 |
<g>
|
| 4269 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="300.362656" style="stroke: #000000; stroke-width: 0.8" />
|
| 4270 |
</g>
|
| 4271 |
</g>
|
| 4272 |
<g id="text_27">
|
| 4273 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="304.161875" transform="rotate(-0 40.72 304.161875)">0.3</text>
|
| 4274 |
</g>
|
| 4275 |
</g>
|
| 4276 |
<g id="ytick_4">
|
| 4277 |
<g id="grid-y--5" class="grid grid-y">
|
| 4278 |
+
<path d="M 47.72 253.511099 L 823.142937 253.511099 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4279 |
</g>
|
| 4280 |
<g id="line2d_28">
|
| 4281 |
<g>
|
| 4282 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="253.511099" style="stroke: #000000; stroke-width: 0.8" />
|
| 4283 |
</g>
|
| 4284 |
</g>
|
| 4285 |
<g id="text_28">
|
| 4286 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="257.310318" transform="rotate(-0 40.72 257.310318)">0.4</text>
|
| 4287 |
</g>
|
| 4288 |
</g>
|
| 4289 |
<g id="ytick_5">
|
| 4290 |
<g id="grid-y--6" class="grid grid-y">
|
| 4291 |
+
<path d="M 47.72 206.659543 L 823.142937 206.659543 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4292 |
</g>
|
| 4293 |
<g id="line2d_29">
|
| 4294 |
<g>
|
| 4295 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="206.659543" style="stroke: #000000; stroke-width: 0.8" />
|
| 4296 |
</g>
|
| 4297 |
</g>
|
| 4298 |
<g id="text_29">
|
| 4299 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="210.458761" transform="rotate(-0 40.72 210.458761)">0.5</text>
|
| 4300 |
</g>
|
| 4301 |
</g>
|
| 4302 |
<g id="ytick_6">
|
| 4303 |
<g id="grid-y--7" class="grid grid-y">
|
| 4304 |
+
<path d="M 47.72 159.807986 L 823.142937 159.807986 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4305 |
</g>
|
| 4306 |
<g id="line2d_30">
|
| 4307 |
<g>
|
| 4308 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="159.807986" style="stroke: #000000; stroke-width: 0.8" />
|
| 4309 |
</g>
|
| 4310 |
</g>
|
| 4311 |
<g id="text_30">
|
| 4312 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="163.607205" transform="rotate(-0 40.72 163.607205)">0.6</text>
|
| 4313 |
</g>
|
| 4314 |
</g>
|
| 4315 |
<g id="ytick_7">
|
| 4316 |
<g id="grid-y--8" class="grid grid-y">
|
| 4317 |
+
<path d="M 47.72 112.956429 L 823.142937 112.956429 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4318 |
</g>
|
| 4319 |
<g id="line2d_31">
|
| 4320 |
<g>
|
| 4321 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="112.956429" style="stroke: #000000; stroke-width: 0.8" />
|
| 4322 |
</g>
|
| 4323 |
</g>
|
| 4324 |
<g id="text_31">
|
| 4325 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="116.755648" transform="rotate(-0 40.72 116.755648)">0.7</text>
|
| 4326 |
</g>
|
| 4327 |
</g>
|
| 4328 |
<g id="ytick_8">
|
| 4329 |
<g id="grid-y--9" class="grid grid-y">
|
| 4330 |
+
<path d="M 47.72 66.104873 L 823.142937 66.104873 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4331 |
</g>
|
| 4332 |
<g id="line2d_32">
|
| 4333 |
<g>
|
| 4334 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="66.104873" style="stroke: #000000; stroke-width: 0.8" />
|
| 4335 |
</g>
|
| 4336 |
</g>
|
| 4337 |
<g id="text_32">
|
| 4338 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="69.904092" transform="rotate(-0 40.72 69.904092)">0.8</text>
|
| 4339 |
</g>
|
| 4340 |
</g>
|
| 4341 |
<g id="label--y" class="ylabel">
|
|
|
|
| 4343 |
</g>
|
| 4344 |
</g>
|
| 4345 |
<g id="series--hf-kernels-rotary" class="series">
|
| 4346 |
+
<path d="M 82.966497 405.060892 L 113.615625 399.43402 L 144.264753 400.029504 L 174.913881 399.696858 L 205.563009 399.059208 L 236.212137 400.038874 L 266.861265 399.640636 L 297.510393 400.038405 L 328.159521 400.006078 L 358.808648 399.691704 L 389.457776 399.640167 L 420.106904 318.131109 L 450.756032 400.455853 L 481.40516 400.197701 L 512.054288 399.907221 L 542.703416 399.860838 L 573.352544 400.20754 L 604.001672 400.567828 L 634.6508 399.780722 L 665.299928 400.403848 L 695.949056 399.312675 L 726.598184 400.328885 L 757.247312 320.371082 L 787.896439 44.888614 " clip-path="url(#p088c925177)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 4347 |
<defs>
|
| 4348 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4349 |
</defs>
|
| 4350 |
<g clip-path="url(#p088c925177)">
|
| 4351 |
<use ns4:href="#md7efaf3aec" x="82.966497" y="405.060892" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4352 |
+
<use ns4:href="#md7efaf3aec" x="113.615625" y="399.43402" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4353 |
+
<use ns4:href="#md7efaf3aec" x="144.264753" y="400.029504" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4354 |
+
<use ns4:href="#md7efaf3aec" x="174.913881" y="399.696858" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4355 |
+
<use ns4:href="#md7efaf3aec" x="205.563009" y="399.059208" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4356 |
+
<use ns4:href="#md7efaf3aec" x="236.212137" y="400.038874" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4357 |
+
<use ns4:href="#md7efaf3aec" x="266.861265" y="399.640636" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4358 |
+
<use ns4:href="#md7efaf3aec" x="297.510393" y="400.038405" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4359 |
+
<use ns4:href="#md7efaf3aec" x="328.159521" y="400.006078" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4360 |
+
<use ns4:href="#md7efaf3aec" x="358.808648" y="399.691704" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4361 |
+
<use ns4:href="#md7efaf3aec" x="389.457776" y="399.640167" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4362 |
+
<use ns4:href="#md7efaf3aec" x="420.106904" y="318.131109" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4363 |
+
<use ns4:href="#md7efaf3aec" x="450.756032" y="400.455853" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4364 |
+
<use ns4:href="#md7efaf3aec" x="481.40516" y="400.197701" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4365 |
+
<use ns4:href="#md7efaf3aec" x="512.054288" y="399.907221" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4366 |
+
<use ns4:href="#md7efaf3aec" x="542.703416" y="399.860838" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4367 |
+
<use ns4:href="#md7efaf3aec" x="573.352544" y="400.20754" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4368 |
+
<use ns4:href="#md7efaf3aec" x="604.001672" y="400.567828" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4369 |
+
<use ns4:href="#md7efaf3aec" x="634.6508" y="399.780722" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4370 |
+
<use ns4:href="#md7efaf3aec" x="665.299928" y="400.403848" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4371 |
+
<use ns4:href="#md7efaf3aec" x="695.949056" y="399.312675" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4372 |
+
<use ns4:href="#md7efaf3aec" x="726.598184" y="400.328885" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4373 |
+
<use ns4:href="#md7efaf3aec" x="757.247312" y="320.371082" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4374 |
<use ns4:href="#md7efaf3aec" x="787.896439" y="44.888614" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4375 |
</g>
|
| 4376 |
</g>
|
| 4377 |
<g id="series--torch-eager" class="series">
|
| 4378 |
+
<path d="M 82.966497 358.443125 L 113.615625 339.472461 L 144.264753 340.171018 L 174.913881 340.100272 L 205.563009 340.151809 L 236.212137 340.65312 L 266.861265 340.953439 L 297.510393 340.452127 L 328.159521 340.709811 L 358.808648 340.704657 L 389.457776 340.901902 L 420.106904 336.056983 L 450.756032 340.784305 L 481.40516 339.590059 L 512.054288 339.749354 L 542.703416 339.036742 L 573.352544 338.708781 L 604.001672 339.515096 L 634.6508 339.566633 L 665.299928 340.438072 L 695.949056 339.247574 L 726.598184 340.442288 L 757.247312 333.026156 L 787.896439 137.089198 " clip-path="url(#p088c925177)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 4379 |
<defs>
|
| 4380 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4381 |
</defs>
|
| 4382 |
<g clip-path="url(#p088c925177)">
|
| 4383 |
+
<use ns4:href="#m9b8c54d372" x="82.966497" y="358.443125" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4384 |
+
<use ns4:href="#m9b8c54d372" x="113.615625" y="339.472461" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4385 |
+
<use ns4:href="#m9b8c54d372" x="144.264753" y="340.171018" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4386 |
+
<use ns4:href="#m9b8c54d372" x="174.913881" y="340.100272" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4387 |
+
<use ns4:href="#m9b8c54d372" x="205.563009" y="340.151809" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4388 |
+
<use ns4:href="#m9b8c54d372" x="236.212137" y="340.65312" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4389 |
+
<use ns4:href="#m9b8c54d372" x="266.861265" y="340.953439" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4390 |
+
<use ns4:href="#m9b8c54d372" x="297.510393" y="340.452127" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4391 |
+
<use ns4:href="#m9b8c54d372" x="328.159521" y="340.709811" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4392 |
+
<use ns4:href="#m9b8c54d372" x="358.808648" y="340.704657" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4393 |
+
<use ns4:href="#m9b8c54d372" x="389.457776" y="340.901902" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4394 |
+
<use ns4:href="#m9b8c54d372" x="420.106904" y="336.056983" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4395 |
+
<use ns4:href="#m9b8c54d372" x="450.756032" y="340.784305" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4396 |
+
<use ns4:href="#m9b8c54d372" x="481.40516" y="339.590059" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4397 |
+
<use ns4:href="#m9b8c54d372" x="512.054288" y="339.749354" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4398 |
+
<use ns4:href="#m9b8c54d372" x="542.703416" y="339.036742" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4399 |
+
<use ns4:href="#m9b8c54d372" x="573.352544" y="338.708781" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4400 |
+
<use ns4:href="#m9b8c54d372" x="604.001672" y="339.515096" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4401 |
+
<use ns4:href="#m9b8c54d372" x="634.6508" y="339.566633" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4402 |
+
<use ns4:href="#m9b8c54d372" x="665.299928" y="340.438072" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4403 |
+
<use ns4:href="#m9b8c54d372" x="695.949056" y="339.247574" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4404 |
+
<use ns4:href="#m9b8c54d372" x="726.598184" y="340.442288" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4405 |
+
<use ns4:href="#m9b8c54d372" x="757.247312" y="333.026156" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4406 |
+
<use ns4:href="#m9b8c54d372" x="787.896439" y="137.089198" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4407 |
</g>
|
| 4408 |
</g>
|
| 4409 |
<g id="patch_3">
|
|
|
|
| 4461 |
<span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
|
| 4462 |
<span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4463 |
</span> |
|
| 4464 |
+
Cell: combine | 4.85s
|
| 4465 |
| <button class="run-btn" onclick="runCell('combine')">▶ run</button>
|
| 4466 |
<button class="copy-btn" onclick="copyCell('combine')">Copy</button>
|
| 4467 |
<a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 4551 |
hf_kernels_rotary cuda_B1_S128_H32_D128_R64 0.09 True
|
| 4552 |
hf_kernels_rotary cuda_B1_S128_H32_D64_R32 0.09 True
|
| 4553 |
hf_kernels_rotary cuda_B1_S128_H8_D128_R64 0.09 True
|
| 4554 |
+
hf_kernels_rotary cuda_B1_S128_H8_D64_R32 0.08 True
|
| 4555 |
hf_kernels_rotary cuda_B1_S2048_H32_D128_R64 0.26 True
|
| 4556 |
hf_kernels_rotary cuda_B1_S2048_H32_D64_R32 0.09 True
|
| 4557 |
hf_kernels_rotary cuda_B1_S2048_H8_D128_R64 0.09 True
|
|
|
|
| 4573 |
hf_kernels_rotary cuda_B2_S512_H8_D128_R64 0.09 True
|
| 4574 |
hf_kernels_rotary cuda_B2_S512_H8_D64_R32 0.09 True
|
| 4575 |
torch_eager cuda_B1_S128_H32_D128_R64 0.22 True
|
| 4576 |
+
torch_eager cuda_B1_S128_H32_D64_R32 0.22 True
|
| 4577 |
+
torch_eager cuda_B1_S128_H8_D128_R64 0.22 True
|
| 4578 |
torch_eager cuda_B1_S128_H8_D64_R32 0.18 True
|
| 4579 |
+
torch_eager cuda_B1_S2048_H32_D128_R64 0.22 True
|
| 4580 |
+
torch_eager cuda_B1_S2048_H32_D64_R32 0.21 True
|
| 4581 |
+
torch_eager cuda_B1_S2048_H8_D128_R64 0.21 True
|
| 4582 |
+
torch_eager cuda_B1_S2048_H8_D64_R32 0.21 True
|
| 4583 |
+
torch_eager cuda_B1_S512_H32_D128_R64 0.21 True
|
| 4584 |
+
torch_eager cuda_B1_S512_H32_D64_R32 0.21 True
|
| 4585 |
+
torch_eager cuda_B1_S512_H8_D128_R64 0.21 True
|
| 4586 |
+
torch_eager cuda_B1_S512_H8_D64_R32 0.22 True
|
| 4587 |
torch_eager cuda_B2_S128_H32_D128_R64 0.22 True
|
| 4588 |
torch_eager cuda_B2_S128_H32_D64_R32 0.22 True
|
| 4589 |
torch_eager cuda_B2_S128_H8_D128_R64 0.22 True
|
| 4590 |
+
torch_eager cuda_B2_S128_H8_D64_R32 0.21 True
|
| 4591 |
torch_eager cuda_B2_S2048_H32_D128_R64 0.65 True
|
| 4592 |
torch_eager cuda_B2_S2048_H32_D64_R32 0.23 True
|
| 4593 |
+
torch_eager cuda_B2_S2048_H8_D128_R64 0.21 True
|
| 4594 |
torch_eager cuda_B2_S2048_H8_D64_R32 0.22 True
|
| 4595 |
+
torch_eager cuda_B2_S512_H32_D128_R64 0.21 True
|
| 4596 |
+
torch_eager cuda_B2_S512_H32_D64_R32 0.22 True
|
| 4597 |
torch_eager cuda_B2_S512_H8_D128_R64 0.22 True
|
| 4598 |
+
torch_eager cuda_B2_S512_H8_D64_R32 0.22 True
|
| 4599 |
|
| 4600 |
GENERATING COMBINED VISUALIZATION
|
| 4601 |
|
|
|
|
| 4615 |
<div class="uv-install-logs" id="uv-logs-combine">
|
| 4616 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4617 |
<div class="uv-logs-content" style="display: none;">
|
| 4618 |
+
Installed 37 packages in 330ms
|
| 4619 |
</div>
|
| 4620 |
</div>
|
| 4621 |
<div class="cell-artifacts">
|
|
|
|
| 4628 |
<rdf:RDF>
|
| 4629 |
<ns2:Work>
|
| 4630 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 4631 |
+
<dc:date>2025-12-19T19:09:41.164726</dc:date>
|
| 4632 |
<dc:format>image/svg+xml</dc:format>
|
| 4633 |
<dc:creator>
|
| 4634 |
<ns2:Agent>
|
| 4635 |
+
<dc:title>Matplotlib v3.10.8, https://matplotlib.org/</dc:title>
|
| 4636 |
</ns2:Agent>
|
| 4637 |
</dc:creator>
|
| 4638 |
</ns2:Work>
|
|
|
|
| 4972 |
<g id="matplotlib.axis_2">
|
| 4973 |
<g id="ytick_1">
|
| 4974 |
<g id="grid-y--2" class="grid grid-y">
|
| 4975 |
+
<path d="M 47.72 394.065769 L 823.142937 394.065769 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4976 |
</g>
|
| 4977 |
<g id="line2d_25">
|
| 4978 |
<defs>
|
| 4979 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4980 |
</defs>
|
| 4981 |
<g>
|
| 4982 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="394.065769" style="stroke: #000000; stroke-width: 0.8" />
|
| 4983 |
</g>
|
| 4984 |
</g>
|
| 4985 |
<g id="text_25">
|
| 4986 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="397.864988" transform="rotate(-0 40.72 397.864988)">0.1</text>
|
| 4987 |
</g>
|
| 4988 |
</g>
|
| 4989 |
<g id="ytick_2">
|
| 4990 |
<g id="grid-y--3" class="grid grid-y">
|
| 4991 |
+
<path d="M 47.72 347.214212 L 823.142937 347.214212 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4992 |
</g>
|
| 4993 |
<g id="line2d_26">
|
| 4994 |
<g>
|
| 4995 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="347.214212" style="stroke: #000000; stroke-width: 0.8" />
|
| 4996 |
</g>
|
| 4997 |
</g>
|
| 4998 |
<g id="text_26">
|
| 4999 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="351.013431" transform="rotate(-0 40.72 351.013431)">0.2</text>
|
| 5000 |
</g>
|
| 5001 |
</g>
|
| 5002 |
<g id="ytick_3">
|
| 5003 |
<g id="grid-y--4" class="grid grid-y">
|
| 5004 |
+
<path d="M 47.72 300.362656 L 823.142937 300.362656 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 5005 |
</g>
|
| 5006 |
<g id="line2d_27">
|
| 5007 |
<g>
|
| 5008 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="300.362656" style="stroke: #000000; stroke-width: 0.8" />
|
| 5009 |
</g>
|
| 5010 |
</g>
|
| 5011 |
<g id="text_27">
|
| 5012 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="304.161875" transform="rotate(-0 40.72 304.161875)">0.3</text>
|
| 5013 |
</g>
|
| 5014 |
</g>
|
| 5015 |
<g id="ytick_4">
|
| 5016 |
<g id="grid-y--5" class="grid grid-y">
|
| 5017 |
+
<path d="M 47.72 253.511099 L 823.142937 253.511099 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 5018 |
</g>
|
| 5019 |
<g id="line2d_28">
|
| 5020 |
<g>
|
| 5021 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="253.511099" style="stroke: #000000; stroke-width: 0.8" />
|
| 5022 |
</g>
|
| 5023 |
</g>
|
| 5024 |
<g id="text_28">
|
| 5025 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="257.310318" transform="rotate(-0 40.72 257.310318)">0.4</text>
|
| 5026 |
</g>
|
| 5027 |
</g>
|
| 5028 |
<g id="ytick_5">
|
| 5029 |
<g id="grid-y--6" class="grid grid-y">
|
| 5030 |
+
<path d="M 47.72 206.659543 L 823.142937 206.659543 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 5031 |
</g>
|
| 5032 |
<g id="line2d_29">
|
| 5033 |
<g>
|
| 5034 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="206.659543" style="stroke: #000000; stroke-width: 0.8" />
|
| 5035 |
</g>
|
| 5036 |
</g>
|
| 5037 |
<g id="text_29">
|
| 5038 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="210.458761" transform="rotate(-0 40.72 210.458761)">0.5</text>
|
| 5039 |
</g>
|
| 5040 |
</g>
|
| 5041 |
<g id="ytick_6">
|
| 5042 |
<g id="grid-y--7" class="grid grid-y">
|
| 5043 |
+
<path d="M 47.72 159.807986 L 823.142937 159.807986 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 5044 |
</g>
|
| 5045 |
<g id="line2d_30">
|
| 5046 |
<g>
|
| 5047 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="159.807986" style="stroke: #000000; stroke-width: 0.8" />
|
| 5048 |
</g>
|
| 5049 |
</g>
|
| 5050 |
<g id="text_30">
|
| 5051 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="163.607205" transform="rotate(-0 40.72 163.607205)">0.6</text>
|
| 5052 |
</g>
|
| 5053 |
</g>
|
| 5054 |
<g id="ytick_7">
|
| 5055 |
<g id="grid-y--8" class="grid grid-y">
|
| 5056 |
+
<path d="M 47.72 112.956429 L 823.142937 112.956429 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 5057 |
</g>
|
| 5058 |
<g id="line2d_31">
|
| 5059 |
<g>
|
| 5060 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="112.956429" style="stroke: #000000; stroke-width: 0.8" />
|
| 5061 |
</g>
|
| 5062 |
</g>
|
| 5063 |
<g id="text_31">
|
| 5064 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="116.755648" transform="rotate(-0 40.72 116.755648)">0.7</text>
|
| 5065 |
</g>
|
| 5066 |
</g>
|
| 5067 |
<g id="ytick_8">
|
| 5068 |
<g id="grid-y--9" class="grid grid-y">
|
| 5069 |
+
<path d="M 47.72 66.104873 L 823.142937 66.104873 " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 5070 |
</g>
|
| 5071 |
<g id="line2d_32">
|
| 5072 |
<g>
|
| 5073 |
+
<use ns4:href="#m0fca2865ba" x="47.72" y="66.104873" style="stroke: #000000; stroke-width: 0.8" />
|
| 5074 |
</g>
|
| 5075 |
</g>
|
| 5076 |
<g id="text_32">
|
| 5077 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="69.904092" transform="rotate(-0 40.72 69.904092)">0.8</text>
|
| 5078 |
</g>
|
| 5079 |
</g>
|
| 5080 |
<g id="label--y" class="ylabel">
|
|
|
|
| 5082 |
</g>
|
| 5083 |
</g>
|
| 5084 |
<g id="series--hf-kernels-rotary" class="series">
|
| 5085 |
+
<path d="M 82.966497 405.060892 L 113.615625 399.43402 L 144.264753 400.029504 L 174.913881 399.696858 L 205.563009 399.059208 L 236.212137 400.038874 L 266.861265 399.640636 L 297.510393 400.038405 L 328.159521 400.006078 L 358.808648 399.691704 L 389.457776 399.640167 L 420.106904 318.131109 L 450.756032 400.455853 L 481.40516 400.197701 L 512.054288 399.907221 L 542.703416 399.860838 L 573.352544 400.20754 L 604.001672 400.567828 L 634.6508 399.780722 L 665.299928 400.403848 L 695.949056 399.312675 L 726.598184 400.328885 L 757.247312 320.371082 L 787.896439 44.888614 " clip-path="url(#p088c925177)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 5086 |
<defs>
|
| 5087 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 5088 |
</defs>
|
| 5089 |
<g clip-path="url(#p088c925177)">
|
| 5090 |
<use ns4:href="#md7efaf3aec" x="82.966497" y="405.060892" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5091 |
+
<use ns4:href="#md7efaf3aec" x="113.615625" y="399.43402" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5092 |
+
<use ns4:href="#md7efaf3aec" x="144.264753" y="400.029504" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5093 |
+
<use ns4:href="#md7efaf3aec" x="174.913881" y="399.696858" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5094 |
+
<use ns4:href="#md7efaf3aec" x="205.563009" y="399.059208" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5095 |
+
<use ns4:href="#md7efaf3aec" x="236.212137" y="400.038874" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5096 |
+
<use ns4:href="#md7efaf3aec" x="266.861265" y="399.640636" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5097 |
+
<use ns4:href="#md7efaf3aec" x="297.510393" y="400.038405" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5098 |
+
<use ns4:href="#md7efaf3aec" x="328.159521" y="400.006078" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5099 |
+
<use ns4:href="#md7efaf3aec" x="358.808648" y="399.691704" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5100 |
+
<use ns4:href="#md7efaf3aec" x="389.457776" y="399.640167" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5101 |
+
<use ns4:href="#md7efaf3aec" x="420.106904" y="318.131109" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5102 |
+
<use ns4:href="#md7efaf3aec" x="450.756032" y="400.455853" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5103 |
+
<use ns4:href="#md7efaf3aec" x="481.40516" y="400.197701" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5104 |
+
<use ns4:href="#md7efaf3aec" x="512.054288" y="399.907221" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5105 |
+
<use ns4:href="#md7efaf3aec" x="542.703416" y="399.860838" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5106 |
+
<use ns4:href="#md7efaf3aec" x="573.352544" y="400.20754" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5107 |
+
<use ns4:href="#md7efaf3aec" x="604.001672" y="400.567828" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5108 |
+
<use ns4:href="#md7efaf3aec" x="634.6508" y="399.780722" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5109 |
+
<use ns4:href="#md7efaf3aec" x="665.299928" y="400.403848" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5110 |
+
<use ns4:href="#md7efaf3aec" x="695.949056" y="399.312675" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5111 |
+
<use ns4:href="#md7efaf3aec" x="726.598184" y="400.328885" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5112 |
+
<use ns4:href="#md7efaf3aec" x="757.247312" y="320.371082" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5113 |
<use ns4:href="#md7efaf3aec" x="787.896439" y="44.888614" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 5114 |
</g>
|
| 5115 |
</g>
|
| 5116 |
<g id="series--torch-eager" class="series">
|
| 5117 |
+
<path d="M 82.966497 358.443125 L 113.615625 339.472461 L 144.264753 340.171018 L 174.913881 340.100272 L 205.563009 340.151809 L 236.212137 340.65312 L 266.861265 340.953439 L 297.510393 340.452127 L 328.159521 340.709811 L 358.808648 340.704657 L 389.457776 340.901902 L 420.106904 336.056983 L 450.756032 340.784305 L 481.40516 339.590059 L 512.054288 339.749354 L 542.703416 339.036742 L 573.352544 338.708781 L 604.001672 339.515096 L 634.6508 339.566633 L 665.299928 340.438072 L 695.949056 339.247574 L 726.598184 340.442288 L 757.247312 333.026156 L 787.896439 137.089198 " clip-path="url(#p088c925177)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 5118 |
<defs>
|
| 5119 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 5120 |
</defs>
|
| 5121 |
<g clip-path="url(#p088c925177)">
|
| 5122 |
+
<use ns4:href="#m9b8c54d372" x="82.966497" y="358.443125" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5123 |
+
<use ns4:href="#m9b8c54d372" x="113.615625" y="339.472461" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5124 |
+
<use ns4:href="#m9b8c54d372" x="144.264753" y="340.171018" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5125 |
+
<use ns4:href="#m9b8c54d372" x="174.913881" y="340.100272" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5126 |
+
<use ns4:href="#m9b8c54d372" x="205.563009" y="340.151809" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5127 |
+
<use ns4:href="#m9b8c54d372" x="236.212137" y="340.65312" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5128 |
+
<use ns4:href="#m9b8c54d372" x="266.861265" y="340.953439" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5129 |
+
<use ns4:href="#m9b8c54d372" x="297.510393" y="340.452127" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5130 |
+
<use ns4:href="#m9b8c54d372" x="328.159521" y="340.709811" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5131 |
+
<use ns4:href="#m9b8c54d372" x="358.808648" y="340.704657" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5132 |
+
<use ns4:href="#m9b8c54d372" x="389.457776" y="340.901902" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5133 |
+
<use ns4:href="#m9b8c54d372" x="420.106904" y="336.056983" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5134 |
+
<use ns4:href="#m9b8c54d372" x="450.756032" y="340.784305" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5135 |
+
<use ns4:href="#m9b8c54d372" x="481.40516" y="339.590059" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5136 |
+
<use ns4:href="#m9b8c54d372" x="512.054288" y="339.749354" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5137 |
+
<use ns4:href="#m9b8c54d372" x="542.703416" y="339.036742" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5138 |
+
<use ns4:href="#m9b8c54d372" x="573.352544" y="338.708781" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5139 |
+
<use ns4:href="#m9b8c54d372" x="604.001672" y="339.515096" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5140 |
+
<use ns4:href="#m9b8c54d372" x="634.6508" y="339.566633" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5141 |
+
<use ns4:href="#m9b8c54d372" x="665.299928" y="340.438072" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5142 |
+
<use ns4:href="#m9b8c54d372" x="695.949056" y="339.247574" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5143 |
+
<use ns4:href="#m9b8c54d372" x="726.598184" y="340.442288" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5144 |
+
<use ns4:href="#m9b8c54d372" x="757.247312" y="333.026156" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5145 |
+
<use ns4:href="#m9b8c54d372" x="787.896439" y="137.089198" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 5146 |
</g>
|
| 5147 |
</g>
|
| 5148 |
<g id="patch_3">
|