Upload folder using huggingface_hub
- activation/impls/artifacts/benchmark/activation.jsonl +3 -0
- activation/impls/cells/benchmark.py +2 -2
- activation/impls/compiled_swiglu.html +172 -35
- activation/impls/hf_kernels_swiglu.html +139 -35
- activation/impls/torch_swiglu.html +138 -34
- activation/results/artifacts/combine/latency.svg +3 -0
- activation/results/cells/combine.py +27 -0
- activation/results/combined_results.html +0 -0
- flash_attn/impls/artifacts/benchmark/attn.jsonl +6 -6
- flash_attn/impls/artifacts/benchmark_default/attn_default.jsonl +6 -6
- flash_attn/impls/artifacts/benchmark_max_autotune/attn_max_autotune.jsonl +6 -6
- flash_attn/impls/cells/benchmark.py +11 -13
- flash_attn/impls/cells/benchmark_default.py +2 -2
- flash_attn/impls/cells/benchmark_max_autotune.py +2 -2
- flash_attn/impls/compiled_variants.html +336 -31
- flash_attn/impls/flash_attention.html +262 -34
- flash_attn/impls/hf_kernels_flash_attn.html +214 -29
- flash_attn/impls/hf_kernels_flash_attn3.html +203 -30
- flash_attn/impls/mem_efficient_attention.html +252 -30
- flash_attn/impls/sage_attention.html +130 -31
- flash_attn/impls/xformers.html +219 -31
- flash_attn/results/artifacts/combine/latency.svg +2 -2
- flash_attn/results/cells/combine.py +39 -289
- flash_attn/results/combined_results.html +0 -0
- layer_norm/impls/artifacts/benchmark/ln.jsonl +8 -0
- layer_norm/impls/cells/benchmark.py +2 -2
- layer_norm/impls/hf_kernels_layer_norm.html +78 -31
- layer_norm/impls/torch_layer_norm.html +79 -35
- layer_norm/results/artifacts/combine/latency.svg +3 -0
- layer_norm/results/cells/combine.py +19 -0
- layer_norm/results/combined_results.html +0 -0
activation/impls/artifacts/benchmark/activation.jsonl
ADDED
@@ -0,0 +1,3 @@
+{"ts": "2025-10-23T17:22:01Z", "run": "2868ab5dc1ce4d49ac015295dd5ab8d5", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "triton", "compile": "none"}, "wl": {"name": "llama_T512_D4096", "num_tokens": 512, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.023811000119167147, "p50": 0.024261000135084032, "p90": 0.024421000034635654, "mean": 0.024255200014522416, "iqr": 0.00023000006876827683, "raw_times": [0.024261000135084032, 0.023811000119167147, 0.024190999965867377, 0.024591999817857868, 0.024421000034635654], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03041099989786744, "peak_bytes": 46139392, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.01, "atol": 0.1, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "relmax": 0.0, "ref": "swiglu_bfloat16"}, "err": null}
+{"ts": "2025-10-23T17:22:01Z", "run": "2868ab5dc1ce4d49ac015295dd5ab8d5", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "triton", "compile": "none"}, "wl": {"name": "llama_T512_D8192", "num_tokens": 512, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.030561000130546745, "p50": 0.031221000199366244, "p90": 0.031622000051356736, "mean": 0.031125600116865826, "iqr": 0.001030000021273736, "raw_times": [0.030561000130546745, 0.031221000199366244, 0.030592000030083, 0.031622000051356736, 0.031632000172976404], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03244200001972786, "peak_bytes": 92276736, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.01, "atol": 0.1, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "relmax": 0.0, "ref": "swiglu_bfloat16"}, "err": null}
+{"ts": "2025-10-23T17:22:01Z", "run": "2868ab5dc1ce4d49ac015295dd5ab8d5", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "triton", "compile": "none"}, "wl": {"name": "llama_T512_D11008", "num_tokens": 512, "hidden_dim": 11008, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0339219998295448, "p50": 0.03464199994596129, "p90": 0.0347420000252896, "mean": 0.03469179991952842, "iqr": 0.00024100017981254496, "raw_times": [0.0339219998295448, 0.0347420000252896, 0.03464199994596129, 0.03565199995136936, 0.034500999845477054], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03648099982456188, "peak_bytes": 124520448, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.01, "atol": 0.1, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "relmax": 0.0, "ref": "swiglu_bfloat16"}, "err": null}
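
Each record above is one benchmark run: "wl" names the workload, "lat_ms" holds the latency percentiles, and "ok"/"corr" carry the correctness check against the "swiglu_bfloat16" reference. A minimal Python sketch (assuming the artifact path shown in this commit) for summarizing the records:

import json

# Load the records written by the benchmark cell and print the fields
# the combined-results page aggregates: impl, workload name, p50, ok.
with open("activation/impls/artifacts/benchmark/activation.jsonl") as f:
    records = [json.loads(line) for line in f if line.strip()]

for r in records:
    print(f'{r["impl"]:20s} {r["wl"]["name"]:20s} '
          f'p50={r["lat_ms"]["p50"]:.4f} ms ok={r["ok"]}')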
activation/impls/cells/benchmark.py
CHANGED
@@ -2,13 +2,13 @@
 # requires-python = ">=3.10"
 # dependencies = [
 #   "numpy",
-#   "torch",
+#   "torch==2.8.0",
 #   "kernels-benchmark-tools",
 #   "kernels",
 # ]
 #
 # [tool.uv.sources]
-#   kernels-benchmark-tools = { path = "
+#   kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
 # ///
 import torch
 import sys
activation/impls/compiled_swiglu.html
CHANGED
@@ -706,6 +706,29 @@
 white-space: pre-wrap;
 color: var(--text-primary);
 }
+
+.cell-stdout {
+background: var(--bg-tertiary);
+padding: 0.75rem;
+border-radius: 1px;
+font-family: inherit;
+font-size: 0.9rem;
+color: var(--text-primary);
+
+/* key bits */
+overflow: auto; /* show scrollbars when needed */
+max-width: 100%; /* respects whatever layout width you give it */
+}
+
+.cell-stdout .stdout-text {
+margin: 0; /* reset pre default margin */
+white-space: pre; /* keep line breaks, NO wrapping */
+display: inline-block; /* shrink-to-content */
+min-width: max-content; /* allow very long lines to define intrinsic width */
+font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace;
+tab-size: 2;
+}
+
 .cell-stderr {
 background: var(--bg-error);
 border-left: 2px solid var(--border-error);

@@ -3556,7 +3579,12 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 if(output){
 output.classList.remove('output-stale');
 let html='';
-if(data.stdout)
+if (data.stdout) {
+  html += '<div class="cell-stdout"><pre class="stdout-text">'
+    + escapeHtml(data.stdout)
+    + '</pre></div>';
+}
+
 console.log('UV Logs:', data);
 if(data.stderr) {
 // Split UV logs from regular stderr

@@ -3678,27 +3706,27 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 }
 }

-// Live reload functionality (robust SSE handling)
-(function(){
-if (!('EventSource' in window)) {
-console.warn('SSE not supported in this browser');
-return;
-}
-let source = new EventSource('/events');
-let isOpen = false;
-source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
-source.onmessage = function(e){
-const msg=(e.data||'').trim(); if(!msg) return;
-console.log('SSE message:', msg);
-if (msg==='reload' || msg==='incremental') { location.reload(); }
-// Ignore 'loading' to avoid premature reload loops
-};
-source.onerror = function(e){
-// Let EventSource auto-reconnect instead of forcing a reload
-if (isOpen) console.warn('SSE error after open, retrying...', e);
-};
-window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
-})();
+// // Live reload functionality (robust SSE handling)
+// (function(){
+// if (!('EventSource' in window)) {
+// console.warn('SSE not supported in this browser');
+// return;
+// }
+// let source = new EventSource('/events');
+// let isOpen = false;
+// source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
+// source.onmessage = function(e){
+// const msg=(e.data||'').trim(); if(!msg) return;
+// console.log('SSE message:', msg);
+// if (msg==='reload' || msg==='incremental') { location.reload(); }
+// // Ignore 'loading' to avoid premature reload loops
+// };
+// source.onerror = function(e){
+// // Let EventSource auto-reconnect instead of forcing a reload
+// if (isOpen) console.warn('SSE error after open, retrying...', e);
+// };
+// window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
+// })();


 document.addEventListener('DOMContentLoaded', function() {

@@ -3829,7 +3857,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <div class="system-info">
 <div class="system-info-header">Generated on:</div>
 <div class="system-info-content">
-Linux x86_64 | Linux-5.10.244-240.
+Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
 </div>
 </div>

@@ -3843,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
 <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> |
-Cell: nv | 0.25s
+Cell: nv | 0.23s
 | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
 <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>

@@ -3859,7 +3887,7 @@ Cell: nv | 0.25s
 </div>
 </div>
 <div id="output-nv" class="cell-output">
-<div class="cell-stdout">
+<div class="cell-stdout"><pre class="stdout-text">Thu Oct 23 17:21:49 2025
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
 |-----------------------------------------+------------------------+----------------------+

@@ -3868,7 +3896,7 @@ Cell: nv | 0.25s
 | | | MIG M. |
 |=========================================+========================+======================|
 | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
-| N/A
+| N/A 37C P0 80W / 350W | 0MiB / 46068MiB | 13% Default |
 | | | N/A |
 +-----------------------------------------+------------------------+----------------------+

@@ -3880,19 +3908,19 @@ Cell: nv | 0.25s
 | No running processes found |
 +-----------------------------------------------------------------------------------------+

-</div>
+</pre></div>
 </div>
 </div>

 <h2>SwiGLU Benchmark (torch.compile)</h2>
-<div class="cell
+<div class="cell" id="cell-benchmark">
 <div class="cell-header">
 <span class="collapse-indicators">
 <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
-<span id="uv-indicator-benchmark" style="cursor:
+<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
 </span> |
-Cell: benchmark | 0.05s | FAILED
+Cell: benchmark | 14.79s
 | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>

@@ -3904,12 +3932,12 @@ Cell: benchmark | 0.05s | FAILED
 <span class="c1"># requires-python = ">=3.10"</span>
 <span class="c1"># dependencies = [</span>
 <span class="c1"># "numpy",</span>
-<span class="c1"># "torch",</span>
+<span class="c1"># "torch==2.8.0",</span>
 <span class="c1"># "kernels-benchmark-tools",</span>
 <span class="c1"># ]</span>
 <span class="c1">#</span>
 <span class="c1"># [tool.uv.sources]</span>
-<span class="c1"># kernels-benchmark-tools = { path = "
+<span class="c1"># kernels-benchmark-tools = { path = "../../../../../tools", editable = true }</span>
 <span class="c1"># ///</span>
 <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
 <span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>

@@ -3967,9 +3995,118 @@ Cell: benchmark | 0.05s | FAILED
 </div>
 </div>
 <div id="output-benchmark" class="cell-output">
-<div class="cell-
-
-
+<div class="cell-stdout"><pre class="stdout-text">Running SwiGLU benchmarks on cuda with bfloat16
+Testing 3 workloads
+
+======================================================================
+PROFILE TRACE: compiled_swiglu_max_autotune | llama_T512_D4096
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+compiled_swiglu_max_autotune 0.00% 0.000us 0.00% 0.000us 0.000us 1.851ms 5297.74% 1.851ms 925.622us 2
+compiled_swiglu_max_autotune 0.10% 159.779us 99.99% 166.375ms 166.375ms 0.000us 0.00% 38.816us 38.816us 1
+Torch-Compiled Region: 0/1 1.45% 2.415ms 99.86% 166.157ms 55.386ms 11.007us 31.50% 38.816us 12.939us 3
+aten::_foreach_copy_ 0.02% 39.542us 0.05% 87.165us 29.055us 21.600us 61.81% 21.600us 7.200us 3
+void at::native::(anonymous namespace)::multi_tensor... 0.00% 0.000us 0.00% 0.000us 0.000us 21.600us 61.81% 21.600us 7.200us 3
+CUDAGraphNode.record (dynamo_timed) 0.00% 0.000us 0.00% 0.000us 0.000us 20.673us 59.16% 20.673us 20.673us 1
+triton_poi_fused_mul_silu_0 0.00% 0.000us 0.00% 0.000us 0.000us 11.007us 31.50% 11.007us 3.669us 3
+Activity Buffer Request 0.86% 1.424ms 0.86% 1.424ms 1.424ms 3.872us 11.08% 3.872us 3.872us 1
+CUDAGraphNode.record (dynamo_timed) 96.87% 161.185ms 97.39% 162.045ms 162.045ms 0.000us 0.00% 2.337us 2.337us 1
+aten::fill_ 0.02% 34.251us 0.05% 74.934us 37.467us 2.337us 6.69% 2.337us 1.168us 2
+void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.337us 6.69% 2.337us 1.168us 2
+TorchDynamo Cache Lookup 0.03% 57.633us 0.03% 57.633us 19.211us 0.000us 0.00% 0.000us 0.000us 3
+Pregraph bytecode 0.01% 12.280us 0.01% 12.280us 4.093us 0.000us 0.00% 0.000us 0.000us 3
+AOTDispatcher Runtime Wrapper Prologue 0.01% 21.352us 0.01% 21.352us 7.117us 0.000us 0.00% 0.000us 0.000us 3
+cudaDeviceSynchronize 0.07% 111.205us 0.07% 111.205us 18.534us 0.000us 0.00% 0.000us 0.000us 6
+cudaStreamIsCapturing 0.01% 10.600us 0.01% 10.600us 0.815us 0.000us 0.00% 0.000us 0.000us 13
+cudaEventRecordWithFlags 0.00% 4.751us 0.00% 4.751us 1.584us 0.000us 0.00% 0.000us 0.000us 3
+cudaStreamWaitEvent 0.00% 4.550us 0.00% 4.550us 1.517us 0.000us 0.00% 0.000us 0.000us 3
+aten::empty_strided 0.01% 14.680us 0.01% 14.680us 4.893us 0.000us 0.00% 0.000us 0.000us 3
+cudaLaunchKernel 0.05% 88.306us 0.05% 88.306us 17.661us 0.000us 0.00% 0.000us 0.000us 5
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 166.389ms
+Self CUDA time total: 34.944us
+
+
+
+======================================================================
+PROFILE TRACE: compiled_swiglu_max_autotune | llama_T512_D8192
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+compiled_swiglu_max_autotune 0.00% 0.000us 0.00% 0.000us 0.000us 1.882ms 2857.54% 1.882ms 940.918us 2
+compiled_swiglu_max_autotune 0.08% 131.855us 99.99% 174.569ms 174.569ms 0.000us 0.00% 72.799us 72.799us 1
+Torch-Compiled Region: 0/3 1.26% 2.204ms 99.89% 174.392ms 58.131ms 18.240us 27.70% 72.799us 24.266us 3
+aten::_foreach_copy_ 0.02% 39.114us 0.05% 88.345us 29.448us 45.247us 68.71% 45.247us 15.082us 3
+void at::native::(anonymous namespace)::multi_tensor... 0.00% 0.000us 0.00% 0.000us 0.000us 45.247us 68.71% 45.247us 15.082us 3
+CUDAGraphNode.record (dynamo_timed) 0.00% 0.000us 0.00% 0.000us 0.000us 19.904us 30.22% 19.904us 19.904us 1
+triton_poi_fused_mul_silu_0 0.00% 0.000us 0.00% 0.000us 0.000us 18.240us 27.70% 18.240us 6.080us 3
+Activity Buffer Request 0.83% 1.441ms 0.83% 1.441ms 1.441ms 6.944us 10.54% 6.944us 6.944us 1
+CUDAGraphNode.record (dynamo_timed) 96.65% 168.746ms 97.67% 170.521ms 170.521ms 0.000us 0.00% 2.368us 2.368us 1
+aten::fill_ 0.02% 36.482us 0.04% 78.354us 39.177us 2.368us 3.60% 2.368us 1.184us 2
+void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.368us 3.60% 2.368us 1.184us 2
+TorchDynamo Cache Lookup 0.03% 45.013us 0.03% 45.013us 15.004us 0.000us 0.00% 0.000us 0.000us 3
+Pregraph bytecode 0.01% 9.190us 0.01% 9.190us 3.063us 0.000us 0.00% 0.000us 0.000us 3
+AOTDispatcher Runtime Wrapper Prologue 0.01% 17.071us 0.01% 17.071us 5.690us 0.000us 0.00% 0.000us 0.000us 3
+cudaDeviceSynchronize 0.04% 76.533us 0.04% 76.533us 12.755us 0.000us 0.00% 0.000us 0.000us 6
+cudaStreamIsCapturing 0.01% 9.681us 0.01% 9.681us 0.745us 0.000us 0.00% 0.000us 0.000us 13
+cudaEventRecordWithFlags 0.00% 3.672us 0.00% 3.672us 1.224us 0.000us 0.00% 0.000us 0.000us 3
+cudaStreamWaitEvent 0.00% 3.040us 0.00% 3.040us 1.013us 0.000us 0.00% 0.000us 0.000us 3
+aten::empty_strided 0.01% 12.061us 0.01% 12.061us 4.020us 0.000us 0.00% 0.000us 0.000us 3
+cudaLaunchKernel 0.05% 91.103us 0.05% 91.103us 18.221us 0.000us 0.00% 0.000us 0.000us 5
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 174.590ms
+Self CUDA time total: 65.855us
+
+
+
+======================================================================
+PROFILE TRACE: compiled_swiglu_max_autotune | llama_T512_D11008
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+compiled_swiglu_max_autotune 0.00% 0.000us 0.00% 0.000us 0.000us 1.863ms 1771.89% 1.863ms 931.590us 2
+compiled_swiglu_max_autotune 0.07% 121.234us 99.99% 174.986ms 174.986ms 0.000us 0.00% 113.760us 113.760us 1
+Torch-Compiled Region: 0/5 1.21% 2.117ms 99.90% 174.826ms 58.275ms 24.864us 23.65% 113.760us 37.920us 3
+aten::_foreach_copy_ 0.02% 36.152us 0.05% 83.124us 27.708us 78.144us 74.32% 78.144us 26.048us 3
+void at::native::(anonymous namespace)::multi_tensor... 0.00% 0.000us 0.00% 0.000us 0.000us 78.144us 74.32% 78.144us 26.048us 3
+triton_poi_fused_mul_silu_0 0.00% 0.000us 0.00% 0.000us 0.000us 24.864us 23.65% 24.864us 8.288us 3
+CUDAGraphNode.record (dynamo_timed) 0.00% 0.000us 0.00% 0.000us 0.000us 19.776us 18.81% 19.776us 19.776us 1
+Activity Buffer Request 0.77% 1.349ms 0.77% 1.349ms 1.349ms 8.608us 8.19% 8.608us 8.608us 1
+CUDAGraphNode.record (dynamo_timed) 96.23% 168.408ms 97.80% 171.145ms 171.145ms 0.000us 0.00% 2.144us 2.144us 1
+aten::fill_ 0.02% 32.121us 0.04% 72.933us 36.467us 2.144us 2.04% 2.144us 1.072us 2
+void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.144us 2.04% 2.144us 1.072us 2
+TorchDynamo Cache Lookup 0.02% 38.274us 0.02% 38.274us 12.758us 0.000us 0.00% 0.000us 0.000us 3
+Pregraph bytecode 0.01% 9.421us 0.01% 9.421us 3.140us 0.000us 0.00% 0.000us 0.000us 3
+AOTDispatcher Runtime Wrapper Prologue 0.01% 14.201us 0.01% 14.201us 4.734us 0.000us 0.00% 0.000us 0.000us 3
+cudaDeviceSynchronize 0.04% 73.664us 0.04% 73.664us 12.277us 0.000us 0.00% 0.000us 0.000us 6
+cudaStreamIsCapturing 0.01% 9.722us 0.01% 9.722us 0.748us 0.000us 0.00% 0.000us 0.000us 13
+cudaEventRecordWithFlags 0.00% 3.409us 0.00% 3.409us 1.136us 0.000us 0.00% 0.000us 0.000us 3
+cudaStreamWaitEvent 0.00% 2.910us 0.00% 2.910us 0.970us 0.000us 0.00% 0.000us 0.000us 3
+aten::empty_strided 0.01% 11.600us 0.01% 11.600us 3.867us 0.000us 0.00% 0.000us 0.000us 3
+cudaLaunchKernel 0.05% 87.784us 0.05% 87.784us 17.557us 0.000us 0.00% 0.000us 0.000us 5
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 175.003ms
+Self CUDA time total: 105.152us
+
+
+impl wl p50(ms) ok
+compiled_swiglu_max_autotune llama_T512_D11008 0.11 True
+compiled_swiglu_max_autotune llama_T512_D4096 0.10 True
+compiled_swiglu_max_autotune llama_T512_D8192 0.11 True
+</pre></div>
+<div class="uv-install-logs" id="uv-logs-benchmark">
+<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
+<div class="uv-logs-content" style="display: none;">
+Installed 37 packages in 247ms
+</div>
+</div>
+<div class="cell-artifacts">
+<h4>Artifacts:</h4>
+<a href="artifacts/benchmark/activation.jsonl" class="artifact" target="_blank">activation.jsonl</a>
 </div>
 </div>
 </div>
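The PROFILE TRACE tables in the new output have the shape of torch.profiler key_averages() dumps. A hedged sketch of how such a trace is typically captured for the compiled variant; this is generic profiler usage under assumed shapes (512 tokens, hidden dim 4096, bfloat16, as in llama_T512_D4096), not the kernels-benchmark-tools harness itself, which is not part of this diff:

import torch
from torch.profiler import ProfilerActivity, profile, record_function

def swiglu(x: torch.Tensor) -> torch.Tensor:
    # SwiGLU over a concatenated projection: silu(gate) * up
    gate, up = x.chunk(2, dim=-1)
    return torch.nn.functional.silu(gate) * up

compiled = torch.compile(swiglu, mode="max-autotune")
x = torch.randn(512, 2 * 4096, device="cuda", dtype=torch.bfloat16)
compiled(x)  # warm up so compile time stays out of the trace (cf. compile_ms in the JSONL)

with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
    with record_function("compiled_swiglu_max_autotune"):
        compiled(x)
    torch.cuda.synchronize()

print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=20))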
activation/impls/hf_kernels_swiglu.html
CHANGED
@@ -706,6 +706,29 @@
 white-space: pre-wrap;
 color: var(--text-primary);
 }
+
+.cell-stdout {
+background: var(--bg-tertiary);
+padding: 0.75rem;
+border-radius: 1px;
+font-family: inherit;
+font-size: 0.9rem;
+color: var(--text-primary);
+
+/* key bits */
+overflow: auto; /* show scrollbars when needed */
+max-width: 100%; /* respects whatever layout width you give it */
+}
+
+.cell-stdout .stdout-text {
+margin: 0; /* reset pre default margin */
+white-space: pre; /* keep line breaks, NO wrapping */
+display: inline-block; /* shrink-to-content */
+min-width: max-content; /* allow very long lines to define intrinsic width */
+font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace;
+tab-size: 2;
+}
+
 .cell-stderr {
 background: var(--bg-error);
 border-left: 2px solid var(--border-error);

@@ -3556,7 +3579,12 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 if(output){
 output.classList.remove('output-stale');
 let html='';
-if(data.stdout)
+if (data.stdout) {
+  html += '<div class="cell-stdout"><pre class="stdout-text">'
+    + escapeHtml(data.stdout)
+    + '</pre></div>';
+}
+
 console.log('UV Logs:', data);
 if(data.stderr) {
 // Split UV logs from regular stderr

@@ -3678,27 +3706,27 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 }
 }

-// Live reload functionality (robust SSE handling)
-(function(){
-if (!('EventSource' in window)) {
-console.warn('SSE not supported in this browser');
-return;
-}
-let source = new EventSource('/events');
-let isOpen = false;
-source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
-source.onmessage = function(e){
-const msg=(e.data||'').trim(); if(!msg) return;
-console.log('SSE message:', msg);
-if (msg==='reload' || msg==='incremental') { location.reload(); }
-// Ignore 'loading' to avoid premature reload loops
-};
-source.onerror = function(e){
-// Let EventSource auto-reconnect instead of forcing a reload
-if (isOpen) console.warn('SSE error after open, retrying...', e);
-};
-window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
-})();
+// // Live reload functionality (robust SSE handling)
+// (function(){
+// if (!('EventSource' in window)) {
+// console.warn('SSE not supported in this browser');
+// return;
+// }
+// let source = new EventSource('/events');
+// let isOpen = false;
+// source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
+// source.onmessage = function(e){
+// const msg=(e.data||'').trim(); if(!msg) return;
+// console.log('SSE message:', msg);
+// if (msg==='reload' || msg==='incremental') { location.reload(); }
+// // Ignore 'loading' to avoid premature reload loops
+// };
+// source.onerror = function(e){
+// // Let EventSource auto-reconnect instead of forcing a reload
+// if (isOpen) console.warn('SSE error after open, retrying...', e);
+// };
+// window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
+// })();


 document.addEventListener('DOMContentLoaded', function() {

@@ -3829,7 +3857,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <div class="system-info">
 <div class="system-info-header">Generated on:</div>
 <div class="system-info-content">
-Linux x86_64 | Linux-5.10.244-240.
+Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
 </div>
 </div>

@@ -3843,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
 <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> |
-Cell: nv | 0.25s
+Cell: nv | 0.23s
 | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
 <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>

@@ -3859,7 +3887,7 @@ Cell: nv | 0.25s
 </div>
 </div>
 <div id="output-nv" class="cell-output">
-<div class="cell-stdout">
+<div class="cell-stdout"><pre class="stdout-text">Thu Oct 23 17:21:49 2025
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
 |-----------------------------------------+------------------------+----------------------+

@@ -3868,7 +3896,7 @@ Cell: nv | 0.25s
 | | | MIG M. |
 |=========================================+========================+======================|
 | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
-| N/A
+| N/A 37C P0 80W / 350W | 0MiB / 46068MiB | 13% Default |
 | | | N/A |
 +-----------------------------------------+------------------------+----------------------+

@@ -3880,19 +3908,19 @@ Cell: nv | 0.25s
 | No running processes found |
 +-----------------------------------------------------------------------------------------+

-</div>
+</pre></div>
 </div>
 </div>

 <h2>SwiGLU Benchmark</h2>
-<div class="cell
+<div class="cell" id="cell-benchmark">
 <div class="cell-header">
 <span class="collapse-indicators">
 <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
-<span id="uv-indicator-benchmark" style="cursor:
+<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
 </span> |
-Cell: benchmark | 0.01s | FAILED
+Cell: benchmark | 4.10s
 | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>

@@ -3904,13 +3932,13 @@ Cell: benchmark | 0.01s | FAILED
 <span class="c1"># requires-python = ">=3.10"</span>
 <span class="c1"># dependencies = [</span>
 <span class="c1"># "numpy",</span>
-<span class="c1"># "torch",</span>
+<span class="c1"># "torch==2.8.0",</span>
 <span class="c1"># "kernels-benchmark-tools",</span>
 <span class="c1"># "kernels",</span>
 <span class="c1"># ]</span>
 <span class="c1">#</span>
 <span class="c1"># [tool.uv.sources]</span>
-<span class="c1"># kernels-benchmark-tools = { path = "
+<span class="c1"># kernels-benchmark-tools = { path = "../../../../../tools", editable = true }</span>
 <span class="c1"># ///</span>
 <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
 <span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>

@@ -3970,9 +3998,85 @@ Cell: benchmark | 0.01s | FAILED
 </div>
 </div>
 <div id="output-benchmark" class="cell-output">
-<div class="cell-
-
-
+<div class="cell-stdout"><pre class="stdout-text">Running SwiGLU benchmarks on cuda with bfloat16
+Testing 3 workloads
+
+======================================================================
+PROFILE TRACE: hf_kernels_swiglu | llama_T512_D4096
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 77.600us 379.50% 77.600us 77.600us 1
+hf_kernels_swiglu 9.39% 165.439us 99.61% 1.754ms 1.754ms 0.000us 0.00% 27.360us 27.360us 1
+_activation_beeaae6::silu_and_mul 1.24% 21.822us 87.35% 1.539ms 512.861us 20.448us 100.00% 27.360us 9.120us 3
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 20.448us 100.00% 20.448us 6.816us 3
+Activity Buffer Request 83.94% 1.478ms 83.94% 1.478ms 1.478ms 6.912us 33.80% 6.912us 6.912us 1
+aten::empty 2.87% 50.462us 2.87% 50.462us 16.821us 0.000us 0.00% 0.000us 0.000us 3
+cudaLaunchKernel 2.17% 38.291us 2.17% 38.291us 12.764us 0.000us 0.00% 0.000us 0.000us 3
+cudaDeviceSynchronize 0.39% 6.830us 0.39% 6.830us 6.830us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.761ms
+Self CUDA time total: 20.448us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_swiglu | llama_T512_D8192
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 70.720us 154.22% 70.720us 70.720us 1
+hf_kernels_swiglu 5.60% 88.845us 99.68% 1.581ms 1.581ms 0.000us 0.00% 69.152us 69.152us 1
+_activation_beeaae6::silu_and_mul 1.32% 20.881us 92.90% 1.474ms 491.244us 45.856us 100.00% 69.152us 23.051us 3
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 45.856us 100.00% 45.856us 15.285us 3
+Activity Buffer Request 89.94% 1.427ms 89.94% 1.427ms 1.427ms 23.296us 50.80% 23.296us 23.296us 1
+aten::empty 1.18% 18.690us 1.18% 18.690us 6.230us 0.000us 0.00% 0.000us 0.000us 3
+cudaLaunchKernel 1.64% 25.971us 1.64% 25.971us 8.657us 0.000us 0.00% 0.000us 0.000us 3
+cudaDeviceSynchronize 0.32% 5.141us 0.32% 5.141us 5.141us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.586ms
+Self CUDA time total: 45.856us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_swiglu | llama_T512_D11008
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+hf_kernels_swiglu 5.54% 88.883us 99.68% 1.600ms 1.600ms 0.000us 0.00% 123.326us 123.326us 1
+_activation_beeaae6::silu_and_mul 1.34% 21.482us 92.90% 1.491ms 497.111us 75.967us 100.00% 123.326us 41.109us 3
+hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 81.632us 107.46% 81.632us 81.632us 1
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 75.967us 100.00% 75.967us 25.322us 3
+Activity Buffer Request 89.90% 1.443ms 89.90% 1.443ms 1.443ms 47.359us 62.34% 47.359us 47.359us 1
+aten::empty 1.25% 19.991us 1.25% 19.991us 6.664us 0.000us 0.00% 0.000us 0.000us 3
+cudaLaunchKernel 1.65% 26.561us 1.65% 26.561us 8.854us 0.000us 0.00% 0.000us 0.000us 3
+cudaDeviceSynchronize 0.32% 5.170us 0.32% 5.170us 5.170us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.605ms
+Self CUDA time total: 75.967us
+
+
+impl wl p50(ms) ok
+hf_kernels_swiglu llama_T512_D11008 0.03 True
+hf_kernels_swiglu llama_T512_D4096 0.02 True
+hf_kernels_swiglu llama_T512_D8192 0.03 True
+</pre></div>
+<div class="uv-install-logs" id="uv-logs-benchmark">
+<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
+<div class="uv-logs-content" style="display: none;">
+Installed 10 packages in 14ms
+</div>
+</div>
+<div class="cell-stderr">Fetching 7 files: 0%| | 0/7 [00:00<?, ?it/s]
+Fetching 7 files: 71%|███████▏ | 5/7 [00:00<00:00, 14.56it/s]
+Fetching 7 files: 100%|██████████| 7/7 [00:00<00:00, 20.37it/s]</div>
+<div class="cell-artifacts">
+<h4>Artifacts:</h4>
+<a href="artifacts/benchmark/activation.jsonl" class="artifact" target="_blank">activation.jsonl</a>
 </div>
 </div>
 </div>
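The hf_kernels_swiglu trace shows _activation_beeaae6::silu_and_mul dispatching into vllm::act_and_mul_kernel, and the cell stderr shows a 7-file Hub download, which is the kernels package fetching a prebuilt binary. A sketch of that call path; the kernels-community/activation repo id and the silu_and_mul(out, x) signature are assumptions inferred from the kernel names above, not shown in this diff:

import torch
from kernels import get_kernel

# Downloads the prebuilt kernel from the Hub on first use
# (the "Fetching 7 files" progress in the cell stderr above).
activation = get_kernel("kernels-community/activation")

x = torch.randn(512, 2 * 4096, device="cuda", dtype=torch.bfloat16)
out = torch.empty(512, 4096, device="cuda", dtype=torch.bfloat16)
activation.silu_and_mul(out, x)  # out = silu(x[..., :4096]) * x[..., 4096:]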
activation/impls/torch_swiglu.html
CHANGED
|
@@ -706,6 +706,29 @@
|
|
| 706 |
white-space: pre-wrap;
|
| 707 |
color: var(--text-primary);
|
| 708 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 709 |
.cell-stderr {
|
| 710 |
background: var(--bg-error);
|
| 711 |
border-left: 2px solid var(--border-error);
|
|
@@ -3556,7 +3579,12 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3556 |
if(output){
|
| 3557 |
output.classList.remove('output-stale');
|
| 3558 |
let html='';
|
| 3559 |
-
if(data.stdout)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3560 |
console.log('UV Logs:', data);
|
| 3561 |
if(data.stderr) {
|
| 3562 |
// Split UV logs from regular stderr
|
|
@@ -3678,27 +3706,27 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3678 |
}
|
| 3679 |
}
|
| 3680 |
|
| 3681 |
-
// Live reload functionality (robust SSE handling)
|
| 3682 |
-
(function(){
|
| 3683 |
-
|
| 3684 |
-
|
| 3685 |
-
|
| 3686 |
-
|
| 3687 |
-
|
| 3688 |
-
|
| 3689 |
-
|
| 3690 |
-
|
| 3691 |
-
|
| 3692 |
-
|
| 3693 |
-
|
| 3694 |
-
|
| 3695 |
-
|
| 3696 |
-
|
| 3697 |
-
|
| 3698 |
-
|
| 3699 |
-
|
| 3700 |
-
|
| 3701 |
-
})();
|
| 3702 |
|
| 3703 |
|
| 3704 |
document.addEventListener('DOMContentLoaded', function() {
|
|
@@ -3829,7 +3857,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3829 |
<div class="system-info">
|
| 3830 |
<div class="system-info-header">Generated on:</div>
|
| 3831 |
<div class="system-info-content">
|
| 3832 |
-
Linux x86_64 | Linux-5.10.244-240.
|
| 3833 |
</div>
|
| 3834 |
</div>
|
| 3835 |
|
|
@@ -3843,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3843 |
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3844 |
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3845 |
</span> |
|
| 3846 |
-
Cell: nv | 0.
|
| 3847 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3848 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3849 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3859,7 +3887,7 @@ Cell: nv | 0.25s
|
|
| 3859 |
</div>
|
| 3860 |
</div>
|
| 3861 |
<div id="output-nv" class="cell-output">
|
| 3862 |
-
<div class="cell-stdout">
|
| 3863 |
+-----------------------------------------------------------------------------------------+
|
| 3864 |
| NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
|
| 3865 |
|-----------------------------------------+------------------------+----------------------+
|
|
@@ -3868,7 +3896,7 @@ Cell: nv | 0.25s
|
|
| 3868 |
| | | MIG M. |
|
| 3869 |
|=========================================+========================+======================|
|
| 3870 |
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
|
| 3871 |
-
| N/A
|
| 3872 |
| | | N/A |
|
| 3873 |
+-----------------------------------------+------------------------+----------------------+
|
| 3874 |
|
|
@@ -3880,19 +3908,19 @@ Cell: nv | 0.25s
|
|
| 3880 |
| No running processes found |
|
| 3881 |
+-----------------------------------------------------------------------------------------+
|
| 3882 |
|
| 3883 |
-
</div>
|
| 3884 |
</div>
|
| 3885 |
</div>
|
| 3886 |
|
| 3887 |
<h2>SwiGLU Benchmark (PyTorch Native)</h2>
|
| 3888 |
-
<div class="cell
|
| 3889 |
<div class="cell-header">
|
| 3890 |
<span class="collapse-indicators">
|
| 3891 |
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
|
| 3892 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3893 |
<span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3894 |
</span> |
|
| 3895 |
-
Cell: benchmark |
|
| 3896 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3897 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3898 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3904,12 +3932,12 @@ Cell: benchmark | 0.02s | FAILED
|
|
| 3904 |
<span class="c1"># requires-python = ">=3.10"</span>
|
| 3905 |
<span class="c1"># dependencies = [</span>
|
| 3906 |
<span class="c1"># "numpy",</span>
|
| 3907 |
-
<span class="c1"># "torch",</span>
|
| 3908 |
<span class="c1"># "kernels-benchmark-tools",</span>
|
| 3909 |
<span class="c1"># ]</span>
|
| 3910 |
<span class="c1">#</span>
|
| 3911 |
<span class="c1"># [tool.uv.sources]</span>
|
| 3912 |
-
<span class="c1"># kernels-benchmark-tools = { path = "
|
| 3913 |
<span class="c1"># ///</span>
|
| 3914 |
<span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
|
| 3915 |
<span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
|
|
@@ -3966,9 +3994,85 @@ Cell: benchmark | 0.02s | FAILED
|
|
| 3966 |
</div>
|
| 3967 |
</div>
|
| 3968 |
<div id="output-benchmark" class="cell-output">
|
| 3969 |
-
<div class="cell-
|
| 3970 |
-
|
| 3971 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3972 |
</div>
|
| 3973 |
</div>
|
| 3974 |
</div>
|
|
|
|
| 706 |
white-space: pre-wrap;
|
| 707 |
color: var(--text-primary);
|
| 708 |
}
|
| 709 |
+
|
| 710 |
+
.cell-stdout {
|
| 711 |
+
background: var(--bg-tertiary);
|
| 712 |
+
padding: 0.75rem;
|
| 713 |
+
border-radius: 1px;
|
| 714 |
+
font-family: inherit;
|
| 715 |
+
font-size: 0.9rem;
|
| 716 |
+
color: var(--text-primary);
|
| 717 |
+
|
| 718 |
+
/* key bits */
|
| 719 |
+
overflow: auto; /* show scrollbars when needed */
|
| 720 |
+
max-width: 100%; /* respects whatever layout width you give it */
|
| 721 |
+
}
|
| 722 |
+
|
| 723 |
+
.cell-stdout .stdout-text {
|
| 724 |
+
margin: 0; /* reset pre default margin */
|
| 725 |
+
white-space: pre; /* keep line breaks, NO wrapping */
|
| 726 |
+
display: inline-block; /* shrink-to-content */
|
| 727 |
+
min-width: max-content; /* allow very long lines to define intrinsic width */
|
| 728 |
+
font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace;
|
| 729 |
+
tab-size: 2;
|
| 730 |
+
}
|
| 731 |
+
|
| 732 |
.cell-stderr {
|
| 733 |
background: var(--bg-error);
|
| 734 |
border-left: 2px solid var(--border-error);
|
|
|
|
| 3579 |
if(output){
|
| 3580 |
output.classList.remove('output-stale');
|
| 3581 |
let html='';
|
| 3582 |
+
if (data.stdout) {
|
| 3583 |
+
html += '<div class="cell-stdout"><pre class="stdout-text">'
|
| 3584 |
+
+ escapeHtml(data.stdout)
|
| 3585 |
+
+ '</pre></div>';
|
| 3586 |
+
}
|
| 3587 |
+
|
| 3588 |
console.log('UV Logs:', data);
|
| 3589 |
if(data.stderr) {
|
| 3590 |
// Split UV logs from regular stderr
|
|
|
|
| 3706 |
}
|
| 3707 |
}
|
| 3708 |
|
| 3709 |
+
// // Live reload functionality (robust SSE handling)
|
| 3710 |
+
// (function(){
|
| 3711 |
+
// if (!('EventSource' in window)) {
|
| 3712 |
+
// console.warn('SSE not supported in this browser');
|
| 3713 |
+
// return;
|
| 3714 |
+
// }
|
| 3715 |
+
// let source = new EventSource('/events');
|
| 3716 |
+
// let isOpen = false;
|
| 3717 |
+
// source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
|
| 3718 |
+
// source.onmessage = function(e){
|
| 3719 |
+
// const msg=(e.data||'').trim(); if(!msg) return;
|
| 3720 |
+
// console.log('SSE message:', msg);
|
| 3721 |
+
// if (msg==='reload' || msg==='incremental') { location.reload(); }
|
| 3722 |
+
// // Ignore 'loading' to avoid premature reload loops
|
| 3723 |
+
// };
|
| 3724 |
+
// source.onerror = function(e){
|
| 3725 |
+
// // Let EventSource auto-reconnect instead of forcing a reload
|
| 3726 |
+
// if (isOpen) console.warn('SSE error after open, retrying...', e);
|
| 3727 |
+
// };
|
| 3728 |
+
// window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
|
| 3729 |
+
// })();
|
| 3730 |
|
| 3731 |
|
| 3732 |
document.addEventListener('DOMContentLoaded', function() {
|
|
|
|
| 3857 |
<div class="system-info">
|
| 3858 |
<div class="system-info-header">Generated on:</div>
|
| 3859 |
<div class="system-info-content">
|
| 3860 |
+
Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
|
| 3861 |
</div>
|
| 3862 |
</div>
|
| 3863 |
|
|
|
|
...
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
  </span> |
+ Cell: nv | 0.23s
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
...
  </div>
  </div>
  <div id="output-nv" class="cell-output">
+ <div class="cell-stdout"><pre class="stdout-text">Thu Oct 23 17:21:49 2025
+ +-----------------------------------------------------------------------------------------+
+ | NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
+ |-----------------------------------------+------------------------+----------------------+
...
+ | | | MIG M. |
+ |=========================================+========================+======================|
+ | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
+ | N/A 37C P0 80W / 350W | 0MiB / 46068MiB | 13% Default |
+ | | | N/A |
+ +-----------------------------------------+------------------------+----------------------+
+
...
+ | No running processes found |
+ +-----------------------------------------------------------------------------------------+
+
+ </pre></div>
  </div>
  </div>
...
  <h2>SwiGLU Benchmark (PyTorch Native)</h2>
+ <div class="cell" id="cell-benchmark">
  <div class="cell-header">
  <span class="collapse-indicators">
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
  </span> |
+ Cell: benchmark | 3.41s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
...
  # requires-python = ">=3.10"
  # dependencies = [
  #   "numpy",
+ #   "torch==2.8.0",
  #   "kernels-benchmark-tools",
  # ]
  #
  # [tool.uv.sources]
+ # kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
  # ///
  import torch
  import sys
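Note: the cell's function body is collapsed in this view, but the trace below (two aten::slice calls, one aten::silu and one aten::mul per invocation) pins down what the PyTorch-native path does. A minimal sketch of a SwiGLU equivalent to that kernel sequence, assuming the packed-projection convention implied by the workloads (llama_T512_D4096 = 512 tokens, hidden_dim 4096, bfloat16):

import torch
import torch.nn.functional as F

def torch_swiglu(x: torch.Tensor) -> torch.Tensor:
    # Split the packed up/gate projection in half along the last dim
    # (the two aten::slice calls), then SiLU-gate one half with the
    # other (aten::silu followed by aten::mul in the trace).
    d = x.shape[-1] // 2
    return F.silu(x[..., :d]) * x[..., d:]

x = torch.randn(512, 2 * 4096, device="cuda", dtype=torch.bfloat16)
y = torch_swiglu(x)  # shape (512, 4096)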
...
  </div>
  </div>
  <div id="output-benchmark" class="cell-output">
+ <div class="cell-stdout"><pre class="stdout-text">Running SwiGLU benchmarks on cuda with bfloat16
+ Testing 3 workloads
+
+ ======================================================================
+ PROFILE TRACE: torch_swiglu | llama_T512_D4096
+ ======================================================================
+ -------------------------------------------------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+ -------------------------------------------------------
+ torch_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 170.400us 513.50% 170.400us 170.400us 1
+ torch_swiglu 10.35% 190.189us 99.61% 1.830ms 1.830ms 0.000us 0.00% 39.104us 39.104us 1
+ aten::silu 3.11% 57.064us 83.35% 1.532ms 510.522us 17.280us 52.07% 23.200us 7.733us 3
+ void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 17.280us 52.07% 17.280us 5.760us 3
+ aten::mul 2.20% 40.433us 3.25% 59.723us 19.908us 15.904us 47.93% 15.904us 5.301us 3
+ void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 15.904us 47.93% 15.904us 5.301us 3
+ Activity Buffer Request 77.87% 1.431ms 77.87% 1.431ms 1.431ms 5.920us 17.84% 5.920us 5.920us 1
+ aten::slice 2.14% 39.352us 2.66% 48.892us 8.149us 0.000us 0.00% 0.000us 0.000us 6
+ aten::as_strided 0.52% 9.540us 0.52% 9.540us 1.590us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 3.42% 62.871us 3.42% 62.871us 10.479us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.39% 7.170us 0.39% 7.170us 7.170us 0.000us 0.00% 0.000us 0.000us 1
+ -------------------------------------------------------
+ Self CPU time total: 1.838ms
+ Self CUDA time total: 33.184us
+
+
+ ======================================================================
+ PROFILE TRACE: torch_swiglu | llama_T512_D8192
+ ======================================================================
+ -------------------------------------------------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+ -------------------------------------------------------
+ torch_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 144.478us 207.68% 144.478us 144.478us 1
+ torch_swiglu 6.51% 109.976us 99.67% 1.683ms 1.683ms 0.000us 0.00% 87.038us 87.038us 1
+ aten::silu 2.61% 44.013us 89.15% 1.506ms 501.918us 36.351us 52.25% 53.823us 17.941us 3
+ void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 36.351us 52.25% 36.351us 12.117us 3
+ aten::mul 1.57% 26.450us 2.46% 41.521us 13.840us 33.215us 47.75% 33.215us 11.072us 3
+ void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 33.215us 47.75% 33.215us 11.072us 3
+ Activity Buffer Request 84.91% 1.434ms 84.91% 1.434ms 1.434ms 17.472us 25.12% 17.472us 17.472us 1
+ aten::slice 1.23% 20.821us 1.55% 26.141us 4.357us 0.000us 0.00% 0.000us 0.000us 6
+ aten::as_strided 0.31% 5.320us 0.31% 5.320us 0.887us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 2.52% 42.602us 2.52% 42.602us 7.100us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.33% 5.630us 0.33% 5.630us 5.630us 0.000us 0.00% 0.000us 0.000us 1
+ -------------------------------------------------------
+ Self CPU time total: 1.689ms
+ Self CUDA time total: 69.566us
+
+
+ ======================================================================
+ PROFILE TRACE: torch_swiglu | llama_T512_D11008
+ ======================================================================
+ -------------------------------------------------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+ -------------------------------------------------------
+ torch_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 147.999us 151.09% 147.999us 147.999us 1
+ torch_swiglu 7.64% 131.036us 99.70% 1.710ms 1.710ms 0.000us 0.00% 124.063us 124.063us 1
+ aten::silu 2.56% 43.903us 88.06% 1.510ms 503.475us 50.015us 51.06% 76.127us 25.376us 3
+ void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 50.015us 51.06% 50.015us 16.672us 3
+ aten::mul 1.50% 25.771us 2.43% 41.641us 13.880us 47.936us 48.94% 47.936us 15.979us 3
+ void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 47.936us 48.94% 47.936us 15.979us 3
+ Activity Buffer Request 83.94% 1.440ms 83.94% 1.440ms 1.440ms 26.112us 26.66% 26.112us 26.112us 1
+ aten::slice 1.28% 22.003us 1.58% 27.082us 4.514us 0.000us 0.00% 0.000us 0.000us 6
+ aten::as_strided 0.30% 5.079us 0.30% 5.079us 0.846us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 2.48% 42.561us 2.48% 42.561us 7.093us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.30% 5.120us 0.30% 5.120us 5.120us 0.000us 0.00% 0.000us 0.000us 1
+ -------------------------------------------------------
+ Self CPU time total: 1.715ms
+ Self CUDA time total: 97.951us
+
+
+ impl wl p50(ms) ok
+ torch_swiglu llama_T512_D11008 0.05 True
+ torch_swiglu llama_T512_D4096 0.04 True
+ torch_swiglu llama_T512_D8192 0.05 True
+ </pre></div>
+ <div class="cell-artifacts">
+ <h4>Artifacts:</h4>
+ <a href="artifacts/benchmark/activation.jsonl" class="artifact" target="_blank">activation.jsonl</a>
  </div>
  </div>
  </div>
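Note: the PROFILE TRACE tables above are standard torch.profiler output. A sketch of how such a table is produced (the actual harness lives in kernels-benchmark-tools and is not shown in this diff; the sort key is an assumption):

import torch
from torch.profiler import profile, ProfilerActivity

def profile_once(fn, *args):
    # Record CPU and CUDA activity for one call, then print a sorted
    # key_averages table like the traces above.
    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
        fn(*args)
        torch.cuda.synchronize()
    print(prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=20))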
activation/results/artifacts/combine/latency.svg
ADDED
(binary; stored via Git LFS)

activation/results/cells/combine.py
ADDED

@@ -0,0 +1,27 @@
+ # /// script
+ # requires-python = ">=3.10"
+ # dependencies = [
+ #   "numpy",
+ #   "torch==2.8.0",
+ #   "kernels-benchmark-tools",
+ #   "matplotlib",
+ # ]
+ #
+ # [tool.uv.sources]
+ # kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+ # ///
+ from kernels_benchmark_tools.core.visuals import generate_combined_results
+
+ # Map display names to uvnote environment variables
+ cache_env_map = {
+     "HF Kernels SwiGLU": "UVNOTE_FILE_HF_KERNELS_SWIGLU_BENCHMARK",
+     "PyTorch SwiGLU": "UVNOTE_FILE_TORCH_SWIGLU_BENCHMARK",
+     "Compiled SwiGLU": "UVNOTE_FILE_COMPILED_SWIGLU_BENCHMARK",
+ }
+
+ # Generate combined results with visualization
+ generate_combined_results(
+     cache_env_map=cache_env_map,
+     output_filename="activation.jsonl",
+     svg_filename="latency.svg"
+ )
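Note: each benchmark cell writes one JSON record per workload (the activation.jsonl lines above), and the combiner reads them back through the UVNOTE_FILE_* variables. The real reduction is inside generate_combined_results; a rough, hypothetical sketch of the p50 extraction it implies (load_p50s and the env-var semantics are assumptions, not the library's API):

import json
import os
from pathlib import Path

def load_p50s(cache_env_map, filename="activation.jsonl"):
    # Returns {display name: {workload name: p50 latency in ms}}.
    results = {}
    for name, env_var in cache_env_map.items():
        cell_dir = os.environ.get(env_var)  # assumed: uvnote exports the cached cell directory
        if not cell_dir:
            continue
        for line in (Path(cell_dir) / filename).read_text().splitlines():
            rec = json.loads(line)
            if rec.get("ok"):
                results.setdefault(name, {})[rec["wl"]["name"]] = rec["lat_ms"]["p50"]
    return results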
activation/results/combined_results.html
ADDED
(diff too large to render; see raw diff)
flash_attn/impls/artifacts/benchmark/attn.jsonl
CHANGED

@@ -1,6 +1,6 @@
(all six records rewritten; the old lines are truncated in this view to {"ts": "2025-10- ...; the new records:)
{"ts": "2025-10-23T17:22:14Z", "run": "1b435099e2fc4712a8a21c79100926bd", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.17766900009519304, "p50": 0.1805790000162233, "p90": 0.1809689999845432, "mean": 0.18065700000988727, "iqr": 0.0005199999577598646, "raw_times": [0.17766900009519304, 0.18044900002678332, 0.1836189999266935, 0.1805790000162233, 0.1809689999845432], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.18813999986377894, "peak_bytes": 87425024, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.8014183044433594e-06, "ref": "sdpa_math_fp32"}, "err": null}
{"ts": "2025-10-23T17:22:14Z", "run": "1b435099e2fc4712a8a21c79100926bd", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2040300000771822, "p50": 0.208629999860932, "p90": 0.20883999991383462, "mean": 0.2071937999517104, "iqr": 0.004771000021719374, "raw_times": [0.208629999860932, 0.2104000000144879, 0.20883999991383462, 0.20406899989211524, 0.2040300000771822], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2113399998506793, "peak_bytes": 95027200, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000347137451171875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
{"ts": "2025-10-23T17:22:15Z", "run": "1b435099e2fc4712a8a21c79100926bd", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21214000003055844, "p50": 0.22414099998968595, "p90": 0.22725099984199915, "mean": 0.22296499996627972, "iqr": 0.014549999832524918, "raw_times": [0.22414099998968595, 0.21270100000947423, 0.23859199995968083, 0.21214000003055844, 0.22725099984199915], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.215200000184268, "peak_bytes": 99680256, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000347137451171875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
{"ts": "2025-10-23T17:22:15Z", "run": "1b435099e2fc4712a8a21c79100926bd", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2172510000946204, "p50": 0.21762999995189602, "p90": 0.229150999984995, "mean": 0.22471280003628635, "iqr": 0.011839999842777615, "raw_times": [0.2172510000946204, 0.229150999984995, 0.21762999995189602, 0.24222100000770297, 0.21731100014221738], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22035099982531392, "peak_bytes": 104726528, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
{"ts": "2025-10-23T17:22:15Z", "run": "1b435099e2fc4712a8a21c79100926bd", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2690430001166533, "p50": 0.2719639999213541, "p90": 0.2809840000281838, "mean": 0.27520160006133665, "iqr": 0.011710999842762249, "raw_times": [0.2719639999213541, 0.26927300018542155, 0.2690430001166533, 0.2809840000281838, 0.2847440000550705], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.26890300000559364, "peak_bytes": 108855296, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
{"ts": "2025-10-23T17:22:15Z", "run": "1b435099e2fc4712a8a21c79100926bd", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.27566299991121923, "p50": 0.2808829999594309, "p90": 0.29306400006134936, "mean": 0.2846773999863217, "iqr": 0.01699100016594457, "raw_times": [0.29306400006134936, 0.2760729998954048, 0.2808829999594309, 0.29770400010420417, 0.27566299991121923], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2820939998855465, "peak_bytes": 114425856, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}

flash_attn/impls/artifacts/benchmark_default/attn_default.jsonl
CHANGED

@@ -1,6 +1,6 @@
(all six records rewritten; the old lines are truncated in this view to {"ts": "2025-10- ...; the new records:)
{"ts": "2025-10-23T17:21:26Z", "run": "ab04a13d0bc147c5aeb6a1c30c52fb98", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.19892000000254484, "p50": 0.20128900018789864, "p90": 0.20218000008753734, "mean": 0.20126180006627692, "iqr": 0.0013400001535046613, "raw_times": [0.19892000000254484, 0.20083999993403268, 0.20128900018789864, 0.2030800001193711, 0.20218000008753734], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2210709999417304, "peak_bytes": 152174592, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.8014183044433594e-06, "ref": "sdpa_math_fp32"}, "err": null}
{"ts": "2025-10-23T17:21:26Z", "run": "ab04a13d0bc147c5aeb6a1c30c52fb98", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.229150999984995, "p50": 0.22967100017012854, "p90": 0.23078200001691584, "mean": 0.23312540001825255, "iqr": 0.0012210000477352878, "raw_times": [0.23078200001691584, 0.22956099996918056, 0.22967100017012854, 0.2464619999500428, 0.229150999984995], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2712529999371327, "peak_bytes": 163971072, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000347137451171875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
{"ts": "2025-10-23T17:21:27Z", "run": "ab04a13d0bc147c5aeb6a1c30c52fb98", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2344019999327429, "p50": 0.23504099999627215, "p90": 0.23719199998595286, "mean": 0.23960979997355025, "iqr": 0.0026000000161729986, "raw_times": [0.2568219999830035, 0.23719199998595286, 0.2344019999327429, 0.23459199996977986, 0.23504099999627215], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.24443200004498067, "peak_bytes": 167116800, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000347137451171875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
{"ts": "2025-10-23T17:21:27Z", "run": "ab04a13d0bc147c5aeb6a1c30c52fb98", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.23659099997530575, "p50": 0.23880100002315885, "p90": 0.23884200004431477, "mean": 0.23843920002946106, "iqr": 0.0014209999790182337, "raw_times": [0.23880100002315885, 0.2405410000392294, 0.23742100006529654, 0.23884200004431477, 0.23659099997530575], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.25097200000345765, "peak_bytes": 169345024, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
{"ts": "2025-10-23T17:21:27Z", "run": "ab04a13d0bc147c5aeb6a1c30c52fb98", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": null, "compile_ms": null, "peak_bytes": null, "ok": false, "absmax": null, "corr": {}, "err": {"type": "FailOnRecompileLimitHit", "msg": "recompile_limit reached with one_graph=True. Excessive recompilations can degrade performance due to the compilation overhead of each recompilation. To monitor recompilations, enable TORCH_LOGS=recompiles. If recompilations are expected, consider increasing torch._dynamo.config.cache_size_limit to an appropriate value."}}
{"ts": "2025-10-23T17:21:27Z", "run": "ab04a13d0bc147c5aeb6a1c30c52fb98", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": null, "compile_ms": null, "peak_bytes": null, "ok": false, "absmax": null, "corr": {}, "err": {"type": "FailOnRecompileLimitHit", "msg": "recompile_limit reached with one_graph=True. Excessive recompilations can degrade performance due to the compilation overhead of each recompilation. To monitor recompilations, enable TORCH_LOGS=recompiles. If recompilations are expected, consider increasing torch._dynamo.config.cache_size_limit to an appropriate value."}}

flash_attn/impls/artifacts/benchmark_max_autotune/attn_max_autotune.jsonl
CHANGED

@@ -1,6 +1,6 @@
(all six records rewritten; the old lines are truncated in this view to {"ts": "2025-10- ...; the new records:)
{"ts": "2025-10-23T17:21:09Z", "run": "328ee5954ea34c76830340b8d2ddded5", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.18720899993240891, "p50": 0.19000999986928946, "p90": 0.1910489997953846, "mean": 0.1901993999581464, "iqr": 0.002129999757016776, "raw_times": [0.19381000015528116, 0.1889190000383678, 0.1910489997953846, 0.19000999986928946, 0.18720899993240891], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.3206559999853198, "peak_bytes": 143131648, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.8014183044433594e-06, "ref": "sdpa_math_fp32"}, "err": null}
{"ts": "2025-10-23T17:21:11Z", "run": "328ee5954ea34c76830340b8d2ddded5", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.1962300000286632, "p50": 0.19820000011350203, "p90": 0.20246899998710433, "mean": 0.19939980002163793, "iqr": 0.00514900011694408, "raw_times": [0.19731999987016025, 0.19820000011350203, 0.20246899998710433, 0.1962300000286632, 0.20278000010875985], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.30226499984564725, "peak_bytes": 147850240, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000347137451171875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
{"ts": "2025-10-23T17:21:13Z", "run": "328ee5954ea34c76830340b8d2ddded5", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2074609999453969, "p50": 0.20947000007254246, "p90": 0.21126000001459033, "mean": 0.2095743999689148, "iqr": 0.0030890000743966084, "raw_times": [0.2115099998718506, 0.21126000001459033, 0.20947000007254246, 0.2074609999453969, 0.20817099994019372], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.31840599990573537, "peak_bytes": 150209536, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000347137451171875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
{"ts": "2025-10-23T17:21:15Z", "run": "328ee5954ea34c76830340b8d2ddded5", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21225999989837874, "p50": 0.21317000005183218, "p90": 0.21427999990919488, "mean": 0.21412619998955051, "iqr": 0.0015599998732795939, "raw_times": [0.21225999989837874, 0.21317000005183218, 0.21427999990919488, 0.2127200000359153, 0.21820100005243148], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.3197160001491284, "peak_bytes": 152568832, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
{"ts": "2025-10-23T17:21:15Z", "run": "328ee5954ea34c76830340b8d2ddded5", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": null, "compile_ms": null, "peak_bytes": null, "ok": false, "absmax": null, "corr": {}, "err": {"type": "FailOnRecompileLimitHit", "msg": "recompile_limit reached with one_graph=True. Excessive recompilations can degrade performance due to the compilation overhead of each recompilation. To monitor recompilations, enable TORCH_LOGS=recompiles. If recompilations are expected, consider increasing torch._dynamo.config.cache_size_limit to an appropriate value."}}
{"ts": "2025-10-23T17:21:15Z", "run": "328ee5954ea34c76830340b8d2ddded5", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": null, "compile_ms": null, "peak_bytes": null, "ok": false, "absmax": null, "corr": {}, "err": {"type": "FailOnRecompileLimitHit", "msg": "recompile_limit reached with one_graph=True. Excessive recompilations can degrade performance due to the compilation overhead of each recompilation. To monitor recompilations, enable TORCH_LOGS=recompiles. If recompilations are expected, consider increasing torch._dynamo.config.cache_size_limit to an appropriate value."}}

flash_attn/impls/cells/benchmark.py
CHANGED

@@ -2,38 +2,36 @@
  # requires-python = ">=3.10"
  # dependencies = [
  #   "numpy",
- #   "torch",
+ #   "torch==2.8.0",
  #   "kernels-benchmark-tools",
- #   "xformers",
  # ]
  #
  # [tool.uv.sources]
- # kernels-benchmark-tools = { path = " ...
+ # kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
  # ///
  import torch
  import sys
  import os
  import kernels_benchmark_tools as kbt
- import xformers.ops as xops


- def ...
- ... (the old xformers attention wrapper body, truncated in this view)
+ def torch_flash(q, k, v):
+     qt, kt, vt = (x.transpose(1, 2).contiguous() for x in (q, k, v))
+     with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.FLASH_ATTENTION):
+         o = torch.nn.functional.scaled_dot_product_attention(qt, kt, vt)
+     return o.transpose(1, 2).contiguous()

  kbt.add(
-     " ...
-     tags={"family": " ...
+     "torch_flash_ma",
+     torch_flash,
+     tags={"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"},
  )

  if __name__ == "__main__":
      device = "cuda" if torch.cuda.is_available() else "cpu"
      dtype = "float32" if device == "cpu" else "bfloat16"

-     # Flux-like workloads
+     # Flux-like workloads scaled down for CPU testing
      base = 1024 if device == "cuda" else 512
      flux_sizes = (
          [128, 256, 320, 384, 448, 512] if device == "cuda" else [64, 128, 192, 256]
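Note: torch_flash takes q, k, v in (batch, seq, heads, head_dim) layout, moves heads in front of the sequence for SDPA, and pins the FLASH backend. A usage sketch with the flux_L128 shape from the records above (1 x 1152 x 24 x 128, bfloat16):

import torch

q, k, v = (torch.randn(1, 1152, 24, 128, device="cuda", dtype=torch.bfloat16)
           for _ in range(3))
out = torch_flash(q, k, v)              # torch_flash as defined in the cell above
assert out.shape == (1, 1152, 24, 128)  # the (B, S, H, D) layout is restored on the way out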
flash_attn/impls/cells/benchmark_default.py
CHANGED

@@ -2,12 +2,12 @@
  # requires-python = ">=3.10"
  # dependencies = [
  #   "numpy",
- #   "torch",
+ #   "torch==2.8.0",
  #   "kernels-benchmark-tools",
  # ]
  #
  # [tool.uv.sources]
- # kernels-benchmark-tools = { path = " ...
+ # kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
  # ///
  import torch
  import sys
flash_attn/impls/cells/benchmark_max_autotune.py
CHANGED

@@ -2,12 +2,12 @@
  # requires-python = ">=3.10"
  # dependencies = [
  #   "numpy",
- #   "torch",
+ #   "torch==2.8.0",
  #   "kernels-benchmark-tools",
  # ]
  #
  # [tool.uv.sources]
- # kernels-benchmark-tools = { ...
+ # kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
  # ///
  import torch
  import sys
flash_attn/impls/compiled_variants.html
CHANGED

@@ -706,6 +706,29 @@
(adds the same .cell-stdout / .stdout-text CSS rules shown above for the activation pages)

@@ -3556,7 +3579,12 @@
  if(output){
    output.classList.remove('output-stale');
    let html='';
- if(data.stdout) ...
+ (replaced by the same stdout-rendering block as above: escapeHtml(data.stdout) wrapped in .cell-stdout / .stdout-text)
  console.log('UV Logs:', data);
  if(data.stderr) {
    // Split UV logs from regular stderr

@@ -3678,27 +3706,27 @@
  }
  }

- // Live reload functionality (robust SSE handling)
- (function(){
- ... (SSE body, truncated in this view)
- })();
+ (the same commented-out SSE block as above)


  document.addEventListener('DOMContentLoaded', function() {

@@ -3829,7 +3857,7 @@
  <div class="system-info">
    <div class="system-info-header">Generated on:</div>
    <div class="system-info-content">
-     Linux x86_64 | Linux-5.10.244-240. ...
+     Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
    </div>
  </div>

@@ -3837,14 +3865,14 @@
  <h1>Torch Compile Variants!</h1>
  <p>This file benchmarks Flash Attention with different torch.compile modes.</p>
  <h2>Flash Attention with torch.compile(mode="default")</h2>
- <div class="cell ...
+ <div class="cell" id="cell-benchmark_default">
  <div class="cell-header">
  <span class="collapse-indicators">
  <span onclick="toggleCode('benchmark_default')" style="cursor: pointer;">▼ code</span>
  <span onclick="toggleOutput('benchmark_default')" style="cursor: pointer;">▼ output</span>
- <span id="uv-indicator-benchmark_default" style="cursor: ...
+ <span id="uv-indicator-benchmark_default" onclick="toggleUvLogsFromHeader('benchmark_default')" style="cursor: pointer;">▶ uv-logs</span>
  </span> |
- Cell: benchmark_default | ...
+ Cell: benchmark_default | 12.08s
  | <button class="run-btn" onclick="runCell('benchmark_default')">▶ run</button>
  <button class="copy-btn" onclick="copyCell('benchmark_default')">Copy</button>
  <a href="cells/benchmark_default.py" target="_blank" class="raw-btn">Raw</a>

@@ -3856,12 +3884,12 @@ Cell: benchmark_default | 0.02s | FAILED
  # requires-python = ">=3.10"
  # dependencies = [
  #   "numpy",
- #   "torch",
+ #   "torch==2.8.0",
  #   "kernels-benchmark-tools",
  # ]
  #
  # [tool.uv.sources]
- # kernels-benchmark-tools = { path = " ...
+ # kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
  # ///
  import torch
  import sys

@@ -3929,14 +3957,291 @@ Cell: benchmark_default | 0.02s | FAILED
  </div>
  </div>
  <div id="output-benchmark_default" class="cell-output">
- <div class="cell- ... (the old 0.02s FAILED output, three truncated lines, removed)
  (later context in this hunk: closing </div>s, <h2>Flash Attention with torch.compile(mode="max-autotune")</h2>, </body>)
+ <div class="cell-stdout"><pre class="stdout-text">
+ ======================================================================
+ PROFILE TRACE: torch_flash_compiled_default | flux_L128
+ ======================================================================
+ -------------------------------------------------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+ -------------------------------------------------------
+ torch_flash_compiled_default 0.00% 0.000us 0.00% 0.000us 0.000us 967.332us 298.12% 967.332us 967.332us 1
+ torch_flash_compiled_default 5.37% 154.798us 99.77% 2.878ms 2.878ms 0.000us 0.00% 324.481us 324.481us 1
+ Torch-Compiled Region: 0/1 20.96% 604.478us 92.49% 2.668ms 889.236us 0.000us 0.00% 324.481us 108.160us 3
+ aten::_scaled_dot_product_flash_attention 1.54% 44.432us 8.35% 240.853us 80.284us 0.000us 0.00% 276.257us 92.086us 3
+ aten::_flash_attention_forward 1.64% 47.371us 5.29% 152.657us 50.886us 276.257us 85.14% 276.257us 92.086us 3
+ void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 276.257us 85.14% 276.257us 92.086us 3
+ triton_poi_fused__scaled_dot_product_flash_attention... 3.50% 100.807us 6.04% 174.309us 19.368us 36.704us 11.31% 36.704us 4.078us 9
+ triton_poi_fused__scaled_dot_product_flash_attention... 0.00% 0.000us 0.00% 0.000us 0.000us 36.704us 11.31% 36.704us 4.078us 9
+ triton_poi_fused_clone_1 1.27% 36.672us 2.17% 62.583us 20.861us 11.520us 3.55% 11.520us 3.840us 3
+ triton_poi_fused_clone_1 0.00% 0.000us 0.00% 0.000us 0.000us 11.520us 3.55% 11.520us 3.840us 3
+ TorchDynamo Cache Lookup 1.91% 55.093us 1.91% 55.093us 18.364us 0.000us 0.00% 0.000us 0.000us 3
+ Pregraph bytecode 0.36% 10.400us 0.36% 10.400us 3.467us 0.000us 0.00% 0.000us 0.000us 3
+ AOTDispatcher Runtime Wrapper Prologue 0.70% 20.280us 0.70% 20.280us 6.760us 0.000us 0.00% 0.000us 0.000us 3
+ Activity Buffer Request 53.91% 1.555ms 53.91% 1.555ms 1.555ms 0.000us 0.00% 0.000us 0.000us 1
+ cuLaunchKernel 3.45% 99.413us 3.45% 99.413us 8.284us 0.000us 0.00% 0.000us 0.000us 12
+ aten::transpose 1.19% 34.395us 1.52% 43.764us 3.647us 0.000us 0.00% 0.000us 0.000us 12
+ aten::as_strided 0.32% 9.369us 0.32% 9.369us 0.781us 0.000us 0.00% 0.000us 0.000us 12
+ aten::empty_like 0.44% 12.621us 1.20% 34.732us 11.577us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 0.77% 22.111us 0.77% 22.111us 7.370us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty 1.24% 35.841us 1.24% 35.841us 2.987us 0.000us 0.00% 0.000us 0.000us 12
+ -------------------------------------------------------
+ Self CPU time total: 2.884ms
+ Self CUDA time total: 324.481us
+
+
+ ======================================================================
+ PROFILE TRACE: torch_flash_compiled_default | flux_L256
+ ======================================================================
+ -------------------------------------------------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+ -------------------------------------------------------
+ torch_flash_compiled_default 0.00% 0.000us 0.00% 0.000us 0.000us 834.378us 233.60% 834.378us 834.378us 1
+ torch_flash_compiled_default 4.04% 97.294us 99.68% 2.400ms 2.400ms 0.000us 0.00% 357.190us 357.190us 1
+ Torch-Compiled Region: 0/3 19.97% 480.803us 94.43% 2.274ms 757.987us 0.000us 0.00% 357.190us 119.063us 3
+ aten::_scaled_dot_product_flash_attention 1.08% 25.983us 7.33% 176.640us 58.880us 0.000us 0.00% 300.165us 100.055us 3
+ aten::_flash_attention_forward 1.50% 36.164us 5.01% 120.717us 40.239us 300.165us 84.04% 300.165us 100.055us 3
+ void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 300.165us 84.04% 300.165us 100.055us 3
+ triton_poi_fused__scaled_dot_product_flash_attention... 3.30% 79.496us 6.27% 150.937us 16.771us 40.161us 11.24% 40.161us 4.462us 9
+ triton_poi_fused__scaled_dot_product_flash_attention... 0.00% 0.000us 0.00% 0.000us 0.000us 40.161us 11.24% 40.161us 4.462us 9
+ triton_poi_fused_clone_1 2.33% 56.123us 3.38% 81.404us 27.135us 16.864us 4.72% 16.864us 5.621us 3
+ triton_poi_fused_clone_1 0.00% 0.000us 0.00% 0.000us 0.000us 16.864us 4.72% 16.864us 5.621us 3
+ TorchDynamo Cache Lookup 1.21% 29.133us 1.21% 29.133us 9.711us 0.000us 0.00% 0.000us 0.000us 3
+ Pregraph bytecode 0.32% 7.730us 0.32% 7.730us 2.577us 0.000us 0.00% 0.000us 0.000us 3
+ AOTDispatcher Runtime Wrapper Prologue 0.49% 11.750us 0.49% 11.750us 3.917us 0.000us 0.00% 0.000us 0.000us 3
+ Activity Buffer Request 56.67% 1.365ms 56.67% 1.365ms 1.365ms 0.000us 0.00% 0.000us 0.000us 1
+ cuLaunchKernel 4.02% 96.722us 4.02% 96.722us 8.060us 0.000us 0.00% 0.000us 0.000us 12
+ aten::transpose 0.90% 21.580us 1.24% 29.940us 2.495us 0.000us 0.00% 0.000us 0.000us 12
+ aten::as_strided 0.35% 8.360us 0.35% 8.360us 0.697us 0.000us 0.00% 0.000us 0.000us 12
+ aten::empty_like 0.27% 6.480us 1.00% 23.971us 7.990us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 0.73% 17.491us 0.73% 17.491us 5.830us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty 1.24% 29.800us 1.24% 29.800us 2.483us 0.000us 0.00% 0.000us 0.000us 12
+ -------------------------------------------------------
+ Self CPU time total: 2.408ms
+ Self CUDA time total: 357.190us
+
+
+ ======================================================================
+ PROFILE TRACE: torch_flash_compiled_default | flux_L320
+ ======================================================================
+ -------------------------------------------------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+ -------------------------------------------------------
+ torch_flash_compiled_default 0.00% 0.000us 0.00% 0.000us 0.000us 876.295us 230.02% 876.295us 876.295us 1
+ torch_flash_compiled_default 3.99% 99.235us 99.67% 2.477ms 2.477ms 0.000us 0.00% 380.963us 380.963us 1
+ Torch-Compiled Region: 0/5 19.71% 489.623us 94.50% 2.348ms 782.708us 0.000us 0.00% 380.963us 126.988us 3
+ aten::_scaled_dot_product_flash_attention 1.15% 28.583us 7.58% 188.458us 62.819us 0.000us 0.00% 323.107us 107.702us 3
+ aten::_flash_attention_forward 1.61% 40.110us 5.06% 125.615us 41.872us 323.107us 84.81% 323.107us 107.702us 3
+ void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 323.107us 84.81% 323.107us 107.702us 3
+ triton_poi_fused__scaled_dot_product_flash_attention... 3.47% 86.344us 6.19% 153.807us 17.090us 44.448us 11.67% 44.448us 4.939us 9
+ triton_poi_fused__scaled_dot_product_flash_attention... 0.00% 0.000us 0.00% 0.000us 0.000us 44.448us 11.67% 44.448us 4.939us 9
+ triton_poi_fused_clone_1 1.44% 35.902us 2.40% 59.634us 19.878us 13.408us 3.52% 13.408us 4.469us 3
+ triton_poi_fused_clone_1 0.00% 0.000us 0.00% 0.000us 0.000us 13.408us 3.52% 13.408us 4.469us 3
+ TorchDynamo Cache Lookup 1.18% 29.223us 1.18% 29.223us 9.741us 0.000us 0.00% 0.000us 0.000us 3
+ Pregraph bytecode 0.30% 7.450us 0.30% 7.450us 2.483us 0.000us 0.00% 0.000us 0.000us 3
+ AOTDispatcher Runtime Wrapper Prologue 0.46% 11.502us 0.46% 11.502us 3.834us 0.000us 0.00% 0.000us 0.000us 3
+ Activity Buffer Request 57.86% 1.438ms 57.86% 1.438ms 1.438ms 0.000us 0.00% 0.000us 0.000us 1
+ cuLaunchKernel 3.67% 91.195us 3.67% 91.195us 7.600us 0.000us 0.00% 0.000us 0.000us 12
+ aten::transpose 0.95% 23.681us 1.38% 34.260us 2.855us 0.000us 0.00% 0.000us 0.000us 12
+ aten::as_strided 0.43% 10.579us 0.43% 10.579us 0.882us 0.000us 0.00% 0.000us 0.000us 12
+ aten::empty_like 0.27% 6.811us 0.93% 23.051us 7.684us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 0.65% 16.240us 0.65% 16.240us 5.413us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty 1.30% 32.232us 1.30% 32.232us 2.686us 0.000us 0.00% 0.000us 0.000us 12
+ -------------------------------------------------------
+ Self CPU time total: 2.485ms
+ Self CUDA time total: 380.963us
+
+
+ ======================================================================
+ PROFILE TRACE: torch_flash_compiled_default | flux_L384
+ ======================================================================
+ -------------------------------------------------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+ -------------------------------------------------------
+ torch_flash_compiled_default 0.00% 0.000us 0.00% 0.000us 0.000us 900.385us 224.95% 900.385us 900.385us 1
+ torch_flash_compiled_default 3.56% 101.756us 99.74% 2.848ms 2.848ms 0.000us 0.00% 400.258us 400.258us 1
+ Torch-Compiled Region: 0/7 18.27% 521.655us 95.19% 2.718ms 906.103us 0.000us 0.00% 400.258us 133.419us 3
+ aten::_scaled_dot_product_flash_attention 0.99% 28.253us 6.33% 180.729us 60.243us 0.000us 0.00% 336.352us 112.117us 3
+ aten::_flash_attention_forward 1.29% 36.890us 4.19% 119.565us 39.855us 336.352us 84.03% 336.352us 112.117us 3
+ void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 336.352us 84.03% 336.352us 112.117us 3
+ triton_poi_fused__scaled_dot_product_flash_attention... 3.07% 87.777us 16.12% 460.302us 51.145us 49.985us 12.49% 49.985us 5.554us 9
+ triton_poi_fused__scaled_dot_product_flash_attention... 0.00% 0.000us 0.00% 0.000us 0.000us 49.985us 12.49% 49.985us 5.554us 9
+ triton_poi_fused_clone_1 1.24% 35.330us 2.05% 58.492us 19.497us 13.921us 3.48% 13.921us 4.640us 3
+ triton_poi_fused_clone_1 0.00% 0.000us 0.00% 0.000us 0.000us 13.921us 3.48% 13.921us 4.640us 3
+ TorchDynamo Cache Lookup 0.99% 28.213us 0.99% 28.213us 9.404us 0.000us 0.00% 0.000us 0.000us 3
+ Pregraph bytecode 0.25% 7.170us 0.25% 7.170us 2.390us 0.000us 0.00% 0.000us 0.000us 3
+ AOTDispatcher Runtime Wrapper Prologue 0.43% 12.361us 0.43% 12.361us 4.120us 0.000us 0.00% 0.000us 0.000us 3
+ Activity Buffer Request 51.74% 1.478ms 51.74% 1.478ms 1.478ms 0.000us 0.00% 0.000us 0.000us 1
+ cuLaunchKernel 13.86% 395.687us 13.86% 395.687us 32.974us 0.000us 0.00% 0.000us 0.000us 12
+ aten::transpose 0.83% 23.691us 1.15% 32.911us 2.743us 0.000us 0.00% 0.000us 0.000us 12
+ aten::as_strided 0.32% 9.220us 0.32% 9.220us 0.768us 0.000us 0.00% 0.000us 0.000us 12
+ aten::empty_like 0.23% 6.600us 0.78% 22.311us 7.437us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 0.55% 15.711us 0.55% 15.711us 5.237us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty 1.03% 29.502us 1.03% 29.502us 2.459us 0.000us 0.00% 0.000us 0.000us 12
+ -------------------------------------------------------
+ Self CPU time total: 2.856ms
+ Self CUDA time total: 400.258us
+
+
+ impl wl p50(ms) ok
+ torch_flash_compiled_default flux_L128 0.20 True
+ torch_flash_compiled_default flux_L256 0.23 True
+ torch_flash_compiled_default flux_L320 0.24 True
+ torch_flash_compiled_default flux_L384 0.24 True
+ torch_flash_compiled_default flux_L448 FAIL False
+ Error: recompile_limit reached with one_graph=True. Excessive recompilations can degrade performance due to the compilation overhead of each recompilation. To monitor recompilations, enable TORCH_LOGS=recompiles. If recompilations are expected, consider increasing torch._dynamo.config.cache_size_limit to an appropriate value.
+ torch_flash_compiled_default flux_L512 FAIL False
+ Error: recompile_limit reached with one_graph=True. Excessive recompilations can degrade performance due to the compilation overhead of each recompilation. To monitor recompilations, enable TORCH_LOGS=recompiles. If recompilations are expected, consider increasing torch._dynamo.config.cache_size_limit to an appropriate value.
Error: recompile_limit reached with one_graph=True. Excessive recompilations can degrade performance due to the compilation overhead of each recompilation. To monitor recompilations, enable TORCH_LOGS=recompiles. If recompilations are expected, consider increasing torch._dynamo.config.cache_size_limit to an appropriate value.
|
| 4097 |
+
</pre></div>
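The flux_L448 and flux_L512 failures above are shape-driven: with dynamic=False, Dynamo specializes one graph per sequence length and, with fullgraph enabled, raises once config.recompile_limit (8) is exhausted. A minimal sketch of the two usual workarounds, assuming the torch_flash_base function these cells compile; neither is applied by the harness as published:

import torch

# Option 1: raise the limit so more per-shape graphs are cached before Dynamo gives up.
torch._dynamo.config.cache_size_limit = 64

# Option 2: compile once with dynamic shapes so a varying seq_len reuses one graph.
compiled_dynamic = torch.compile(torch_flash_base, fullgraph=True, dynamic=True)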
|
| 4098 |
+
<div class="uv-install-logs" id="uv-logs-benchmark_default">
|
| 4099 |
+
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4100 |
+
<div class="uv-logs-content" style="display: none;">
|
| 4101 |
+
Installed 37 packages in 247ms
|
| 4102 |
+
</div>
|
| 4103 |
+
</div>
|
| 4104 |
+
<div class="cell-stderr">W1023 17:21:27.942000 6833 torch/_dynamo/convert_frame.py:1016] [0/8] torch._dynamo hit config.recompile_limit (8)
|
| 4105 |
+
W1023 17:21:27.942000 6833 torch/_dynamo/convert_frame.py:1016] [0/8] function: 'torch_flash_base' (/__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cells/benchmark_default.py:18)
|
| 4106 |
+
W1023 17:21:27.942000 6833 torch/_dynamo/convert_frame.py:1016] [0/8] last reason: 0/7: GLOBAL_STATE changed: num_threads
|
| 4107 |
+
W1023 17:21:27.942000 6833 torch/_dynamo/convert_frame.py:1016] [0/8] To log all recompilation reasons, use TORCH_LOGS="recompiles".
|
| 4108 |
+
W1023 17:21:27.942000 6833 torch/_dynamo/convert_frame.py:1016] [0/8] To diagnose recompilation issues, see https://pytorch.org/docs/main/torch.compiler_troubleshooting.html.
|
| 4109 |
+
W1023 17:21:27.948000 6833 torch/_dynamo/convert_frame.py:1016] [0/9] torch._dynamo hit config.recompile_limit (8)
|
| 4110 |
+
W1023 17:21:27.948000 6833 torch/_dynamo/convert_frame.py:1016] [0/9] function: 'torch_flash_base' (/__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cells/benchmark_default.py:18)
|
| 4111 |
+
W1023 17:21:27.948000 6833 torch/_dynamo/convert_frame.py:1016] [0/9] last reason: 0/7: GLOBAL_STATE changed: num_threads
|
| 4112 |
+
W1023 17:21:27.948000 6833 torch/_dynamo/convert_frame.py:1016] [0/9] To log all recompilation reasons, use TORCH_LOGS="recompiles".
|
| 4113 |
+
W1023 17:21:27.948000 6833 torch/_dynamo/convert_frame.py:1016] [0/9] To diagnose recompilation issues, see https://pytorch.org/docs/main/torch.compiler_troubleshooting.html.</div>
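Note the guard that actually failed here is GLOBAL_STATE changed: num_threads, not a tensor-shape guard: the intra-op thread count changed between calls, invalidating the compiled frame. A small sketch, assuming the harness could pin it once before the first compiled call:

import torch

# Fix the intra-op thread count up front so the num_threads global-state guard
# recorded at compile time stays valid across later benchmark iterations.
torch.set_num_threads(8)  # the specific value is an illustrative choice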
|
| 4114 |
+
<div class="cell-artifacts">
|
| 4115 |
+
<h4>Artifacts:</h4>
|
| 4116 |
+
<a href="artifacts/benchmark_default/attn_default.jsonl" class="artifact" target="_blank">attn_default.jsonl</a>
|
| 4117 |
</div>
|
| 4118 |
</div>
|
| 4119 |
</div>
|
| 4120 |
|
| 4121 |
<h2>Flash Attention with torch.compile(mode="max-autotune")</h2>
|
| 4122 |
+
<div class="cell" id="cell-benchmark_max_autotune">
|
| 4123 |
+
<div class="cell-header">
|
| 4124 |
+
<span class="collapse-indicators">
|
| 4125 |
+
<span onclick="toggleCode('benchmark_max_autotune')" style="cursor: pointer;">▼ code</span>
|
| 4126 |
+
<span onclick="toggleOutput('benchmark_max_autotune')" style="cursor: pointer;">▼ output</span>
|
| 4127 |
+
<span id="uv-indicator-benchmark_max_autotune" onclick="toggleUvLogsFromHeader('benchmark_max_autotune')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4128 |
+
</span> |
|
| 4129 |
+
Cell: benchmark_max_autotune | 18.98s
|
| 4130 |
+
| <button class="run-btn" onclick="runCell('benchmark_max_autotune')">▶ run</button>
|
| 4131 |
+
<button class="copy-btn" onclick="copyCell('benchmark_max_autotune')">Copy</button>
|
| 4132 |
+
<a href="cells/benchmark_max_autotune.py" target="_blank" class="raw-btn">Raw</a>
|
| 4133 |
+
<a href="https://github.com/huggingface/kernels-uvnotes/blob/main/flash_attn/impls/compiled_variants.md" target="_blank" class="github-btn">GitHub</a>
|
| 4134 |
+
</div>
|
| 4135 |
+
<div id="code-benchmark_max_autotune" class="cell-code" data-lines="70">
|
| 4136 |
+
<div class="code-wrap">
|
| 4137 |
+
<div class="highlight"><pre><span></span><span class="c1"># /// script</span>
|
| 4138 |
+
<span class="c1"># requires-python = ">=3.10"</span>
|
| 4139 |
+
<span class="c1"># dependencies = [</span>
|
| 4140 |
+
<span class="c1"># "numpy",</span>
|
| 4141 |
+
<span class="c1"># "torch==2.8.0",</span>
|
| 4142 |
+
<span class="c1"># "kernels-benchmark-tools",</span>
|
| 4143 |
+
<span class="c1"># ]</span>
|
| 4144 |
+
<span class="c1">#</span>
|
| 4145 |
+
<span class="c1"># [tool.uv.sources]</span>
|
| 4146 |
+
<span class="c1"># kernels-benchmark-tools = { path = "../../../../../tools", editable = true }</span>
|
| 4147 |
+
<span class="c1"># ///</span>
|
| 4148 |
+
<span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
|
| 4149 |
+
<span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
|
| 4150 |
+
<span class="kn">import</span><span class="w"> </span><span class="nn">os</span>
|
| 4151 |
+
<span class="kn">import</span><span class="w"> </span><span class="nn">kernels_benchmark_tools</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="nn">kbt</span>
|
| 4152 |
+
|
| 4153 |
+
|
| 4154 |
+
<span class="k">def</span><span class="w"> </span><span class="nf">torch_flash_base</span><span class="p">(</span><span class="n">q</span><span class="p">,</span> <span class="n">k</span><span class="p">,</span> <span class="n">v</span><span class="p">):</span>
|
| 4155 |
+
<span class="n">qt</span><span class="p">,</span> <span class="n">kt</span><span class="p">,</span> <span class="n">vt</span> <span class="o">=</span> <span class="p">(</span><span class="n">x</span><span class="o">.</span><span class="n">transpose</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">)</span><span class="o">.</span><span class="n">contiguous</span><span class="p">()</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="p">(</span><span class="n">q</span><span class="p">,</span> <span class="n">k</span><span class="p">,</span> <span class="n">v</span><span class="p">))</span>
|
| 4156 |
+
<span class="k">with</span> <span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">attention</span><span class="o">.</span><span class="n">sdpa_kernel</span><span class="p">(</span><span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">attention</span><span class="o">.</span><span class="n">SDPBackend</span><span class="o">.</span><span class="n">FLASH_ATTENTION</span><span class="p">):</span>
|
| 4157 |
+
<span class="n">o</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">functional</span><span class="o">.</span><span class="n">scaled_dot_product_attention</span><span class="p">(</span><span class="n">qt</span><span class="p">,</span> <span class="n">kt</span><span class="p">,</span> <span class="n">vt</span><span class="p">)</span>
|
| 4158 |
+
<span class="k">return</span> <span class="n">o</span><span class="o">.</span><span class="n">transpose</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">)</span><span class="o">.</span><span class="n">contiguous</span><span class="p">()</span>
|
| 4159 |
+
|
| 4160 |
+
|
| 4161 |
+
<span class="c1"># Compile with max-autotune mode</span>
|
| 4162 |
+
<span class="n">compiled_flash_max_autotune</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">compile</span><span class="p">(</span><span class="n">torch_flash_base</span><span class="p">,</span> <span class="n">mode</span><span class="o">=</span><span class="s2">"max-autotune"</span><span class="p">,</span> <span class="n">fullgraph</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">dynamic</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span>
|
| 4163 |
+
|
| 4164 |
+
<span class="n">kbt</span><span class="o">.</span><span class="n">add</span><span class="p">(</span>
|
| 4165 |
+
<span class="s2">"torch_flash_compiled_max_autotune"</span><span class="p">,</span>
|
| 4166 |
+
<span class="n">compiled_flash_max_autotune</span><span class="p">,</span>
|
| 4167 |
+
<span class="n">tags</span><span class="o">=</span><span class="p">{</span><span class="s2">"family"</span><span class="p">:</span> <span class="s2">"torch-sdpa"</span><span class="p">,</span> <span class="s2">"backend"</span><span class="p">:</span> <span class="s2">"FLASH"</span><span class="p">,</span> <span class="s2">"compile"</span><span class="p">:</span> <span class="s2">"max-autotune"</span><span class="p">},</span>
|
| 4168 |
+
<span class="p">)</span>
|
| 4169 |
+
|
| 4170 |
+
<span class="k">if</span> <span class="vm">__name__</span> <span class="o">==</span> <span class="s2">"__main__"</span><span class="p">:</span>
|
| 4171 |
+
<span class="n">device</span> <span class="o">=</span> <span class="s2">"cuda"</span> <span class="k">if</span> <span class="n">torch</span><span class="o">.</span><span class="n">cuda</span><span class="o">.</span><span class="n">is_available</span><span class="p">()</span> <span class="k">else</span> <span class="s2">"cpu"</span>
|
| 4172 |
+
<span class="n">dtype</span> <span class="o">=</span> <span class="s2">"float32"</span> <span class="k">if</span> <span class="n">device</span> <span class="o">==</span> <span class="s2">"cpu"</span> <span class="k">else</span> <span class="s2">"bfloat16"</span>
|
| 4173 |
+
|
| 4174 |
+
<span class="c1"># Flux-like workloads</span>
|
| 4175 |
+
<span class="n">base</span> <span class="o">=</span> <span class="mi">1024</span> <span class="k">if</span> <span class="n">device</span> <span class="o">==</span> <span class="s2">"cuda"</span> <span class="k">else</span> <span class="mi">512</span>
|
| 4176 |
+
<span class="n">flux_sizes</span> <span class="o">=</span> <span class="p">(</span>
|
| 4177 |
+
<span class="p">[</span><span class="mi">128</span><span class="p">,</span> <span class="mi">256</span><span class="p">,</span> <span class="mi">320</span><span class="p">,</span> <span class="mi">384</span><span class="p">,</span> <span class="mi">448</span><span class="p">,</span> <span class="mi">512</span><span class="p">]</span> <span class="k">if</span> <span class="n">device</span> <span class="o">==</span> <span class="s2">"cuda"</span> <span class="k">else</span> <span class="p">[</span><span class="mi">64</span><span class="p">,</span> <span class="mi">128</span><span class="p">,</span> <span class="mi">192</span><span class="p">,</span> <span class="mi">256</span><span class="p">]</span>
|
| 4178 |
+
<span class="p">)</span>
|
| 4179 |
+
<span class="n">heads</span> <span class="o">=</span> <span class="mi">24</span> <span class="k">if</span> <span class="n">device</span> <span class="o">==</span> <span class="s2">"cuda"</span> <span class="k">else</span> <span class="mi">8</span>
|
| 4180 |
+
<span class="n">head_dim</span> <span class="o">=</span> <span class="mi">128</span> <span class="k">if</span> <span class="n">device</span> <span class="o">==</span> <span class="s2">"cuda"</span> <span class="k">else</span> <span class="mi">64</span>
|
| 4181 |
+
|
| 4182 |
+
<span class="n">wl</span> <span class="o">=</span> <span class="p">[]</span>
|
| 4183 |
+
<span class="k">for</span> <span class="n">L</span> <span class="ow">in</span> <span class="n">flux_sizes</span><span class="p">:</span>
|
| 4184 |
+
<span class="n">wl</span><span class="o">.</span><span class="n">append</span><span class="p">(</span>
|
| 4185 |
+
<span class="p">{</span>
|
| 4186 |
+
<span class="s2">"name"</span><span class="p">:</span> <span class="sa">f</span><span class="s2">"flux_L</span><span class="si">{</span><span class="n">L</span><span class="si">}</span><span class="s2">"</span><span class="p">,</span>
|
| 4187 |
+
<span class="s2">"batch"</span><span class="p">:</span> <span class="mi">1</span><span class="p">,</span>
|
| 4188 |
+
<span class="s2">"seq_len"</span><span class="p">:</span> <span class="n">base</span> <span class="o">+</span> <span class="n">L</span><span class="p">,</span>
|
| 4189 |
+
<span class="s2">"heads"</span><span class="p">:</span> <span class="n">heads</span><span class="p">,</span>
|
| 4190 |
+
<span class="s2">"head_dim"</span><span class="p">:</span> <span class="n">head_dim</span><span class="p">,</span>
|
| 4191 |
+
<span class="s2">"dtype"</span><span class="p">:</span> <span class="n">dtype</span><span class="p">,</span>
|
| 4192 |
+
<span class="s2">"device"</span><span class="p">:</span> <span class="n">device</span><span class="p">,</span>
|
| 4193 |
+
<span class="s2">"seed"</span><span class="p">:</span> <span class="mi">0</span><span class="p">,</span>
|
| 4194 |
+
<span class="p">}</span>
|
| 4195 |
+
<span class="p">)</span>
|
| 4196 |
+
|
| 4197 |
+
<span class="n">kbt</span><span class="o">.</span><span class="n">run</span><span class="p">(</span>
|
| 4198 |
+
<span class="n">wl</span><span class="p">,</span>
|
| 4199 |
+
<span class="n">jsonl</span><span class="o">=</span><span class="s2">"attn_max_autotune.jsonl"</span><span class="p">,</span>
|
| 4200 |
+
<span class="n">reps</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span>
|
| 4201 |
+
<span class="n">warmup</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span>
|
| 4202 |
+
<span class="n">gen</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">gen_qkv</span><span class="p">,</span>
|
| 4203 |
+
<span class="n">ref</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">ref_math</span><span class="p">,</span>
|
| 4204 |
+
<span class="n">cmp</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">cmp_allclose</span><span class="p">,</span>
|
| 4205 |
+
<span class="p">)</span>
|
| 4206 |
+
<span class="n">kbt</span><span class="o">.</span><span class="n">summarize</span><span class="p">([</span><span class="s2">"attn_max_autotune.jsonl"</span><span class="p">])</span>
|
| 4207 |
+
</pre></div>
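For reference, a minimal standalone call of the variant compiled above, in the (batch, seq_len, heads, head_dim) layout the workload generator produces; the literal shape corresponds to flux_L128 (seq_len = 1024 + 128) and is illustrative only:

import torch

q = torch.randn(1, 1152, 24, 128, device="cuda", dtype=torch.bfloat16)
k, v = torch.randn_like(q), torch.randn_like(q)

out = compiled_flash_max_autotune(q, k, v)  # first call autotunes; repeat calls hit the cache
print(out.shape)  # torch.Size([1, 1152, 24, 128])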
|
| 4208 |
+
|
| 4209 |
+
<div class="code-line-highlight" id="line-highlight-benchmark_max_autotune"></div>
|
| 4210 |
+
</div>
|
| 4211 |
+
</div>
|
| 4212 |
+
<div id="output-benchmark_max_autotune" class="cell-output">
|
| 4213 |
+
<div class="cell-stdout"><pre class="stdout-text">impl wl p50(ms) ok
|
| 4214 |
+
torch_flash_compiled_max_autotune flux_L128 0.19 True
|
| 4215 |
+
torch_flash_compiled_max_autotune flux_L256 0.20 True
|
| 4216 |
+
torch_flash_compiled_max_autotune flux_L320 0.21 True
|
| 4217 |
+
torch_flash_compiled_max_autotune flux_L384 0.21 True
|
| 4218 |
+
torch_flash_compiled_max_autotune flux_L448 FAIL False
|
| 4219 |
+
Error: recompile_limit reached with one_graph=True. Excessive recompilations can degrade performance due to the compilation overhead of each recompilation. To monitor recompilations, enable TORCH_LOGS=recompiles. If recompilations are expected, consider increasing torch._dynamo.config.cache_size_limit to an appropriate value.
|
| 4220 |
+
torch_flash_compiled_max_autotune flux_L512 FAIL False
|
| 4221 |
+
Error: recompile_limit reached with one_graph=True. Excessive recompilations can degrade performance due to the compilation overhead of each recompilation. To monitor recompilations, enable TORCH_LOGS=recompiles. If recompilations are expected, consider increasing torch._dynamo.config.cache_size_limit to an appropriate value.
|
| 4222 |
+
</pre></div>
|
| 4223 |
+
<div class="uv-install-logs" id="uv-logs-benchmark_max_autotune">
|
| 4224 |
+
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4225 |
+
<div class="uv-logs-content" style="display: none;">
|
| 4226 |
+
Installed 37 packages in 208ms
|
| 4227 |
+
</div>
|
| 4228 |
+
</div>
|
| 4229 |
+
<div class="cell-stderr">W1023 17:21:15.860000 6116 torch/_dynamo/convert_frame.py:1016] [0/8] torch._dynamo hit config.recompile_limit (8)
|
| 4230 |
+
W1023 17:21:15.860000 6116 torch/_dynamo/convert_frame.py:1016] [0/8] function: 'torch_flash_base' (/__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cells/benchmark_max_autotune.py:18)
|
| 4231 |
+
W1023 17:21:15.860000 6116 torch/_dynamo/convert_frame.py:1016] [0/8] last reason: 0/7: GLOBAL_STATE changed: num_threads
|
| 4232 |
+
W1023 17:21:15.860000 6116 torch/_dynamo/convert_frame.py:1016] [0/8] To log all recompilation reasons, use TORCH_LOGS="recompiles".
|
| 4233 |
+
W1023 17:21:15.860000 6116 torch/_dynamo/convert_frame.py:1016] [0/8] To diagnose recompilation issues, see https://pytorch.org/docs/main/torch.compiler_troubleshooting.html.
|
| 4234 |
+
W1023 17:21:15.866000 6116 torch/_dynamo/convert_frame.py:1016] [0/9] torch._dynamo hit config.recompile_limit (8)
|
| 4235 |
+
W1023 17:21:15.866000 6116 torch/_dynamo/convert_frame.py:1016] [0/9] function: 'torch_flash_base' (/__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cells/benchmark_max_autotune.py:18)
|
| 4236 |
+
W1023 17:21:15.866000 6116 torch/_dynamo/convert_frame.py:1016] [0/9] last reason: 0/7: GLOBAL_STATE changed: num_threads
|
| 4237 |
+
W1023 17:21:15.866000 6116 torch/_dynamo/convert_frame.py:1016] [0/9] To log all recompilation reasons, use TORCH_LOGS="recompiles".
|
| 4238 |
+
W1023 17:21:15.866000 6116 torch/_dynamo/convert_frame.py:1016] [0/9] To diagnose recompilation issues, see https://pytorch.org/docs/main/torch.compiler_troubleshooting.html.</div>
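An alternative to raising cache_size_limit is declaring up front which dimension varies. A sketch using torch._dynamo.mark_dynamic on the sequence dimension; this presumes compiling without dynamic=False, so it is a deviation from the cell above rather than a drop-in fix:

import torch

compiled = torch.compile(torch_flash_base, mode="max-autotune", fullgraph=True)

q = torch.randn(1, 1152, 24, 128, device="cuda", dtype=torch.bfloat16)
k, v = torch.randn_like(q), torch.randn_like(q)
for t in (q, k, v):
    torch._dynamo.mark_dynamic(t, 1)  # dim 1 is seq_len, the only dimension that changes

out = compiled(q, k, v)  # later seq_len values should reuse this dynamic graph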
|
| 4239 |
+
<div class="cell-artifacts">
|
| 4240 |
+
<h4>Artifacts:</h4>
|
| 4241 |
+
<a href="artifacts/benchmark_max_autotune/attn_max_autotune.jsonl" class="artifact" target="_blank">attn_max_autotune.jsonl</a>
|
| 4242 |
+
</div>
|
| 4243 |
+
</div>
|
| 4244 |
+
</div>
|
| 4245 |
</div>
|
| 4246 |
|
| 4247 |
</body>
|
flash_attn/impls/flash_attention.html
CHANGED
|
@@ -706,6 +706,29 @@
|
|
| 706 |
white-space: pre-wrap;
|
| 707 |
color: var(--text-primary);
|
| 708 |
}
|
| 709 |
.cell-stderr {
|
| 710 |
background: var(--bg-error);
|
| 711 |
border-left: 2px solid var(--border-error);
|
|
@@ -3556,7 +3579,12 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3556 |
if(output){
|
| 3557 |
output.classList.remove('output-stale');
|
| 3558 |
let html='';
|
| 3559 |
-
if(data.stdout)
|
| 3560 |
console.log('UV Logs:', data);
|
| 3561 |
if(data.stderr) {
|
| 3562 |
// Split UV logs from regular stderr
|
|
@@ -3678,27 +3706,27 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3678 |
}
|
| 3679 |
}
|
| 3680 |
|
| 3681 |
-
// Live reload functionality (robust SSE handling)
|
| 3682 |
-
(function(){
|
| 3683 |
-
|
| 3684 |
-
|
| 3685 |
-
|
| 3686 |
-
|
| 3687 |
-
|
| 3688 |
-
|
| 3689 |
-
|
| 3690 |
-
|
| 3691 |
-
|
| 3692 |
-
|
| 3693 |
-
|
| 3694 |
-
|
| 3695 |
-
|
| 3696 |
-
|
| 3697 |
-
|
| 3698 |
-
|
| 3699 |
-
|
| 3700 |
-
|
| 3701 |
-
})();
|
| 3702 |
|
| 3703 |
|
| 3704 |
document.addEventListener('DOMContentLoaded', function() {
|
|
@@ -3829,7 +3857,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3829 |
<div class="system-info">
|
| 3830 |
<div class="system-info-header">Generated on:</div>
|
| 3831 |
<div class="system-info-content">
|
| 3832 |
-
Linux x86_64 | Linux-5.10.244-240.
|
| 3833 |
</div>
|
| 3834 |
</div>
|
| 3835 |
|
|
@@ -3843,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3843 |
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3844 |
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3845 |
</span> |
|
| 3846 |
-
Cell: nv | 0.
|
| 3847 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3848 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3849 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3860,7 +3888,7 @@ Cell: nv | 0.23s
|
|
| 3860 |
</div>
|
| 3861 |
</div>
|
| 3862 |
<div id="output-nv" class="cell-output">
|
| 3863 |
-
<div class="cell-stdout">
|
| 3864 |
+-----------------------------------------------------------------------------------------+
|
| 3865 |
| NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
|
| 3866 |
|-----------------------------------------+------------------------+----------------------+
|
|
@@ -3869,7 +3897,7 @@ Cell: nv | 0.23s
|
|
| 3869 |
| | | MIG M. |
|
| 3870 |
|=========================================+========================+======================|
|
| 3871 |
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
|
| 3872 |
-
| N/A
|
| 3873 |
| | | N/A |
|
| 3874 |
+-----------------------------------------+------------------------+----------------------+
|
| 3875 |
|
|
@@ -3881,19 +3909,19 @@ Cell: nv | 0.23s
|
|
| 3881 |
| No running processes found |
|
| 3882 |
+-----------------------------------------------------------------------------------------+
|
| 3883 |
|
| 3884 |
-
</div>
|
| 3885 |
</div>
|
| 3886 |
</div>
|
| 3887 |
|
| 3888 |
<h2>Flash Attention Benchmark</h2>
|
| 3889 |
-
<div class="cell
|
| 3890 |
<div class="cell-header">
|
| 3891 |
<span class="collapse-indicators">
|
| 3892 |
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
|
| 3893 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3894 |
<span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3895 |
</span> |
|
| 3896 |
-
Cell: benchmark |
|
| 3897 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3898 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3899 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3905,12 +3933,12 @@ Cell: benchmark | 0.01s | FAILED
|
|
| 3905 |
<span class="c1"># requires-python = ">=3.10"</span>
|
| 3906 |
<span class="c1"># dependencies = [</span>
|
| 3907 |
<span class="c1"># "numpy",</span>
|
| 3908 |
-
<span class="c1"># "torch",</span>
|
| 3909 |
<span class="c1"># "kernels-benchmark-tools",</span>
|
| 3910 |
<span class="c1"># ]</span>
|
| 3911 |
<span class="c1">#</span>
|
| 3912 |
<span class="c1"># [tool.uv.sources]</span>
|
| 3913 |
-
<span class="c1"># kernels-benchmark-tools = { path = "
|
| 3914 |
<span class="c1"># ///</span>
|
| 3915 |
<span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
|
| 3916 |
<span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
|
|
@@ -3974,9 +4002,209 @@ Cell: benchmark | 0.01s | FAILED
|
|
| 3974 |
</div>
|
| 3975 |
</div>
|
| 3976 |
<div id="output-benchmark" class="cell-output">
|
| 3977 |
-
<div class="cell-
|
| 3978 |
-
|
| 3979 |
-
|
| 3980 |
</div>
|
| 3981 |
</div>
|
| 3982 |
</div>
|
|
|
|
| 706 |
white-space: pre-wrap;
|
| 707 |
color: var(--text-primary);
|
| 708 |
}
|
| 709 |
+
|
| 710 |
+
.cell-stdout {
|
| 711 |
+
background: var(--bg-tertiary);
|
| 712 |
+
padding: 0.75rem;
|
| 713 |
+
border-radius: 1px;
|
| 714 |
+
font-family: inherit;
|
| 715 |
+
font-size: 0.9rem;
|
| 716 |
+
color: var(--text-primary);
|
| 717 |
+
|
| 718 |
+
/* key bits */
|
| 719 |
+
overflow: auto; /* show scrollbars when needed */
|
| 720 |
+
max-width: 100%; /* respects whatever layout width you give it */
|
| 721 |
+
}
|
| 722 |
+
|
| 723 |
+
.cell-stdout .stdout-text {
|
| 724 |
+
margin: 0; /* reset pre default margin */
|
| 725 |
+
white-space: pre; /* keep line breaks, NO wrapping */
|
| 726 |
+
display: inline-block; /* shrink-to-content */
|
| 727 |
+
min-width: max-content; /* allow very long lines to define intrinsic width */
|
| 728 |
+
font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace;
|
| 729 |
+
tab-size: 2;
|
| 730 |
+
}
|
| 731 |
+
|
| 732 |
.cell-stderr {
|
| 733 |
background: var(--bg-error);
|
| 734 |
border-left: 2px solid var(--border-error);
|
|
|
|
| 3579 |
if(output){
|
| 3580 |
output.classList.remove('output-stale');
|
| 3581 |
let html='';
|
| 3582 |
+
if (data.stdout) {
|
| 3583 |
+
html += '<div class="cell-stdout"><pre class="stdout-text">'
|
| 3584 |
+
+ escapeHtml(data.stdout)
|
| 3585 |
+
+ '</pre></div>';
|
| 3586 |
+
}
|
| 3587 |
+
|
| 3588 |
console.log('UV Logs:', data);
|
| 3589 |
if(data.stderr) {
|
| 3590 |
// Split UV logs from regular stderr
|
|
|
|
| 3706 |
}
|
| 3707 |
}
|
| 3708 |
|
| 3709 |
+
// // Live reload functionality (robust SSE handling)
|
| 3710 |
+
// (function(){
|
| 3711 |
+
// if (!('EventSource' in window)) {
|
| 3712 |
+
// console.warn('SSE not supported in this browser');
|
| 3713 |
+
// return;
|
| 3714 |
+
// }
|
| 3715 |
+
// let source = new EventSource('/events');
|
| 3716 |
+
// let isOpen = false;
|
| 3717 |
+
// source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
|
| 3718 |
+
// source.onmessage = function(e){
|
| 3719 |
+
// const msg=(e.data||'').trim(); if(!msg) return;
|
| 3720 |
+
// console.log('SSE message:', msg);
|
| 3721 |
+
// if (msg==='reload' || msg==='incremental') { location.reload(); }
|
| 3722 |
+
// // Ignore 'loading' to avoid premature reload loops
|
| 3723 |
+
// };
|
| 3724 |
+
// source.onerror = function(e){
|
| 3725 |
+
// // Let EventSource auto-reconnect instead of forcing a reload
|
| 3726 |
+
// if (isOpen) console.warn('SSE error after open, retrying...', e);
|
| 3727 |
+
// };
|
| 3728 |
+
// window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
|
| 3729 |
+
// })();
|
| 3730 |
|
| 3731 |
|
| 3732 |
document.addEventListener('DOMContentLoaded', function() {
|
|
|
|
| 3857 |
<div class="system-info">
|
| 3858 |
<div class="system-info-header">Generated on:</div>
|
| 3859 |
<div class="system-info-content">
|
| 3860 |
+
Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
|
| 3861 |
</div>
|
| 3862 |
</div>
|
| 3863 |
|
|
|
|
| 3871 |
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3872 |
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3873 |
</span> |
|
| 3874 |
+
Cell: nv | 0.21s
|
| 3875 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3876 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3877 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3888 |
</div>
|
| 3889 |
</div>
|
| 3890 |
<div id="output-nv" class="cell-output">
|
| 3891 |
+
<div class="cell-stdout"><pre class="stdout-text">Thu Oct 23 17:22:15 2025
|
| 3892 |
+-----------------------------------------------------------------------------------------+
|
| 3893 |
| NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
|
| 3894 |
|-----------------------------------------+------------------------+----------------------+
|
|
|
|
| 3897 |
| | | MIG M. |
|
| 3898 |
|=========================================+========================+======================|
|
| 3899 |
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
|
| 3900 |
+
| N/A 37C P0 91W / 350W | 0MiB / 46068MiB | 26% Default |
|
| 3901 |
| | | N/A |
|
| 3902 |
+-----------------------------------------+------------------------+----------------------+
|
| 3903 |
|
|
|
|
| 3909 |
| No running processes found |
|
| 3910 |
+-----------------------------------------------------------------------------------------+
|
| 3911 |
|
| 3912 |
+
</pre></div>
|
| 3913 |
</div>
|
| 3914 |
</div>
|
| 3915 |
|
| 3916 |
<h2>Flash Attention Benchmark</h2>
|
| 3917 |
+
<div class="cell" id="cell-benchmark">
|
| 3918 |
<div class="cell-header">
|
| 3919 |
<span class="collapse-indicators">
|
| 3920 |
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
|
| 3921 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3922 |
<span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3923 |
</span> |
|
| 3924 |
+
Cell: benchmark | 3.60s
|
| 3925 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3926 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3927 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3933 |
<span class="c1"># requires-python = ">=3.10"</span>
|
| 3934 |
<span class="c1"># dependencies = [</span>
|
| 3935 |
<span class="c1"># "numpy",</span>
|
| 3936 |
+
<span class="c1"># "torch==2.8.0",</span>
|
| 3937 |
<span class="c1"># "kernels-benchmark-tools",</span>
|
| 3938 |
<span class="c1"># ]</span>
|
| 3939 |
<span class="c1">#</span>
|
| 3940 |
<span class="c1"># [tool.uv.sources]</span>
|
| 3941 |
+
<span class="c1"># kernels-benchmark-tools = { path = "../../../../../tools", editable = true }</span>
|
| 3942 |
<span class="c1"># ///</span>
|
| 3943 |
<span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
|
| 3944 |
<span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
|
|
|
|
| 4002 |
</div>
|
| 4003 |
</div>
|
| 4004 |
<div id="output-benchmark" class="cell-output">
|
| 4005 |
+
<div class="cell-stdout"><pre class="stdout-text">
|
| 4006 |
+
======================================================================
|
| 4007 |
+
PROFILE TRACE: torch_flash_ma | flux_L128
|
| 4008 |
+
======================================================================
|
| 4009 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4010 |
+
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4011 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4012 |
+
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 799.070us 225.43% 799.070us 799.070us 1
|
| 4013 |
+
torch_flash_ma 14.65% 361.148us 99.74% 2.458ms 2.458ms 0.000us 0.00% 362.241us 362.241us 1
|
| 4014 |
+
aten::scaled_dot_product_attention 1.75% 43.042us 9.34% 230.141us 76.714us 0.000us 0.00% 266.207us 88.736us 3
|
| 4015 |
+
aten::_scaled_dot_product_flash_attention 1.09% 26.961us 7.59% 187.099us 62.366us 0.000us 0.00% 266.207us 88.736us 3
|
| 4016 |
+
aten::_flash_attention_forward 1.68% 41.361us 5.54% 136.527us 45.509us 266.207us 75.10% 266.207us 88.736us 3
|
| 4017 |
+
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 266.207us 75.10% 266.207us 88.736us 3
|
| 4018 |
+
aten::contiguous 0.64% 15.860us 72.86% 1.796ms 149.661us 0.000us 0.00% 96.034us 8.003us 12
|
| 4019 |
+
aten::clone 1.71% 42.134us 72.21% 1.780ms 148.339us 0.000us 0.00% 96.034us 8.003us 12
|
| 4020 |
+
aten::copy_ 3.86% 95.153us 66.84% 1.648ms 137.298us 88.258us 24.90% 96.034us 8.003us 12
|
| 4021 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 88.258us 24.90% 88.258us 7.355us 12
|
| 4022 |
+
Activity Buffer Request 58.01% 1.430ms 58.01% 1.430ms 1.430ms 7.776us 2.19% 7.776us 7.776us 1
|
| 4023 |
+
aten::transpose 2.95% 72.712us 3.85% 94.884us 3.954us 0.000us 0.00% 0.000us 0.000us 24
|
| 4024 |
+
aten::as_strided 0.90% 22.172us 0.90% 22.172us 0.924us 0.000us 0.00% 0.000us 0.000us 24
|
| 4025 |
+
aten::empty_like 1.13% 27.832us 4.55% 112.245us 7.483us 0.000us 0.00% 0.000us 0.000us 15
|
| 4026 |
+
aten::empty 4.09% 100.886us 4.09% 100.886us 4.204us 0.000us 0.00% 0.000us 0.000us 24
|
| 4027 |
+
cudaLaunchKernel 5.96% 146.998us 5.96% 146.998us 9.800us 0.000us 0.00% 0.000us 0.000us 15
|
| 4028 |
+
aten::empty_strided 0.65% 15.960us 0.65% 15.960us 5.320us 0.000us 0.00% 0.000us 0.000us 3
|
| 4029 |
+
cudaDeviceGetAttribute 0.12% 2.850us 0.12% 2.850us 0.475us 0.000us 0.00% 0.000us 0.000us 6
|
| 4030 |
+
cudaFuncSetAttribute 0.54% 13.411us 0.54% 13.411us 4.470us 0.000us 0.00% 0.000us 0.000us 3
|
| 4031 |
+
cudaDeviceSynchronize 0.26% 6.530us 0.26% 6.530us 6.530us 0.000us 0.00% 0.000us 0.000us 1
|
| 4032 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4033 |
+
Self CPU time total: 2.465ms
|
| 4034 |
+
Self CUDA time total: 354.465us
|
| 4035 |
+
|
| 4036 |
+
|
| 4037 |
+
|
| 4038 |
+
======================================================================
|
| 4039 |
+
PROFILE TRACE: torch_flash_ma | flux_L256
|
| 4040 |
+
======================================================================
|
| 4041 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4042 |
+
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4043 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4044 |
+
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 680.541us 161.63% 680.541us 680.541us 1
|
| 4045 |
+
torch_flash_ma 11.51% 254.710us 99.74% 2.208ms 2.208ms 0.000us 0.00% 430.783us 430.783us 1
|
| 4046 |
+
aten::scaled_dot_product_attention 1.09% 24.080us 8.33% 184.408us 61.469us 0.000us 0.00% 312.064us 104.021us 3
|
| 4047 |
+
aten::_scaled_dot_product_flash_attention 0.81% 17.821us 7.24% 160.328us 53.443us 0.000us 0.00% 312.064us 104.021us 3
|
| 4048 |
+
aten::_flash_attention_forward 1.85% 41.011us 5.37% 118.956us 39.652us 312.064us 74.11% 312.064us 104.021us 3
|
| 4049 |
+
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 312.064us 74.11% 312.064us 104.021us 3
|
| 4050 |
+
aten::contiguous 0.42% 9.258us 77.80% 1.722ms 143.509us 0.000us 0.00% 118.719us 9.893us 12
|
| 4051 |
+
aten::clone 1.32% 29.284us 77.38% 1.713ms 142.737us 0.000us 0.00% 118.719us 9.893us 12
|
| 4052 |
+
aten::copy_ 3.64% 80.568us 73.02% 1.616ms 134.703us 108.991us 25.89% 118.719us 9.893us 12
|
| 4053 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 108.991us 25.89% 108.991us 9.083us 12
|
| 4054 |
+
Activity Buffer Request 65.56% 1.451ms 65.56% 1.451ms 1.451ms 9.728us 2.31% 9.728us 9.728us 1
|
| 4055 |
+
aten::transpose 2.36% 52.224us 3.17% 70.126us 2.922us 0.000us 0.00% 0.000us 0.000us 24
|
| 4056 |
+
aten::as_strided 0.81% 17.902us 0.81% 17.902us 0.746us 0.000us 0.00% 0.000us 0.000us 24
|
| 4057 |
+
aten::empty_like 0.96% 21.191us 3.98% 88.123us 5.875us 0.000us 0.00% 0.000us 0.000us 15
|
| 4058 |
+
aten::empty 3.58% 79.273us 3.58% 79.273us 3.303us 0.000us 0.00% 0.000us 0.000us 24
|
| 4059 |
+
cudaLaunchKernel 4.85% 107.363us 4.85% 107.363us 7.158us 0.000us 0.00% 0.000us 0.000us 15
|
| 4060 |
+
aten::empty_strided 0.70% 15.410us 0.70% 15.410us 5.137us 0.000us 0.00% 0.000us 0.000us 3
|
| 4061 |
+
cudaDeviceGetAttribute 0.09% 2.071us 0.09% 2.071us 0.345us 0.000us 0.00% 0.000us 0.000us 6
|
| 4062 |
+
cudaFuncSetAttribute 0.20% 4.321us 0.20% 4.321us 1.440us 0.000us 0.00% 0.000us 0.000us 3
|
| 4063 |
+
cudaDeviceSynchronize 0.26% 5.841us 0.26% 5.841us 5.841us 0.000us 0.00% 0.000us 0.000us 1
|
| 4064 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4065 |
+
Self CPU time total: 2.214ms
|
| 4066 |
+
Self CUDA time total: 421.055us
|
| 4067 |
+
|
| 4068 |
+
|
| 4069 |
+
|
| 4070 |
+
======================================================================
|
| 4071 |
+
PROFILE TRACE: torch_flash_ma | flux_L320
|
| 4072 |
+
======================================================================
|
| 4073 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4074 |
+
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4075 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4076 |
+
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 690.203us 159.06% 690.203us 690.203us 1
|
| 4077 |
+
torch_flash_ma 11.42% 254.276us 99.18% 2.209ms 2.209ms 0.000us 0.00% 443.100us 443.100us 1
|
| 4078 |
+
aten::scaled_dot_product_attention 1.09% 24.201us 8.13% 181.079us 60.360us 0.000us 0.00% 330.557us 110.186us 3
|
| 4079 |
+
aten::_scaled_dot_product_flash_attention 0.78% 17.350us 7.04% 156.878us 52.293us 0.000us 0.00% 330.557us 110.186us 3
|
| 4080 |
+
aten::_flash_attention_forward 1.80% 40.093us 5.30% 118.035us 39.345us 330.557us 76.18% 330.557us 110.186us 3
|
| 4081 |
+
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 330.557us 76.18% 330.557us 110.186us 3
|
| 4082 |
+
aten::contiguous 0.42% 9.369us 77.58% 1.728ms 143.991us 0.000us 0.00% 112.543us 9.379us 12
|
| 4083 |
+
aten::clone 1.34% 29.740us 77.16% 1.719ms 143.210us 0.000us 0.00% 112.543us 9.379us 12
|
| 4084 |
+
aten::copy_ 3.81% 84.905us 72.90% 1.624ms 135.305us 103.359us 23.82% 112.543us 9.379us 12
|
| 4085 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 103.359us 23.82% 103.359us 8.613us 12
|
| 4086 |
+
Activity Buffer Request 65.38% 1.456ms 65.38% 1.456ms 1.456ms 9.184us 2.12% 9.184us 9.184us 1
|
| 4087 |
+
aten::transpose 2.26% 50.400us 3.02% 67.214us 2.801us 0.000us 0.00% 0.000us 0.000us 24
|
| 4088 |
+
aten::as_strided 0.75% 16.814us 0.75% 16.814us 0.701us 0.000us 0.00% 0.000us 0.000us 24
|
| 4089 |
+
aten::empty_like 0.96% 21.489us 3.82% 85.044us 5.670us 0.000us 0.00% 0.000us 0.000us 15
|
| 4090 |
+
aten::empty 3.43% 76.464us 3.43% 76.464us 3.186us 0.000us 0.00% 0.000us 0.000us 24
|
| 4091 |
+
cudaLaunchKernel 4.82% 107.405us 4.82% 107.405us 7.160us 0.000us 0.00% 0.000us 0.000us 15
|
| 4092 |
+
aten::empty_strided 0.66% 14.631us 0.66% 14.631us 4.877us 0.000us 0.00% 0.000us 0.000us 3
|
| 4093 |
+
cudaDeviceGetAttribute 0.08% 1.710us 0.08% 1.710us 0.285us 0.000us 0.00% 0.000us 0.000us 6
|
| 4094 |
+
cudaFuncSetAttribute 0.18% 3.930us 0.18% 3.930us 1.310us 0.000us 0.00% 0.000us 0.000us 3
|
| 4095 |
+
cudaDeviceSynchronize 0.82% 18.331us 0.82% 18.331us 18.331us 0.000us 0.00% 0.000us 0.000us 1
|
| 4096 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4097 |
+
Self CPU time total: 2.227ms
|
| 4098 |
+
Self CUDA time total: 433.916us
|
| 4099 |
+
|
| 4100 |
+
|
| 4101 |
+
|
| 4102 |
+
======================================================================
|
| 4103 |
+
PROFILE TRACE: torch_flash_ma | flux_L384
|
| 4104 |
+
======================================================================
|
| 4105 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4106 |
+
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4107 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4108 |
+
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 691.645us 147.68% 691.645us 691.645us 1
|
| 4109 |
+
torch_flash_ma 10.40% 252.243us 99.18% 2.405ms 2.405ms 0.000us 0.00% 481.117us 481.117us 1
|
| 4110 |
+
aten::scaled_dot_product_attention 1.00% 24.352us 7.27% 176.289us 58.763us 0.000us 0.00% 341.277us 113.759us 3
|
| 4111 |
+
aten::_scaled_dot_product_flash_attention 0.73% 17.811us 6.27% 151.937us 50.646us 0.000us 0.00% 341.277us 113.759us 3
|
| 4112 |
+
aten::_flash_attention_forward 1.38% 33.540us 4.54% 110.186us 36.729us 341.277us 72.87% 341.277us 113.759us 3
|
| 4113 |
+
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 341.277us 72.87% 341.277us 113.759us 3
|
| 4114 |
+
aten::contiguous 0.39% 9.522us 79.59% 1.930ms 160.818us 0.000us 0.00% 139.840us 11.653us 12
|
| 4115 |
+
aten::clone 1.25% 30.240us 79.20% 1.920ms 160.024us 0.000us 0.00% 139.840us 11.653us 12
|
| 4116 |
+
aten::copy_ 3.35% 81.274us 75.28% 1.825ms 152.111us 127.072us 27.13% 139.840us 11.653us 12
|
| 4117 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 127.072us 27.13% 127.072us 10.589us 12
|
| 4118 |
+
Activity Buffer Request 59.91% 1.453ms 59.91% 1.453ms 1.453ms 12.768us 2.73% 12.768us 12.768us 1
|
| 4119 |
+
aten::transpose 2.18% 52.871us 2.90% 70.271us 2.928us 0.000us 0.00% 0.000us 0.000us 24
|
| 4120 |
+
aten::as_strided 0.72% 17.400us 0.72% 17.400us 0.725us 0.000us 0.00% 0.000us 0.000us 24
|
| 4121 |
+
aten::empty_like 0.83% 20.083us 3.47% 84.148us 5.610us 0.000us 0.00% 0.000us 0.000us 15
|
| 4122 |
+
aten::empty 3.18% 77.125us 3.18% 77.125us 3.214us 0.000us 0.00% 0.000us 0.000us 24
|
| 4123 |
+
cudaLaunchKernel 13.00% 315.205us 13.00% 315.205us 21.014us 0.000us 0.00% 0.000us 0.000us 15
|
| 4124 |
+
aten::empty_strided 0.61% 14.781us 0.61% 14.781us 4.927us 0.000us 0.00% 0.000us 0.000us 3
|
| 4125 |
+
cudaDeviceGetAttribute 0.07% 1.670us 0.07% 1.670us 0.278us 0.000us 0.00% 0.000us 0.000us 6
|
| 4126 |
+
cudaFuncSetAttribute 0.16% 3.970us 0.16% 3.970us 1.323us 0.000us 0.00% 0.000us 0.000us 3
|
| 4127 |
+
cudaDeviceSynchronize 0.82% 19.911us 0.82% 19.911us 19.911us 0.000us 0.00% 0.000us 0.000us 1
|
| 4128 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4129 |
+
Self CPU time total: 2.425ms
|
| 4130 |
+
Self CUDA time total: 468.349us
|
| 4131 |
+
|
| 4132 |
+
|
| 4133 |
+
|
| 4134 |
+
======================================================================
|
| 4135 |
+
PROFILE TRACE: torch_flash_ma | flux_L448
|
| 4136 |
+
======================================================================
|
| 4137 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4138 |
+
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4139 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4140 |
+
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 799.966us 130.76% 799.966us 799.966us 1
|
| 4141 |
+
torch_flash_ma 12.25% 304.685us 97.28% 2.419ms 2.419ms 0.000us 0.00% 624.638us 624.638us 1
|
| 4142 |
+
aten::scaled_dot_product_attention 0.97% 24.122us 7.38% 183.559us 61.186us 0.000us 0.00% 485.886us 161.962us 3
|
| 4143 |
+
aten::_scaled_dot_product_flash_attention 0.71% 17.700us 6.41% 159.437us 53.146us 0.000us 0.00% 485.886us 161.962us 3
|
| 4144 |
+
aten::_flash_attention_forward 1.59% 39.459us 4.74% 117.796us 39.265us 485.886us 79.42% 485.886us 161.962us 3
|
| 4145 |
+
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 485.886us 79.42% 485.886us 161.962us 3
|
| 4146 |
+
aten::contiguous 0.39% 9.743us 75.79% 1.885ms 157.075us 0.000us 0.00% 138.752us 11.563us 12
|
| 4147 |
+
aten::clone 1.21% 30.098us 75.40% 1.875ms 156.263us 0.000us 0.00% 138.752us 11.563us 12
|
| 4148 |
+
aten::copy_ 3.39% 84.237us 71.41% 1.776ms 147.998us 125.888us 20.58% 138.752us 11.563us 12
|
| 4149 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 125.888us 20.58% 125.888us 10.491us 12
|
| 4150 |
+
Activity Buffer Request 58.51% 1.455ms 58.51% 1.455ms 1.455ms 12.864us 2.10% 12.864us 12.864us 1
|
| 4151 |
+
aten::transpose 2.11% 52.456us 2.81% 69.984us 2.916us 0.000us 0.00% 0.000us 0.000us 24
|
| 4152 |
+
aten::as_strided 0.70% 17.528us 0.70% 17.528us 0.730us 0.000us 0.00% 0.000us 0.000us 24
|
| 4153 |
+
aten::empty_like 0.83% 20.690us 3.57% 88.794us 5.920us 0.000us 0.00% 0.000us 0.000us 15
|
| 4154 |
+
aten::empty 3.29% 81.917us 3.29% 81.917us 3.413us 0.000us 0.00% 0.000us 0.000us 24
|
| 4155 |
+
cudaLaunchKernel 10.48% 260.751us 10.48% 260.751us 17.383us 0.000us 0.00% 0.000us 0.000us 15
|
| 4156 |
+
aten::empty_strided 0.58% 14.540us 0.58% 14.540us 4.847us 0.000us 0.00% 0.000us 0.000us 3
|
| 4157 |
+
cudaDeviceGetAttribute 0.09% 2.170us 0.09% 2.170us 0.362us 0.000us 0.00% 0.000us 0.000us 6
|
| 4158 |
+
cudaFuncSetAttribute 0.16% 3.911us 0.16% 3.911us 1.304us 0.000us 0.00% 0.000us 0.000us 3
|
| 4159 |
+
cudaDeviceSynchronize 2.72% 67.754us 2.72% 67.754us 67.754us 0.000us 0.00% 0.000us 0.000us 1
|
| 4160 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4161 |
+
Self CPU time total: 2.487ms
|
| 4162 |
+
Self CUDA time total: 611.774us
|
| 4163 |
+
|
| 4164 |
+
|
| 4165 |
+
|
| 4166 |
+
======================================================================
|
| 4167 |
+
PROFILE TRACE: torch_flash_ma | flux_L512
|
| 4168 |
+
======================================================================
|
| 4169 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4170 |
+
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4171 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4172 |
+
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 754.076us 118.52% 754.076us 754.076us 1
|
| 4173 |
+
torch_flash_ma 10.33% 251.863us 96.72% 2.358ms 2.358ms 0.000us 0.00% 647.964us 647.964us 1
|
| 4174 |
+
aten::scaled_dot_product_attention 1.02% 24.850us 7.50% 182.789us 60.930us 0.000us 0.00% 507.517us 169.172us 3
|
| 4175 |
+
aten::_scaled_dot_product_flash_attention 0.72% 17.614us 6.48% 157.939us 52.646us 0.000us 0.00% 507.517us 169.172us 3
|
| 4176 |
+
aten::_flash_attention_forward 1.67% 40.594us 4.82% 117.465us 39.155us 507.517us 79.77% 507.517us 169.172us 3
|
| 4177 |
+
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 507.517us 79.77% 507.517us 169.172us 3
|
| 4178 |
+
aten::contiguous 0.38% 9.202us 77.00% 1.877ms 156.434us 0.000us 0.00% 140.447us 11.704us 12
|
| 4179 |
+
aten::clone 1.22% 29.851us 76.63% 1.868ms 155.667us 0.000us 0.00% 140.447us 11.704us 12
|
| 4180 |
+
aten::copy_ 3.45% 84.032us 72.63% 1.771ms 147.547us 128.703us 20.23% 140.447us 11.704us 12
|
| 4181 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 128.703us 20.23% 128.703us 10.725us 12
|
| 4182 |
+
Activity Buffer Request 59.63% 1.454ms 59.63% 1.454ms 1.454ms 11.744us 1.85% 11.744us 11.744us 1
|
| 4183 |
+
aten::transpose 2.09% 51.002us 2.82% 68.782us 2.866us 0.000us 0.00% 0.000us 0.000us 24
|
| 4184 |
+
aten::as_strided 0.73% 17.780us 0.73% 17.780us 0.741us 0.000us 0.00% 0.000us 0.000us 24
|
| 4185 |
+
aten::empty_like 0.85% 20.819us 3.58% 87.161us 5.811us 0.000us 0.00% 0.000us 0.000us 15
|
| 4186 |
+
aten::empty 3.27% 79.813us 3.27% 79.813us 3.326us 0.000us 0.00% 0.000us 0.000us 24
|
| 4187 |
+
cudaLaunchKernel 10.50% 256.026us 10.50% 256.026us 17.068us 0.000us 0.00% 0.000us 0.000us 15
|
| 4188 |
+
aten::empty_strided 0.59% 14.340us 0.59% 14.340us 4.780us 0.000us 0.00% 0.000us 0.000us 3
|
| 4189 |
+
cudaDeviceGetAttribute 0.08% 1.949us 0.08% 1.949us 0.325us 0.000us 0.00% 0.000us 0.000us 6
|
| 4190 |
+
cudaFuncSetAttribute 0.18% 4.440us 0.18% 4.440us 1.480us 0.000us 0.00% 0.000us 0.000us 3
|
| 4191 |
+
cudaDeviceSynchronize 3.28% 80.003us 3.28% 80.003us 80.003us 0.000us 0.00% 0.000us 0.000us 1
|
| 4192 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4193 |
+
Self CPU time total: 2.438ms
|
| 4194 |
+
Self CUDA time total: 636.220us
|
| 4195 |
+
|
| 4196 |
+
|
| 4197 |
+
impl wl p50(ms) ok
|
| 4198 |
+
torch_flash_ma flux_L128 0.18 True
|
| 4199 |
+
torch_flash_ma flux_L256 0.21 True
|
| 4200 |
+
torch_flash_ma flux_L320 0.22 True
|
| 4201 |
+
torch_flash_ma flux_L384 0.22 True
|
| 4202 |
+
torch_flash_ma flux_L448 0.27 True
|
| 4203 |
+
torch_flash_ma flux_L512 0.28 True
|
| 4204 |
+
</pre></div>
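The tables above are standard torch.profiler output. A minimal sketch of how a comparable trace can be captured for the same SDPA call; the sort key and row limit are illustrative, not necessarily what the harness uses:

import torch
from torch.profiler import ProfilerActivity, profile

q = torch.randn(1, 1152, 24, 128, device="cuda", dtype=torch.bfloat16)
k, v = torch.randn_like(q), torch.randn_like(q)

with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
    # (batch, seq, heads, dim) -> (batch, heads, seq, dim), as torch_flash_ma does
    torch.nn.functional.scaled_dot_product_attention(
        q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)
    )
    torch.cuda.synchronize()

print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=15))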
|
| 4205 |
+
<div class="cell-artifacts">
|
| 4206 |
+
<h4>Artifacts:</h4>
|
| 4207 |
+
<a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>
|
| 4208 |
</div>
|
| 4209 |
</div>
|
| 4210 |
</div>
|
flash_attn/impls/hf_kernels_flash_attn.html
CHANGED
|
@@ -706,6 +706,29 @@
|
|
| 706 |
white-space: pre-wrap;
|
| 707 |
color: var(--text-primary);
|
| 708 |
}
|
| 709 |
.cell-stderr {
|
| 710 |
background: var(--bg-error);
|
| 711 |
border-left: 2px solid var(--border-error);
|
|
@@ -3556,7 +3579,12 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3556 |
if(output){
|
| 3557 |
output.classList.remove('output-stale');
|
| 3558 |
let html='';
|
| 3559 |
-
if(data.stdout)
|
| 3560 |
console.log('UV Logs:', data);
|
| 3561 |
if(data.stderr) {
|
| 3562 |
// Split UV logs from regular stderr
|
|
@@ -3678,27 +3706,27 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3678 |
}
|
| 3679 |
}
|
| 3680 |
|
| 3681 |
-
// Live reload functionality (robust SSE handling)
|
| 3682 |
-
(function(){
|
| 3683 |
-
|
| 3684 |
-
|
| 3685 |
-
|
| 3686 |
-
|
| 3687 |
-
|
| 3688 |
-
|
| 3689 |
-
|
| 3690 |
-
|
| 3691 |
-
|
| 3692 |
-
|
| 3693 |
-
|
| 3694 |
-
|
| 3695 |
-
|
| 3696 |
-
|
| 3697 |
-
|
| 3698 |
-
|
| 3699 |
-
|
| 3700 |
-
|
| 3701 |
-
})();
|
| 3702 |
|
| 3703 |
|
| 3704 |
document.addEventListener('DOMContentLoaded', function() {
|
|
@@ -3829,21 +3857,21 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3829 |
<div class="system-info">
|
| 3830 |
<div class="system-info-header">Generated on:</div>
|
| 3831 |
<div class="system-info-content">
|
| 3832 |
-
Linux x86_64 | Linux-5.10.244-240.
|
| 3833 |
</div>
|
| 3834 |
</div>
|
| 3835 |
|
| 3836 |
<div class="main-content">
|
| 3837 |
<h1>HF Kernels - Flash Attention</h1>
|
| 3838 |
<h2>HuggingFace Kernels Flash Attention Benchmark</h2>
|
| 3839 |
-
<div class="cell
|
| 3840 |
<div class="cell-header">
|
| 3841 |
<span class="collapse-indicators">
|
| 3842 |
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
|
| 3843 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3844 |
<span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3845 |
</span> |
|
| 3846 |
-
Cell: benchmark |
|
| 3847 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3848 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3849 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3862,7 +3890,7 @@ Cell: benchmark | 0.01s | FAILED
|
|
| 3862 |
<span class="c1"># ]</span>
|
| 3863 |
<span class="c1">#</span>
|
| 3864 |
<span class="c1"># [tool.uv.sources]</span>
|
| 3865 |
-
<span class="c1"># kernels-benchmark-tools = { path = "
|
| 3866 |
<span class="c1"># ///</span>
|
| 3867 |
<span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
|
| 3868 |
<span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
|
|
@@ -3931,9 +3959,166 @@ Cell: benchmark | 0.01s | FAILED
|
|
| 3931 |
</div>
|
| 3932 |
</div>
|
| 3933 |
<div id="output-benchmark" class="cell-output">
|
| 3934 |
-
<div class="cell-
|
| 3935 |
-
|
| 3936 |
-
|
|
|
|
|
|
|
|
| 3937 |
</div>
|
| 3938 |
</div>
|
| 3939 |
</div>
|
|
|
|
| 706 |
white-space: pre-wrap;
|
| 707 |
color: var(--text-primary);
|
| 708 |
}
|
| 709 |
+
|
| 710 |
+
.cell-stdout {
|
| 711 |
+
background: var(--bg-tertiary);
|
| 712 |
+
padding: 0.75rem;
|
| 713 |
+
border-radius: 1px;
|
| 714 |
+
font-family: inherit;
|
| 715 |
+
font-size: 0.9rem;
|
| 716 |
+
color: var(--text-primary);
|
| 717 |
+
|
| 718 |
+
/* key bits */
|
| 719 |
+
overflow: auto; /* show scrollbars when needed */
|
| 720 |
+
max-width: 100%; /* respects whatever layout width you give it */
|
| 721 |
+
}
|
| 722 |
+
|
| 723 |
+
.cell-stdout .stdout-text {
|
| 724 |
+
margin: 0; /* reset pre default margin */
|
| 725 |
+
white-space: pre; /* keep line breaks, NO wrapping */
|
| 726 |
+
display: inline-block; /* shrink-to-content */
|
| 727 |
+
min-width: max-content; /* allow very long lines to define intrinsic width */
|
| 728 |
+
font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace;
|
| 729 |
+
tab-size: 2;
|
| 730 |
+
}
|
| 731 |
+
|
| 732 |
.cell-stderr {
|
| 733 |
background: var(--bg-error);
|
| 734 |
border-left: 2px solid var(--border-error);
|
|
|
|
| 3579 |
if(output){
|
| 3580 |
output.classList.remove('output-stale');
|
| 3581 |
let html='';
|
| 3582 |
+
if (data.stdout) {
|
| 3583 |
+
html += '<div class="cell-stdout"><pre class="stdout-text">'
|
| 3584 |
+
+ escapeHtml(data.stdout)
|
| 3585 |
+
+ '</pre></div>';
|
| 3586 |
+
}
|
| 3587 |
+
|
| 3588 |
console.log('UV Logs:', data);
|
| 3589 |
if(data.stderr) {
|
| 3590 |
// Split UV logs from regular stderr
|
|
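Note: the hunk above escapes stdout before injecting it into the page, which matters here because profiler rows such as flash_fwd_kernel&lt;Flash_fwd_kernel_traits...&gt; contain raw angle brackets. A sketch of the equivalent step in Python, mirroring the .cell-stdout / .stdout-text markup (html.escape plays the role of the page's escapeHtml helper):

    import html

    def render_stdout(stdout: str) -> str:
        # Escape &, <, > so kernel names like "flash_fwd_kernel<...>" render literally.
        return ('<div class="cell-stdout"><pre class="stdout-text">'
                + html.escape(stdout)
                + '</pre></div>')
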
|
|
| 3706 |
}
|
| 3707 |
}
|
| 3708 |
|
| 3709 |
+
// // Live reload functionality (robust SSE handling)
|
| 3710 |
+
// (function(){
|
| 3711 |
+
// if (!('EventSource' in window)) {
|
| 3712 |
+
// console.warn('SSE not supported in this browser');
|
| 3713 |
+
// return;
|
| 3714 |
+
// }
|
| 3715 |
+
// let source = new EventSource('/events');
|
| 3716 |
+
// let isOpen = false;
|
| 3717 |
+
// source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
|
| 3718 |
+
// source.onmessage = function(e){
|
| 3719 |
+
// const msg=(e.data||'').trim(); if(!msg) return;
|
| 3720 |
+
// console.log('SSE message:', msg);
|
| 3721 |
+
// if (msg==='reload' || msg==='incremental') { location.reload(); }
|
| 3722 |
+
// // Ignore 'loading' to avoid premature reload loops
|
| 3723 |
+
// };
|
| 3724 |
+
// source.onerror = function(e){
|
| 3725 |
+
// // Let EventSource auto-reconnect instead of forcing a reload
|
| 3726 |
+
// if (isOpen) console.warn('SSE error after open, retrying...', e);
|
| 3727 |
+
// };
|
| 3728 |
+
// window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
|
| 3729 |
+
// })();
|
| 3730 |
|
| 3731 |
|
| 3732 |
document.addEventListener('DOMContentLoaded', function() {
|
|
|
|
| 3857 |
<div class="system-info">
|
| 3858 |
<div class="system-info-header">Generated on:</div>
|
| 3859 |
<div class="system-info-content">
|
| 3860 |
+
Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
|
| 3861 |
</div>
|
| 3862 |
</div>
|
| 3863 |
|
| 3864 |
<div class="main-content">
|
| 3865 |
<h1>HF Kernels - Flash Attention</h1>
|
| 3866 |
<h2>HuggingFace Kernels Flash Attention Benchmark</h2>
|
| 3867 |
+
<div class="cell" id="cell-benchmark">
|
| 3868 |
<div class="cell-header">
|
| 3869 |
<span class="collapse-indicators">
|
| 3870 |
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
|
| 3871 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3872 |
<span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3873 |
</span> |
|
| 3874 |
+
Cell: benchmark | 5.95s
|
| 3875 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3876 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3877 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3890 |
<span class="c1"># ]</span>
|
| 3891 |
<span class="c1">#</span>
|
| 3892 |
<span class="c1"># [tool.uv.sources]</span>
|
| 3893 |
+
<span class="c1"># kernels-benchmark-tools = { path = "../../../../../tools", editable = true }</span>
|
| 3894 |
<span class="c1"># ///</span>
|
| 3895 |
<span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
|
| 3896 |
<span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
|
|
|
|
| 3959 |
</div>
|
| 3960 |
</div>
|
| 3961 |
<div id="output-benchmark" class="cell-output">
|
| 3962 |
+
<div class="cell-stdout"><pre class="stdout-text">
|
| 3963 |
+
======================================================================
|
| 3964 |
+
PROFILE TRACE: hf_kernels_flash_attn | flux_L128
|
| 3965 |
+
======================================================================
|
| 3966 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3967 |
+
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3968 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3969 |
+
hf_kernels_flash_attn 8.36% 154.078us 96.88% 1.786ms 1.786ms 0.000us 0.00% 362.493us 362.493us 1
|
| 3970 |
+
_flash_attn_9e27194::fwd 3.99% 73.523us 88.52% 1.632ms 543.906us 271.102us 100.00% 362.493us 120.831us 3
|
| 3971 |
+
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 272.638us 100.57% 272.638us 272.638us 1
|
| 3972 |
+
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 271.102us 100.00% 271.102us 90.367us 3
|
| 3973 |
+
Activity Buffer Request 76.97% 1.419ms 76.97% 1.419ms 1.419ms 91.391us 33.71% 91.391us 91.391us 1
|
| 3974 |
+
cudaDeviceGetAttribute 0.25% 4.549us 0.25% 4.549us 0.303us 0.000us 0.00% 0.000us 0.000us 15
|
| 3975 |
+
aten::empty_like 0.95% 17.511us 2.83% 52.153us 17.384us 0.000us 0.00% 0.000us 0.000us 3
|
| 3976 |
+
aten::empty_strided 1.88% 34.642us 1.88% 34.642us 11.547us 0.000us 0.00% 0.000us 0.000us 3
|
| 3977 |
+
aten::empty 1.44% 26.603us 1.44% 26.603us 2.956us 0.000us 0.00% 0.000us 0.000us 9
|
| 3978 |
+
cudaFuncSetAttribute 0.78% 14.320us 0.78% 14.320us 4.773us 0.000us 0.00% 0.000us 0.000us 3
|
| 3979 |
+
cudaLaunchKernel 2.27% 41.882us 2.27% 41.882us 13.961us 0.000us 0.00% 0.000us 0.000us 3
|
| 3980 |
+
cudaDeviceSynchronize 3.12% 57.433us 3.12% 57.433us 57.433us 0.000us 0.00% 0.000us 0.000us 1
|
| 3981 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3982 |
+
Self CPU time total: 1.843ms
|
| 3983 |
+
Self CUDA time total: 271.102us
|
| 3984 |
+
|
| 3985 |
+
|
| 3986 |
+
|
| 3987 |
+
======================================================================
|
| 3988 |
+
PROFILE TRACE: hf_kernels_flash_attn | flux_L256
|
| 3989 |
+
======================================================================
|
| 3990 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3991 |
+
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3992 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3993 |
+
hf_kernels_flash_attn 6.38% 115.656us 91.71% 1.662ms 1.662ms 0.000us 0.00% 396.671us 396.671us 1
|
| 3994 |
+
_flash_attn_9e27194::fwd 2.82% 51.131us 85.33% 1.547ms 515.555us 298.303us 100.00% 396.671us 132.224us 3
|
| 3995 |
+
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 299.743us 100.48% 299.743us 299.743us 1
|
| 3996 |
+
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 298.303us 100.00% 298.303us 99.434us 3
|
| 3997 |
+
Activity Buffer Request 77.99% 1.414ms 77.99% 1.414ms 1.414ms 98.368us 32.98% 98.368us 98.368us 1
|
| 3998 |
+
cudaDeviceGetAttribute 0.22% 3.931us 0.22% 3.931us 0.262us 0.000us 0.00% 0.000us 0.000us 15
|
| 3999 |
+
aten::empty_like 0.40% 7.190us 1.33% 24.041us 8.014us 0.000us 0.00% 0.000us 0.000us 3
|
| 4000 |
+
aten::empty_strided 0.93% 16.851us 0.93% 16.851us 5.617us 0.000us 0.00% 0.000us 0.000us 3
|
| 4001 |
+
aten::empty 1.25% 22.681us 1.25% 22.681us 2.520us 0.000us 0.00% 0.000us 0.000us 9
|
| 4002 |
+
cudaFuncSetAttribute 0.21% 3.730us 0.21% 3.730us 1.243us 0.000us 0.00% 0.000us 0.000us 3
|
| 4003 |
+
cudaLaunchKernel 1.51% 27.451us 1.51% 27.451us 9.150us 0.000us 0.00% 0.000us 0.000us 3
|
| 4004 |
+
cudaDeviceSynchronize 8.29% 150.237us 8.29% 150.237us 150.237us 0.000us 0.00% 0.000us 0.000us 1
|
| 4005 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4006 |
+
Self CPU time total: 1.813ms
|
| 4007 |
+
Self CUDA time total: 298.303us
|
| 4008 |
+
|
| 4009 |
+
|
| 4010 |
+
|
| 4011 |
+
======================================================================
|
| 4012 |
+
PROFILE TRACE: hf_kernels_flash_attn | flux_L320
|
| 4013 |
+
======================================================================
|
| 4014 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4015 |
+
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4016 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4017 |
+
hf_kernels_flash_attn 6.16% 112.885us 90.78% 1.663ms 1.663ms 0.000us 0.00% 427.613us 427.613us 1
|
| 4018 |
+
_flash_attn_9e27194::fwd 2.80% 51.281us 84.62% 1.550ms 516.788us 318.526us 100.00% 427.613us 142.538us 3
|
| 4019 |
+
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 319.901us 100.43% 319.901us 319.901us 1
|
| 4020 |
+
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 318.526us 100.00% 318.526us 106.175us 3
|
| 4021 |
+
Activity Buffer Request 77.28% 1.416ms 77.28% 1.416ms 1.416ms 109.087us 34.25% 109.087us 109.087us 1
|
| 4022 |
+
cudaDeviceGetAttribute 0.21% 3.930us 0.21% 3.930us 0.262us 0.000us 0.00% 0.000us 0.000us 15
|
| 4023 |
+
aten::empty_like 0.41% 7.431us 1.40% 25.731us 8.577us 0.000us 0.00% 0.000us 0.000us 3
|
| 4024 |
+
aten::empty_strided 1.00% 18.300us 1.00% 18.300us 6.100us 0.000us 0.00% 0.000us 0.000us 3
|
| 4025 |
+
aten::empty 1.26% 23.051us 1.26% 23.051us 2.561us 0.000us 0.00% 0.000us 0.000us 9
|
| 4026 |
+
cudaFuncSetAttribute 0.22% 4.001us 0.22% 4.001us 1.334us 0.000us 0.00% 0.000us 0.000us 3
|
| 4027 |
+
cudaLaunchKernel 1.45% 26.532us 1.45% 26.532us 8.844us 0.000us 0.00% 0.000us 0.000us 3
|
| 4028 |
+
cudaDeviceSynchronize 9.22% 168.858us 9.22% 168.858us 168.858us 0.000us 0.00% 0.000us 0.000us 1
|
| 4029 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4030 |
+
Self CPU time total: 1.832ms
|
| 4031 |
+
Self CUDA time total: 318.526us
|
| 4032 |
+
|
| 4033 |
+
|
| 4034 |
+
|
| 4035 |
+
======================================================================
|
| 4036 |
+
PROFILE TRACE: hf_kernels_flash_attn | flux_L384
|
| 4037 |
+
======================================================================
|
| 4038 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4039 |
+
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4040 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4041 |
+
hf_kernels_flash_attn 5.43% 111.055us 91.19% 1.866ms 1.866ms 0.000us 0.00% 446.776us 446.776us 1
|
| 4042 |
+
_flash_attn_9e27194::fwd 2.54% 51.901us 85.76% 1.755ms 584.928us 331.162us 100.00% 446.776us 148.925us 3
|
| 4043 |
+
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 332.667us 100.45% 332.667us 332.667us 1
|
| 4044 |
+
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 331.162us 100.00% 331.162us 110.387us 3
|
| 4045 |
+
Activity Buffer Request 69.78% 1.428ms 69.78% 1.428ms 1.428ms 115.614us 34.91% 115.614us 115.614us 1
|
| 4046 |
+
cudaDeviceGetAttribute 0.19% 3.942us 0.19% 3.942us 0.263us 0.000us 0.00% 0.000us 0.000us 15
|
| 4047 |
+
aten::empty_like 0.39% 8.070us 1.24% 25.461us 8.487us 0.000us 0.00% 0.000us 0.000us 3
|
| 4048 |
+
aten::empty_strided 0.85% 17.391us 0.85% 17.391us 5.797us 0.000us 0.00% 0.000us 0.000us 3
|
| 4049 |
+
aten::empty 1.08% 22.080us 1.08% 22.080us 2.453us 0.000us 0.00% 0.000us 0.000us 9
|
| 4050 |
+
cudaFuncSetAttribute 0.19% 3.861us 0.19% 3.861us 1.287us 0.000us 0.00% 0.000us 0.000us 3
|
| 4051 |
+
cudaLaunchKernel 10.75% 219.880us 10.75% 219.880us 73.293us 0.000us 0.00% 0.000us 0.000us 3
|
| 4052 |
+
cudaDeviceSynchronize 8.81% 180.219us 8.81% 180.219us 180.219us 0.000us 0.00% 0.000us 0.000us 1
|
| 4053 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4054 |
+
Self CPU time total: 2.046ms
|
| 4055 |
+
Self CUDA time total: 331.162us
|
| 4056 |
+
|
| 4057 |
+
|
| 4058 |
+
|
| 4059 |
+
======================================================================
|
| 4060 |
+
PROFILE TRACE: hf_kernels_flash_attn | flux_L448
|
| 4061 |
+
======================================================================
|
| 4062 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4063 |
+
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4064 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4065 |
+
hf_kernels_flash_attn 4.92% 108.784us 84.29% 1.864ms 1.864ms 0.000us 0.00% 663.288us 663.288us 1
|
| 4066 |
+
_flash_attn_9e27194::fwd 2.26% 49.951us 79.37% 1.755ms 585.135us 493.882us 100.00% 663.288us 221.096us 3
|
| 4067 |
+
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 495.418us 100.31% 495.418us 495.418us 1
|
| 4068 |
+
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 493.882us 100.00% 493.882us 164.627us 3
|
| 4069 |
+
Activity Buffer Request 65.22% 1.442ms 65.22% 1.442ms 1.442ms 169.406us 34.30% 169.406us 169.406us 1
|
| 4070 |
+
cudaDeviceGetAttribute 0.18% 3.990us 0.18% 3.990us 0.266us 0.000us 0.00% 0.000us 0.000us 15
|
| 4071 |
+
aten::empty_like 0.34% 7.522us 1.12% 24.742us 8.247us 0.000us 0.00% 0.000us 0.000us 3
|
| 4072 |
+
aten::empty_strided 0.78% 17.220us 0.78% 17.220us 5.740us 0.000us 0.00% 0.000us 0.000us 3
|
| 4073 |
+
aten::empty 0.96% 21.140us 0.96% 21.140us 2.349us 0.000us 0.00% 0.000us 0.000us 9
|
| 4074 |
+
cudaFuncSetAttribute 0.19% 4.121us 0.19% 4.121us 1.374us 0.000us 0.00% 0.000us 0.000us 3
|
| 4075 |
+
cudaLaunchKernel 9.45% 209.092us 9.45% 209.092us 69.697us 0.000us 0.00% 0.000us 0.000us 3
|
| 4076 |
+
cudaDeviceSynchronize 15.71% 347.407us 15.71% 347.407us 347.407us 0.000us 0.00% 0.000us 0.000us 1
|
| 4077 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4078 |
+
Self CPU time total: 2.212ms
|
| 4079 |
+
Self CUDA time total: 493.882us
|
| 4080 |
+
|
| 4081 |
+
|
| 4082 |
+
|
| 4083 |
+
======================================================================
|
| 4084 |
+
PROFILE TRACE: hf_kernels_flash_attn | flux_L512
|
| 4085 |
+
======================================================================
|
| 4086 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4087 |
+
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4088 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4089 |
+
hf_kernels_flash_attn 4.96% 110.355us 83.23% 1.852ms 1.852ms 0.000us 0.00% 697.540us 697.540us 1
|
| 4090 |
+
_flash_attn_9e27194::fwd 2.27% 50.469us 78.28% 1.742ms 580.665us 518.659us 100.00% 697.540us 232.513us 3
|
| 4091 |
+
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 520.068us 100.27% 520.068us 520.068us 1
|
| 4092 |
+
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 518.659us 100.00% 518.659us 172.886us 3
|
| 4093 |
+
Activity Buffer Request 64.27% 1.430ms 64.27% 1.430ms 1.430ms 178.881us 34.49% 178.881us 178.881us 1
|
| 4094 |
+
cudaDeviceGetAttribute 0.17% 3.832us 0.17% 3.832us 0.255us 0.000us 0.00% 0.000us 0.000us 15
|
| 4095 |
+
aten::empty_like 0.33% 7.341us 1.15% 25.571us 8.524us 0.000us 0.00% 0.000us 0.000us 3
|
| 4096 |
+
aten::empty_strided 0.82% 18.230us 0.82% 18.230us 6.077us 0.000us 0.00% 0.000us 0.000us 3
|
| 4097 |
+
aten::empty 0.94% 20.812us 0.94% 20.812us 2.312us 0.000us 0.00% 0.000us 0.000us 9
|
| 4098 |
+
cudaFuncSetAttribute 0.19% 4.171us 0.19% 4.171us 1.390us 0.000us 0.00% 0.000us 0.000us 3
|
| 4099 |
+
cudaLaunchKernel 9.29% 206.809us 9.29% 206.809us 68.936us 0.000us 0.00% 0.000us 0.000us 3
|
| 4100 |
+
cudaDeviceSynchronize 16.77% 373.119us 16.77% 373.119us 373.119us 0.000us 0.00% 0.000us 0.000us 1
|
| 4101 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4102 |
+
Self CPU time total: 2.225ms
|
| 4103 |
+
Self CUDA time total: 518.659us
|
| 4104 |
+
|
| 4105 |
+
|
| 4106 |
+
impl wl p50(ms) ok
|
| 4107 |
+
hf_kernels_flash_attn flux_L128 0.12 True
|
| 4108 |
+
hf_kernels_flash_attn flux_L256 0.14 True
|
| 4109 |
+
hf_kernels_flash_attn flux_L320 0.14 True
|
| 4110 |
+
hf_kernels_flash_attn flux_L384 0.15 True
|
| 4111 |
+
hf_kernels_flash_attn flux_L448 0.20 True
|
| 4112 |
+
hf_kernels_flash_attn flux_L512 0.20 True
|
| 4113 |
+
</pre></div>
|
| 4114 |
+
<div class="cell-stderr">
|
| 4115 |
+
Fetching 20 files: 0%| | 0/20 [00:00<?, ?it/s]
|
| 4116 |
+
Fetching 20 files: 10%|█ | 2/20 [00:01<00:16, 1.08it/s]
|
| 4117 |
+
Fetching 20 files: 100%|██████████| 20/20 [00:01<00:00, 10.78it/s]
|
| 4118 |
+
</div>
|
| 4119 |
+
<div class="cell-artifacts">
|
| 4120 |
+
<h4>Artifacts:</h4>
|
| 4121 |
+
<a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>
|
| 4122 |
</div>
|
| 4123 |
</div>
|
| 4124 |
</div>
|
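Note: the `impl  wl  p50(ms)  ok` summaries above are derived from the attn.jsonl artifacts linked in each cell. Assuming each JSON line carries `impl`, `wl.name`, `lat_ms.p50`, and `ok` fields, a sketch of reproducing the table:

    import json

    def summarize(path="artifacts/benchmark/attn.jsonl"):
        # One benchmark record per line: implementation, workload, median latency, status.
        with open(path) as f:
            for line in f:
                r = json.loads(line)
                print(f"{r['impl']:<24} {r['wl']['name']:<12} "
                      f"{r['lat_ms']['p50']:8.2f} {r['ok']}")
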
flash_attn/impls/hf_kernels_flash_attn3.html
CHANGED
|
@@ -706,6 +706,29 @@
|
|
| 706 |
white-space: pre-wrap;
|
| 707 |
color: var(--text-primary);
|
| 708 |
}
|
|
|
|
|
|
|
| 709 |
.cell-stderr {
|
| 710 |
background: var(--bg-error);
|
| 711 |
border-left: 2px solid var(--border-error);
|
|
@@ -3556,7 +3579,12 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3556 |
if(output){
|
| 3557 |
output.classList.remove('output-stale');
|
| 3558 |
let html='';
|
| 3559 |
-
if(data.stdout)
|
|
|
|
|
| 3560 |
console.log('UV Logs:', data);
|
| 3561 |
if(data.stderr) {
|
| 3562 |
// Split UV logs from regular stderr
|
|
@@ -3678,27 +3706,27 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3678 |
}
|
| 3679 |
}
|
| 3680 |
|
| 3681 |
-
// Live reload functionality (robust SSE handling)
|
| 3682 |
-
(function(){
|
| 3683 |
-
|
| 3684 |
-
|
| 3685 |
-
|
| 3686 |
-
|
| 3687 |
-
|
| 3688 |
-
|
| 3689 |
-
|
| 3690 |
-
|
| 3691 |
-
|
| 3692 |
-
|
| 3693 |
-
|
| 3694 |
-
|
| 3695 |
-
|
| 3696 |
-
|
| 3697 |
-
|
| 3698 |
-
|
| 3699 |
-
|
| 3700 |
-
|
| 3701 |
-
})();
|
| 3702 |
|
| 3703 |
|
| 3704 |
document.addEventListener('DOMContentLoaded', function() {
|
|
@@ -3829,21 +3857,21 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3829 |
<div class="system-info">
|
| 3830 |
<div class="system-info-header">Generated on:</div>
|
| 3831 |
<div class="system-info-content">
|
| 3832 |
-
Linux x86_64 | Linux-5.10.244-240.
|
| 3833 |
</div>
|
| 3834 |
</div>
|
| 3835 |
|
| 3836 |
<div class="main-content">
|
| 3837 |
<h1>HF Kernels - Flash Attention 3</h1>
|
| 3838 |
<h2>HuggingFace Kernels Flash Attention 3 Benchmark</h2>
|
| 3839 |
-
<div class="cell
|
| 3840 |
<div class="cell-header">
|
| 3841 |
<span class="collapse-indicators">
|
| 3842 |
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
|
| 3843 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3844 |
<span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3845 |
</span> |
|
| 3846 |
-
Cell: benchmark |
|
| 3847 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3848 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3849 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3856,13 +3884,13 @@ Cell: benchmark | 0.05s | FAILED
|
|
| 3856 |
<span class="c1"># requires-python = ">=3.10"</span>
|
| 3857 |
<span class="c1"># dependencies = [</span>
|
| 3858 |
<span class="c1"># "numpy",</span>
|
| 3859 |
-
<span class="c1"># "torch",</span>
|
| 3860 |
<span class="c1"># "kernels-benchmark-tools",</span>
|
| 3861 |
<span class="c1"># "kernels",</span>
|
| 3862 |
<span class="c1"># ]</span>
|
| 3863 |
<span class="c1">#</span>
|
| 3864 |
<span class="c1"># [tool.uv.sources]</span>
|
| 3865 |
-
<span class="c1"># kernels-benchmark-tools = { path = "
|
| 3866 |
<span class="c1"># ///</span>
|
| 3867 |
<span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
|
| 3868 |
<span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
|
|
@@ -3930,9 +3958,154 @@ Cell: benchmark | 0.05s | FAILED
|
|
| 3930 |
</div>
|
| 3931 |
</div>
|
| 3932 |
<div id="output-benchmark" class="cell-output">
|
| 3933 |
-
<div class="cell-
|
| 3934 |
-
|
| 3935 |
-
|
|
|
|
|
|
|
|
| 3936 |
</div>
|
| 3937 |
</div>
|
| 3938 |
</div>
|
|
|
|
| 706 |
white-space: pre-wrap;
|
| 707 |
color: var(--text-primary);
|
| 708 |
}
|
| 709 |
+
|
| 710 |
+
.cell-stdout {
|
| 711 |
+
background: var(--bg-tertiary);
|
| 712 |
+
padding: 0.75rem;
|
| 713 |
+
border-radius: 1px;
|
| 714 |
+
font-family: inherit;
|
| 715 |
+
font-size: 0.9rem;
|
| 716 |
+
color: var(--text-primary);
|
| 717 |
+
|
| 718 |
+
/* key bits */
|
| 719 |
+
overflow: auto; /* show scrollbars when needed */
|
| 720 |
+
max-width: 100%; /* respects whatever layout width you give it */
|
| 721 |
+
}
|
| 722 |
+
|
| 723 |
+
.cell-stdout .stdout-text {
|
| 724 |
+
margin: 0; /* reset pre default margin */
|
| 725 |
+
white-space: pre; /* keep line breaks, NO wrapping */
|
| 726 |
+
display: inline-block; /* shrink-to-content */
|
| 727 |
+
min-width: max-content; /* allow very long lines to define intrinsic width */
|
| 728 |
+
font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace;
|
| 729 |
+
tab-size: 2;
|
| 730 |
+
}
|
| 731 |
+
|
| 732 |
.cell-stderr {
|
| 733 |
background: var(--bg-error);
|
| 734 |
border-left: 2px solid var(--border-error);
|
|
|
|
| 3579 |
if(output){
|
| 3580 |
output.classList.remove('output-stale');
|
| 3581 |
let html='';
|
| 3582 |
+
if (data.stdout) {
|
| 3583 |
+
html += '<div class="cell-stdout"><pre class="stdout-text">'
|
| 3584 |
+
+ escapeHtml(data.stdout)
|
| 3585 |
+
+ '</pre></div>';
|
| 3586 |
+
}
|
| 3587 |
+
|
| 3588 |
console.log('UV Logs:', data);
|
| 3589 |
if(data.stderr) {
|
| 3590 |
// Split UV logs from regular stderr
|
|
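Note: the next hunk comments out a live-reload client that listened on an EventSource('/events') stream and reloaded the page on a 'reload' message. For context, a minimal sketch of the server side of that protocol (hypothetical; no such server ships in this repo), using only the standard library:

    from http.server import BaseHTTPRequestHandler, HTTPServer

    class Events(BaseHTTPRequestHandler):
        def do_GET(self):
            if self.path != "/events":
                self.send_error(404)
                return
            self.send_response(200)
            self.send_header("Content-Type", "text/event-stream")
            self.send_header("Cache-Control", "no-cache")
            self.end_headers()
            # An SSE frame is "data: <payload>\n\n"; the client reloads on "reload".
            self.wfile.write(b"data: reload\n\n")
            self.wfile.flush()

    # HTTPServer(("", 8000), Events).serve_forever()
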
|
|
| 3706 |
}
|
| 3707 |
}
|
| 3708 |
|
| 3709 |
+
// // Live reload functionality (robust SSE handling)
|
| 3710 |
+
// (function(){
|
| 3711 |
+
// if (!('EventSource' in window)) {
|
| 3712 |
+
// console.warn('SSE not supported in this browser');
|
| 3713 |
+
// return;
|
| 3714 |
+
// }
|
| 3715 |
+
// let source = new EventSource('/events');
|
| 3716 |
+
// let isOpen = false;
|
| 3717 |
+
// source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
|
| 3718 |
+
// source.onmessage = function(e){
|
| 3719 |
+
// const msg=(e.data||'').trim(); if(!msg) return;
|
| 3720 |
+
// console.log('SSE message:', msg);
|
| 3721 |
+
// if (msg==='reload' || msg==='incremental') { location.reload(); }
|
| 3722 |
+
// // Ignore 'loading' to avoid premature reload loops
|
| 3723 |
+
// };
|
| 3724 |
+
// source.onerror = function(e){
|
| 3725 |
+
// // Let EventSource auto-reconnect instead of forcing a reload
|
| 3726 |
+
// if (isOpen) console.warn('SSE error after open, retrying...', e);
|
| 3727 |
+
// };
|
| 3728 |
+
// window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
|
| 3729 |
+
// })();
|
| 3730 |
|
| 3731 |
|
| 3732 |
document.addEventListener('DOMContentLoaded', function() {
|
|
|
|
| 3857 |
<div class="system-info">
|
| 3858 |
<div class="system-info-header">Generated on:</div>
|
| 3859 |
<div class="system-info-content">
|
| 3860 |
+
Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
|
| 3861 |
</div>
|
| 3862 |
</div>
|
| 3863 |
|
| 3864 |
<div class="main-content">
|
| 3865 |
<h1>HF Kernels - Flash Attention 3</h1>
|
| 3866 |
<h2>HuggingFace Kernels Flash Attention 3 Benchmark</h2>
|
| 3867 |
+
<div class="cell" id="cell-benchmark">
|
| 3868 |
<div class="cell-header">
|
| 3869 |
<span class="collapse-indicators">
|
| 3870 |
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
|
| 3871 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3872 |
<span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3873 |
</span> |
|
| 3874 |
+
Cell: benchmark | 5.65s
|
| 3875 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3876 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3877 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3884 |
<span class="c1"># requires-python = ">=3.10"</span>
|
| 3885 |
<span class="c1"># dependencies = [</span>
|
| 3886 |
<span class="c1"># "numpy",</span>
|
| 3887 |
+
<span class="c1"># "torch==2.8.0",</span>
|
| 3888 |
<span class="c1"># "kernels-benchmark-tools",</span>
|
| 3889 |
<span class="c1"># "kernels",</span>
|
| 3890 |
<span class="c1"># ]</span>
|
| 3891 |
<span class="c1">#</span>
|
| 3892 |
<span class="c1"># [tool.uv.sources]</span>
|
| 3893 |
+
<span class="c1"># kernels-benchmark-tools = { path = "../../../../../tools", editable = true }</span>
|
| 3894 |
<span class="c1"># ///</span>
|
| 3895 |
<span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
|
| 3896 |
<span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
|
|
|
|
| 3958 |
</div>
|
| 3959 |
</div>
|
| 3960 |
<div id="output-benchmark" class="cell-output">
|
| 3961 |
+
<div class="cell-stdout"><pre class="stdout-text">
|
| 3962 |
+
======================================================================
|
| 3963 |
+
PROFILE TRACE: hf_kernels_flash_attn3 | flux_L128
|
| 3964 |
+
======================================================================
|
| 3965 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3966 |
+
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3967 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3968 |
+
hf_kernels_flash_attn3 9.00% 178.129us 99.63% 1.971ms 1.971ms 0.000us 0.00% 345.823us 345.823us 1
|
| 3969 |
+
FlashAttnFunc 6.66% 131.797us 90.63% 1.793ms 597.659us 0.000us 0.00% 345.823us 115.274us 3
|
| 3970 |
+
_flash_attn3_48fe103_dirty::fwd 4.56% 90.256us 83.97% 1.661ms 553.727us 259.583us 100.00% 345.823us 115.274us 3
|
| 3971 |
+
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 292.158us 112.55% 292.158us 292.158us 1
|
| 3972 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 259.583us 100.00% 259.583us 86.528us 3
|
| 3973 |
+
Activity Buffer Request 73.82% 1.460ms 73.82% 1.460ms 1.460ms 86.240us 33.22% 86.240us 86.240us 1
|
| 3974 |
+
aten::empty 2.53% 50.052us 2.53% 50.052us 8.342us 0.000us 0.00% 0.000us 0.000us 6
|
| 3975 |
+
cudaFuncSetAttribute 0.86% 16.921us 0.86% 16.921us 5.640us 0.000us 0.00% 0.000us 0.000us 3
|
| 3976 |
+
cudaLaunchKernel 2.20% 43.551us 2.20% 43.551us 14.517us 0.000us 0.00% 0.000us 0.000us 3
|
| 3977 |
+
cudaDeviceSynchronize 0.37% 7.311us 0.37% 7.311us 7.311us 0.000us 0.00% 0.000us 0.000us 1
|
| 3978 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3979 |
+
Self CPU time total: 1.978ms
|
| 3980 |
+
Self CUDA time total: 259.583us
|
| 3981 |
+
|
| 3982 |
+
|
| 3983 |
+
|
| 3984 |
+
======================================================================
|
| 3985 |
+
PROFILE TRACE: hf_kernels_flash_attn3 | flux_L256
|
| 3986 |
+
======================================================================
|
| 3987 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3988 |
+
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3989 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3990 |
+
hf_kernels_flash_attn3 7.20% 133.787us 96.41% 1.793ms 1.793ms 0.000us 0.00% 393.753us 393.753us 1
|
| 3991 |
+
FlashAttnFunc 5.05% 93.854us 89.22% 1.659ms 552.953us 0.000us 0.00% 393.753us 131.251us 3
|
| 3992 |
+
_flash_attn3_48fe103_dirty::fwd 2.68% 49.913us 84.17% 1.565ms 521.669us 293.595us 100.00% 393.753us 131.251us 3
|
| 3993 |
+
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 295.003us 100.48% 295.003us 295.003us 1
|
| 3994 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 293.595us 100.00% 293.595us 97.865us 3
|
| 3995 |
+
Activity Buffer Request 78.08% 1.452ms 78.08% 1.452ms 1.452ms 100.158us 34.11% 100.158us 100.158us 1
|
| 3996 |
+
aten::empty 1.44% 26.770us 1.44% 26.770us 4.462us 0.000us 0.00% 0.000us 0.000us 6
|
| 3997 |
+
cudaFuncSetAttribute 0.31% 5.680us 0.31% 5.680us 1.893us 0.000us 0.00% 0.000us 0.000us 3
|
| 3998 |
+
cudaLaunchKernel 1.66% 30.852us 1.66% 30.852us 10.284us 0.000us 0.00% 0.000us 0.000us 3
|
| 3999 |
+
cudaDeviceSynchronize 3.59% 66.713us 3.59% 66.713us 66.713us 0.000us 0.00% 0.000us 0.000us 1
|
| 4000 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4001 |
+
Self CPU time total: 1.859ms
|
| 4002 |
+
Self CUDA time total: 293.595us
|
| 4003 |
+
|
| 4004 |
+
|
| 4005 |
+
|
| 4006 |
+
======================================================================
|
| 4007 |
+
PROFILE TRACE: hf_kernels_flash_attn3 | flux_L320
|
| 4008 |
+
======================================================================
|
| 4009 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4010 |
+
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4011 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4012 |
+
hf_kernels_flash_attn3 6.76% 125.695us 94.13% 1.750ms 1.750ms 0.000us 0.00% 430.748us 430.748us 1
|
| 4013 |
+
FlashAttnFunc 4.90% 91.016us 87.37% 1.624ms 541.277us 0.000us 0.00% 430.748us 143.583us 3
|
| 4014 |
+
_flash_attn3_48fe103_dirty::fwd 2.79% 51.770us 82.47% 1.533ms 510.938us 324.541us 100.00% 430.748us 143.583us 3
|
| 4015 |
+
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 325.948us 100.43% 325.948us 325.948us 1
|
| 4016 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 324.541us 100.00% 324.541us 108.180us 3
|
| 4017 |
+
Activity Buffer Request 76.46% 1.421ms 76.46% 1.421ms 1.421ms 106.207us 32.73% 106.207us 106.207us 1
|
| 4018 |
+
aten::empty 1.41% 26.162us 1.41% 26.162us 4.360us 0.000us 0.00% 0.000us 0.000us 6
|
| 4019 |
+
cudaFuncSetAttribute 0.27% 5.061us 0.27% 5.061us 1.687us 0.000us 0.00% 0.000us 0.000us 3
|
| 4020 |
+
cudaLaunchKernel 1.55% 28.862us 1.55% 28.862us 9.621us 0.000us 0.00% 0.000us 0.000us 3
|
| 4021 |
+
cudaDeviceSynchronize 5.87% 109.015us 5.87% 109.015us 109.015us 0.000us 0.00% 0.000us 0.000us 1
|
| 4022 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4023 |
+
Self CPU time total: 1.859ms
|
| 4024 |
+
Self CUDA time total: 324.541us
|
| 4025 |
+
|
| 4026 |
+
|
| 4027 |
+
|
| 4028 |
+
======================================================================
|
| 4029 |
+
PROFILE TRACE: hf_kernels_flash_attn3 | flux_L384
|
| 4030 |
+
======================================================================
|
| 4031 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4032 |
+
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4033 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4034 |
+
hf_kernels_flash_attn3 6.04% 124.874us 95.07% 1.964ms 1.964ms 0.000us 0.00% 429.567us 429.567us 1
|
| 4035 |
+
FlashAttnFunc 4.57% 94.345us 89.03% 1.840ms 613.174us 0.000us 0.00% 429.567us 143.189us 3
|
| 4036 |
+
_flash_attn3_48fe103_dirty::fwd 2.60% 53.754us 84.46% 1.745ms 581.725us 322.591us 100.00% 429.567us 143.189us 3
|
| 4037 |
+
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 324.063us 100.46% 324.063us 324.063us 1
|
| 4038 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 322.591us 100.00% 322.591us 107.530us 3
|
| 4039 |
+
Activity Buffer Request 69.43% 1.434ms 69.43% 1.434ms 1.434ms 106.976us 33.16% 106.976us 106.976us 1
|
| 4040 |
+
aten::empty 1.29% 26.591us 1.29% 26.591us 4.432us 0.000us 0.00% 0.000us 0.000us 6
|
| 4041 |
+
cudaFuncSetAttribute 0.25% 5.220us 0.25% 5.220us 1.740us 0.000us 0.00% 0.000us 0.000us 3
|
| 4042 |
+
cudaLaunchKernel 10.90% 225.141us 10.90% 225.141us 75.047us 0.000us 0.00% 0.000us 0.000us 3
|
| 4043 |
+
cudaDeviceSynchronize 4.93% 101.805us 4.93% 101.805us 101.805us 0.000us 0.00% 0.000us 0.000us 1
|
| 4044 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4045 |
+
Self CPU time total: 2.066ms
|
| 4046 |
+
Self CUDA time total: 322.591us
|
| 4047 |
+
|
| 4048 |
+
|
| 4049 |
+
|
| 4050 |
+
======================================================================
|
| 4051 |
+
PROFILE TRACE: hf_kernels_flash_attn3 | flux_L448
|
| 4052 |
+
======================================================================
|
| 4053 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4054 |
+
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4055 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4056 |
+
hf_kernels_flash_attn3 5.77% 124.745us 87.87% 1.900ms 1.900ms 0.000us 0.00% 654.301us 654.301us 1
|
| 4057 |
+
FlashAttnFunc 4.37% 94.576us 82.10% 1.775ms 591.589us 0.000us 0.00% 654.301us 218.100us 3
|
| 4058 |
+
_flash_attn3_48fe103_dirty::fwd 2.37% 51.203us 77.72% 1.680ms 560.064us 488.670us 100.00% 654.301us 218.100us 3
|
| 4059 |
+
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 490.142us 100.30% 490.142us 490.142us 1
|
| 4060 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 488.670us 100.00% 488.670us 162.890us 3
|
| 4061 |
+
Activity Buffer Request 66.37% 1.435ms 66.37% 1.435ms 1.435ms 165.631us 33.89% 165.631us 165.631us 1
|
| 4062 |
+
aten::empty 1.25% 26.990us 1.25% 26.990us 4.498us 0.000us 0.00% 0.000us 0.000us 6
|
| 4063 |
+
cudaFuncSetAttribute 0.24% 5.250us 0.24% 5.250us 1.750us 0.000us 0.00% 0.000us 0.000us 3
|
| 4064 |
+
cudaLaunchKernel 7.49% 161.858us 7.49% 161.858us 53.953us 0.000us 0.00% 0.000us 0.000us 3
|
| 4065 |
+
cudaDeviceSynchronize 12.13% 262.313us 12.13% 262.313us 262.313us 0.000us 0.00% 0.000us 0.000us 1
|
| 4066 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4067 |
+
Self CPU time total: 2.162ms
|
| 4068 |
+
Self CUDA time total: 488.670us
|
| 4069 |
+
|
| 4070 |
+
|
| 4071 |
+
|
| 4072 |
+
======================================================================
|
| 4073 |
+
PROFILE TRACE: hf_kernels_flash_attn3 | flux_L512
|
| 4074 |
+
======================================================================
|
| 4075 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4076 |
+
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4077 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4078 |
+
hf_kernels_flash_attn3 5.69% 119.216us 86.59% 1.815ms 1.815ms 0.000us 0.00% 666.625us 666.625us 1
|
| 4079 |
+
FlashAttnFunc 4.40% 92.224us 80.91% 1.696ms 565.401us 0.000us 0.00% 666.625us 222.208us 3
|
| 4080 |
+
_flash_attn3_48fe103_dirty::fwd 2.44% 51.234us 76.51% 1.604ms 534.659us 497.473us 100.00% 666.625us 222.208us 3
|
| 4081 |
+
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 498.849us 100.28% 498.849us 498.849us 1
|
| 4082 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 497.473us 100.00% 497.473us 165.824us 3
|
| 4083 |
+
Activity Buffer Request 64.99% 1.363ms 64.99% 1.363ms 1.363ms 169.152us 34.00% 169.152us 169.152us 1
|
| 4084 |
+
aten::empty 1.25% 26.300us 1.25% 26.300us 4.383us 0.000us 0.00% 0.000us 0.000us 6
|
| 4085 |
+
cudaFuncSetAttribute 0.27% 5.600us 0.27% 5.600us 1.867us 0.000us 0.00% 0.000us 0.000us 3
|
| 4086 |
+
cudaLaunchKernel 7.55% 158.288us 7.55% 158.288us 52.763us 0.000us 0.00% 0.000us 0.000us 3
|
| 4087 |
+
cudaDeviceSynchronize 13.41% 281.113us 13.41% 281.113us 281.113us 0.000us 0.00% 0.000us 0.000us 1
|
| 4088 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4089 |
+
Self CPU time total: 2.097ms
|
| 4090 |
+
Self CUDA time total: 497.473us
|
| 4091 |
+
|
| 4092 |
+
|
| 4093 |
+
impl wl p50(ms) ok
|
| 4094 |
+
hf_kernels_flash_attn3 flux_L128 0.13 True
|
| 4095 |
+
hf_kernels_flash_attn3 flux_L256 0.15 True
|
| 4096 |
+
hf_kernels_flash_attn3 flux_L320 0.16 True
|
| 4097 |
+
hf_kernels_flash_attn3 flux_L384 0.16 True
|
| 4098 |
+
hf_kernels_flash_attn3 flux_L448 0.21 True
|
| 4099 |
+
hf_kernels_flash_attn3 flux_L512 0.21 True
|
| 4100 |
+
</pre></div>
|
| 4101 |
+
<div class="cell-stderr">
|
| 4102 |
+
Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s]
|
| 4103 |
+
Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.23it/s]
|
| 4104 |
+
Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.46it/s]
|
| 4105 |
+
</div>
|
| 4106 |
+
<div class="cell-artifacts">
|
| 4107 |
+
<h4>Artifacts:</h4>
|
| 4108 |
+
<a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>
|
| 4109 |
</div>
|
| 4110 |
</div>
|
| 4111 |
</div>
|
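Note: the `Fetching N files` progress bars in the stderr blocks above come from the kernels hub download on first use. A sketch of the load-and-call pattern, assuming the benchmarks fetch the op via kernels.get_kernel (repo name and entry point shown here are illustrative):

    import torch
    from kernels import get_kernel

    # First use downloads the prebuilt binaries, producing the "Fetching ... files" bars.
    flash_attn = get_kernel("kernels-community/flash-attn")

    q = k = v = torch.randn(1, 512, 24, 128, dtype=torch.bfloat16, device="cuda")
    out = flash_attn.flash_attn_func(q, k, v)  # assumed entry point; see the repo docs
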
flash_attn/impls/mem_efficient_attention.html
CHANGED
|
@@ -706,6 +706,29 @@
|
|
| 706 |
white-space: pre-wrap;
|
| 707 |
color: var(--text-primary);
|
| 708 |
}
|
|
|
|
|
|
|
| 709 |
.cell-stderr {
|
| 710 |
background: var(--bg-error);
|
| 711 |
border-left: 2px solid var(--border-error);
|
|
@@ -3556,7 +3579,12 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3556 |
if(output){
|
| 3557 |
output.classList.remove('output-stale');
|
| 3558 |
let html='';
|
| 3559 |
-
if(data.stdout)
|
|
|
|
|
| 3560 |
console.log('UV Logs:', data);
|
| 3561 |
if(data.stderr) {
|
| 3562 |
// Split UV logs from regular stderr
|
|
@@ -3678,27 +3706,27 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3678 |
}
|
| 3679 |
}
|
| 3680 |
|
| 3681 |
-
// Live reload functionality (robust SSE handling)
|
| 3682 |
-
(function(){
|
| 3683 |
-
|
| 3684 |
-
|
| 3685 |
-
|
| 3686 |
-
|
| 3687 |
-
|
| 3688 |
-
|
| 3689 |
-
|
| 3690 |
-
|
| 3691 |
-
|
| 3692 |
-
|
| 3693 |
-
|
| 3694 |
-
|
| 3695 |
-
|
| 3696 |
-
|
| 3697 |
-
|
| 3698 |
-
|
| 3699 |
-
|
| 3700 |
-
|
| 3701 |
-
})();
|
| 3702 |
|
| 3703 |
|
| 3704 |
document.addEventListener('DOMContentLoaded', function() {
|
|
@@ -3829,21 +3857,21 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3829 |
<div class="system-info">
|
| 3830 |
<div class="system-info-header">Generated on:</div>
|
| 3831 |
<div class="system-info-content">
|
| 3832 |
-
Linux x86_64 | Linux-5.10.244-240.
|
| 3833 |
</div>
|
| 3834 |
</div>
|
| 3835 |
|
| 3836 |
<div class="main-content">
|
| 3837 |
<h1>Memory Efficient Attention Implementation</h1>
|
| 3838 |
<h2>Memory Efficient SDPA Benchmark</h2>
|
| 3839 |
-
<div class="cell
|
| 3840 |
<div class="cell-header">
|
| 3841 |
<span class="collapse-indicators">
|
| 3842 |
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
|
| 3843 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3844 |
<span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3845 |
</span> |
|
| 3846 |
-
Cell: benchmark |
|
| 3847 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3848 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3849 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3855,12 +3883,12 @@ Cell: benchmark | 0.01s | FAILED
|
|
| 3855 |
<span class="c1"># requires-python = ">=3.10"</span>
|
| 3856 |
<span class="c1"># dependencies = [</span>
|
| 3857 |
<span class="c1"># "numpy",</span>
|
| 3858 |
-
<span class="c1"># "torch",</span>
|
| 3859 |
<span class="c1"># "kernels-benchmark-tools",</span>
|
| 3860 |
<span class="c1"># ]</span>
|
| 3861 |
<span class="c1">#</span>
|
| 3862 |
<span class="c1"># [tool.uv.sources]</span>
|
| 3863 |
-
<span class="c1"># kernels-benchmark-tools = { path = "
|
| 3864 |
<span class="c1"># ///</span>
|
| 3865 |
<span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
|
| 3866 |
<span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
|
|
@@ -3926,9 +3954,203 @@ Cell: benchmark | 0.01s | FAILED
|
|
| 3926 |
</div>
|
| 3927 |
</div>
|
| 3928 |
<div id="output-benchmark" class="cell-output">
|
| 3929 |
-
<div class="cell-
|
| 3930 |
-
|
| 3931 |
-
|
|
|
|
|
|
|
| 3932 |
</div>
|
| 3933 |
</div>
|
| 3934 |
</div>
|
|
|
|
| 706 |
white-space: pre-wrap;
|
| 707 |
color: var(--text-primary);
|
| 708 |
}
|
| 709 |
+
|
| 710 |
+
.cell-stdout {
|
| 711 |
+
background: var(--bg-tertiary);
|
| 712 |
+
padding: 0.75rem;
|
| 713 |
+
border-radius: 1px;
|
| 714 |
+
font-family: inherit;
|
| 715 |
+
font-size: 0.9rem;
|
| 716 |
+
color: var(--text-primary);
|
| 717 |
+
|
| 718 |
+
/* key bits */
|
| 719 |
+
overflow: auto; /* show scrollbars when needed */
|
| 720 |
+
max-width: 100%; /* respects whatever layout width you give it */
|
| 721 |
+
}
|
| 722 |
+
|
| 723 |
+
.cell-stdout .stdout-text {
|
| 724 |
+
margin: 0; /* reset pre default margin */
|
| 725 |
+
white-space: pre; /* keep line breaks, NO wrapping */
|
| 726 |
+
display: inline-block; /* shrink-to-content */
|
| 727 |
+
min-width: max-content; /* allow very long lines to define intrinsic width */
|
| 728 |
+
font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace;
|
| 729 |
+
tab-size: 2;
|
| 730 |
+
}
|
| 731 |
+
|
| 732 |
.cell-stderr {
|
| 733 |
background: var(--bg-error);
|
| 734 |
border-left: 2px solid var(--border-error);
|
|
|
|
| 3579 |
if(output){
|
| 3580 |
output.classList.remove('output-stale');
|
| 3581 |
let html='';
|
| 3582 |
+
if (data.stdout) {
|
| 3583 |
+
html += '<div class="cell-stdout"><pre class="stdout-text">'
|
| 3584 |
+
+ escapeHtml(data.stdout)
|
| 3585 |
+
+ '</pre></div>';
|
| 3586 |
+
}
|
| 3587 |
+
|
| 3588 |
console.log('UV Logs:', data);
|
| 3589 |
if(data.stderr) {
|
| 3590 |
// Split UV logs from regular stderr
|
|
|
|
| 3706 |
}
|
| 3707 |
}
|
| 3708 |
|
| 3709 |
+
// // Live reload functionality (robust SSE handling)
|
| 3710 |
+
// (function(){
|
| 3711 |
+
// if (!('EventSource' in window)) {
|
| 3712 |
+
// console.warn('SSE not supported in this browser');
|
| 3713 |
+
// return;
|
| 3714 |
+
// }
|
| 3715 |
+
// let source = new EventSource('/events');
|
| 3716 |
+
// let isOpen = false;
|
| 3717 |
+
// source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
|
| 3718 |
+
// source.onmessage = function(e){
|
| 3719 |
+
// const msg=(e.data||'').trim(); if(!msg) return;
|
| 3720 |
+
// console.log('SSE message:', msg);
|
| 3721 |
+
// if (msg==='reload' || msg==='incremental') { location.reload(); }
|
| 3722 |
+
// // Ignore 'loading' to avoid premature reload loops
|
| 3723 |
+
// };
|
| 3724 |
+
// source.onerror = function(e){
|
| 3725 |
+
// // Let EventSource auto-reconnect instead of forcing a reload
|
| 3726 |
+
// if (isOpen) console.warn('SSE error after open, retrying...', e);
|
| 3727 |
+
// };
|
| 3728 |
+
// window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
|
| 3729 |
+
// })();
|
| 3730 |
|
| 3731 |
|
| 3732 |
document.addEventListener('DOMContentLoaded', function() {
|
|
|
|
| 3857 |
<div class="system-info">
|
| 3858 |
<div class="system-info-header">Generated on:</div>
|
| 3859 |
<div class="system-info-content">
|
| 3860 |
+
Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
|
| 3861 |
</div>
|
| 3862 |
</div>
|
| 3863 |
|
| 3864 |
<div class="main-content">
|
| 3865 |
<h1>Memory Efficient Attention Implementation</h1>
|
| 3866 |
<h2>Memory Efficient SDPA Benchmark</h2>
|
| 3867 |
+
<div class="cell" id="cell-benchmark">
|
| 3868 |
<div class="cell-header">
|
| 3869 |
<span class="collapse-indicators">
|
| 3870 |
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
|
| 3871 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3872 |
<span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3873 |
</span> |
|
| 3874 |
+
Cell: benchmark | 3.60s
|
| 3875 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3876 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3877 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3883 |
<span class="c1"># requires-python = ">=3.10"</span>
|
| 3884 |
<span class="c1"># dependencies = [</span>
|
| 3885 |
<span class="c1"># "numpy",</span>
|
| 3886 |
+
<span class="c1"># "torch==2.8.0",</span>
|
| 3887 |
<span class="c1"># "kernels-benchmark-tools",</span>
|
| 3888 |
<span class="c1"># ]</span>
|
| 3889 |
<span class="c1">#</span>
|
| 3890 |
<span class="c1"># [tool.uv.sources]</span>
|
| 3891 |
+
<span class="c1"># kernels-benchmark-tools = { path = "../../../../../tools", editable = true }</span>
|
| 3892 |
<span class="c1"># ///</span>
|
| 3893 |
<span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
|
| 3894 |
<span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
|
|
|
|
| 3954 |
</div>
|
| 3955 |
</div>
|
| 3956 |
<div id="output-benchmark" class="cell-output">
|
| 3957 |
+
<div class="cell-stdout"><pre class="stdout-text">
|
| 3958 |
+
======================================================================
|
| 3959 |
+
PROFILE TRACE: torch_mem_eff | flux_L128
|
| 3960 |
+
======================================================================
|
| 3961 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3962 |
+
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3963 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3964 |
+
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 743.839us 143.68% 743.839us 743.839us 1
|
| 3965 |
+
torch_mem_eff 14.97% 353.534us 98.94% 2.336ms 2.336ms 0.000us 0.00% 525.535us 525.535us 1
|
| 3966 |
+
aten::scaled_dot_product_attention 1.34% 31.582us 7.53% 177.879us 59.293us 0.000us 0.00% 451.039us 150.346us 3
|
| 3967 |
+
aten::_scaled_dot_product_efficient_attention 0.99% 23.447us 6.20% 146.297us 48.766us 0.000us 0.00% 451.039us 150.346us 3
|
| 3968 |
+
aten::_efficient_attention_forward 1.49% 35.270us 4.27% 100.806us 33.602us 451.039us 87.12% 451.039us 150.346us 3
|
| 3969 |
+
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 451.039us 87.12% 451.039us 150.346us 3
|
| 3970 |
+
aten::contiguous 0.56% 13.241us 73.52% 1.736ms 192.899us 0.000us 0.00% 74.496us 8.277us 9
|
| 3971 |
+
aten::clone 1.47% 34.702us 72.96% 1.723ms 191.428us 0.000us 0.00% 74.496us 8.277us 9
|
| 3972 |
+
aten::copy_ 3.23% 76.247us 68.33% 1.614ms 179.290us 66.656us 12.88% 74.496us 8.277us 9
|
| 3973 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 66.656us 12.88% 66.656us 7.406us 9
|
| 3974 |
+
Activity Buffer Request 61.73% 1.458ms 61.73% 1.458ms 1.458ms 7.840us 1.51% 7.840us 7.840us 1
|
| 3975 |
+
aten::transpose 2.92% 68.989us 3.85% 90.910us 3.788us 0.000us 0.00% 0.000us 0.000us 24
|
| 3976 |
+
aten::as_strided 0.93% 21.921us 0.93% 21.921us 0.913us 0.000us 0.00% 0.000us 0.000us 24
|
| 3977 |
+
aten::empty_like 0.77% 18.239us 3.16% 74.542us 8.282us 0.000us 0.00% 0.000us 0.000us 9
|
| 3978 |
+
aten::empty 3.59% 84.706us 3.59% 84.706us 4.034us 0.000us 0.00% 0.000us 0.000us 21
|
| 3979 |
+
cudaLaunchKernel 4.35% 102.715us 4.35% 102.715us 8.560us 0.000us 0.00% 0.000us 0.000us 12
|
| 3980 |
+
cudaStreamIsCapturing 0.16% 3.710us 0.16% 3.710us 1.237us 0.000us 0.00% 0.000us 0.000us 3
|
| 3981 |
+
cudaFuncSetAttribute 0.44% 10.440us 0.44% 10.440us 3.480us 0.000us 0.00% 0.000us 0.000us 3
|
| 3982 |
+
cudaDeviceSynchronize 1.06% 24.961us 1.06% 24.961us 24.961us 0.000us 0.00% 0.000us 0.000us 1
|
| 3983 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3984 |
+
Self CPU time total: 2.361ms
|
| 3985 |
+
Self CUDA time total: 517.695us
|
| 3986 |
+
|
| 3987 |
+
|
| 3988 |
+
|
| 3989 |
+
======================================================================
|
| 3990 |
+
PROFILE TRACE: torch_mem_eff | flux_L256
|
| 3991 |
+
======================================================================
|
| 3992 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3993 |
+
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3994 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3995 |
+
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 704.155us 121.71% 704.155us 704.155us 1
|
| 3996 |
+
torch_mem_eff 11.29% 250.325us 93.54% 2.073ms 2.073ms 0.000us 0.00% 586.972us 586.972us 1
|
| 3997 |
+
aten::scaled_dot_product_attention 0.83% 18.299us 6.32% 139.996us 46.665us 0.000us 0.00% 507.229us 169.076us 3
|
| 3998 |
+
aten::_scaled_dot_product_efficient_attention 0.91% 20.123us 5.49% 121.697us 40.566us 0.000us 0.00% 507.229us 169.076us 3
|
| 3999 |
+
aten::_efficient_attention_forward 1.32% 29.201us 3.61% 80.034us 26.678us 507.229us 87.67% 507.229us 169.076us 3
|
| 4000 |
+
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 507.229us 87.67% 507.229us 169.076us 3
|
| 4001 |
+
aten::contiguous 0.32% 7.068us 74.05% 1.641ms 182.386us 0.000us 0.00% 79.743us 8.860us 9
|
| 4002 |
+
aten::clone 1.01% 22.352us 73.73% 1.634ms 181.601us 0.000us 0.00% 79.743us 8.860us 9
|
| 4003 |
+
aten::copy_ 2.89% 63.964us 70.44% 1.562ms 173.503us 71.327us 12.33% 79.743us 8.860us 9
|
| 4004 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 71.327us 12.33% 71.327us 7.925us 9
|
| 4005 |
+
Activity Buffer Request 64.67% 1.433ms 64.67% 1.433ms 1.433ms 8.416us 1.45% 8.416us 8.416us 1
|
| 4006 |
+
aten::transpose 2.15% 47.759us 2.85% 63.231us 2.635us 0.000us 0.00% 0.000us 0.000us 24
|
| 4007 |
+
aten::as_strided 0.70% 15.472us 0.70% 15.472us 0.645us 0.000us 0.00% 0.000us 0.000us 24
|
| 4008 |
+
aten::empty_like 0.52% 11.480us 2.28% 50.532us 5.615us 0.000us 0.00% 0.000us 0.000us 9
|
| 4009 |
+
aten::empty 2.90% 64.203us 2.90% 64.203us 3.057us 0.000us 0.00% 0.000us 0.000us 21
|
| 4010 |
+
cudaLaunchKernel 3.80% 84.195us 3.80% 84.195us 7.016us 0.000us 0.00% 0.000us 0.000us 12
|
| 4011 |
+
cudaStreamIsCapturing 0.10% 2.170us 0.10% 2.170us 0.723us 0.000us 0.00% 0.000us 0.000us 3
|
| 4012 |
+
cudaFuncSetAttribute 0.15% 3.380us 0.15% 3.380us 1.127us 0.000us 0.00% 0.000us 0.000us 3
|
| 4013 |
+
cudaDeviceSynchronize 6.46% 143.197us 6.46% 143.197us 143.197us 0.000us 0.00% 0.000us 0.000us 1
|
| 4014 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4015 |
+
Self CPU time total: 2.217ms
|
| 4016 |
+
Self CUDA time total: 578.556us
|
| 4017 |
+
|
| 4018 |
+
|
| 4019 |
+
|
| 4020 |
+
======================================================================
|
| 4021 |
+
PROFILE TRACE: torch_mem_eff | flux_L320
|
| 4022 |
+
======================================================================
|
| 4023 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4024 |
+
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4025 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4026 |
+
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 741.345us 118.22% 741.345us 741.345us 1
|
| 4027 |
+
torch_mem_eff 10.83% 244.352us 91.98% 2.075ms 2.075ms 0.000us 0.00% 636.768us 636.768us 1
|
| 4028 |
+
aten::scaled_dot_product_attention 0.80% 18.001us 6.18% 139.437us 46.479us 0.000us 0.00% 543.969us 181.323us 3
|
| 4029 |
+
aten::_scaled_dot_product_efficient_attention 0.80% 18.160us 5.38% 121.436us 40.479us 0.000us 0.00% 543.969us 181.323us 3
|
| 4030 |
+
aten::_efficient_attention_forward 1.26% 28.484us 3.53% 79.573us 26.524us 543.969us 86.74% 543.969us 181.323us 3
|
| 4031 |
+
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 543.969us 86.74% 543.969us 181.323us 3
|
| 4032 |
+
aten::contiguous 0.34% 7.591us 72.87% 1.644ms 182.689us 0.000us 0.00% 92.799us 10.311us 9
|
| 4033 |
+
aten::clone 1.02% 22.973us 72.53% 1.637ms 181.846us 0.000us 0.00% 92.799us 10.311us 9
|
| 4034 |
+
aten::copy_ 2.84% 64.004us 69.28% 1.563ms 173.686us 83.135us 13.26% 92.799us 10.311us 9
|
| 4035 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 83.135us 13.26% 83.135us 9.237us 9
|
| 4036 |
+
Activity Buffer Request 63.58% 1.435ms 63.58% 1.435ms 1.435ms 9.664us 1.54% 9.664us 9.664us 1
|
| 4037 |
+
aten::transpose 2.42% 54.684us 3.15% 71.104us 2.963us 0.000us 0.00% 0.000us 0.000us 24
|
| 4038 |
+
aten::as_strided 0.73% 16.420us 0.73% 16.420us 0.684us 0.000us 0.00% 0.000us 0.000us 24
|
| 4039 |
+
aten::empty_like 0.53% 12.038us 2.24% 50.461us 5.607us 0.000us 0.00% 0.000us 0.000us 9
|
| 4040 |
+
aten::empty 2.78% 62.772us 2.78% 62.772us 2.989us 0.000us 0.00% 0.000us 0.000us 21
|
| 4041 |
+
cudaLaunchKernel 3.80% 85.752us 3.80% 85.752us 7.146us 0.000us 0.00% 0.000us 0.000us 12
|
| 4042 |
+
cudaStreamIsCapturing 0.10% 2.260us 0.10% 2.260us 0.753us 0.000us 0.00% 0.000us 0.000us 3
|
| 4043 |
+
cudaFuncSetAttribute 0.15% 3.330us 0.15% 3.330us 1.110us 0.000us 0.00% 0.000us 0.000us 3
|
| 4044 |
+
cudaDeviceSynchronize 8.02% 181.009us 8.02% 181.009us 181.009us 0.000us 0.00% 0.000us 0.000us 1
|
| 4045 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4046 |
+
Self CPU time total: 2.256ms
|
| 4047 |
+
Self CUDA time total: 627.104us
|
| 4048 |
+
|
| 4049 |
+
|
| 4050 |
+
|
| 4051 |
+
======================================================================
|
| 4052 |
+
PROFILE TRACE: torch_mem_eff | flux_L384
|
| 4053 |
+
======================================================================
|
| 4054 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4055 |
+
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4056 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4057 |
+
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 762.814us 117.08% 762.814us 762.814us 1
|
| 4058 |
+
torch_mem_eff 10.94% 270.925us 93.63% 2.319ms 2.319ms 0.000us 0.00% 663.068us 663.068us 1
|
| 4059 |
+
aten::scaled_dot_product_attention 0.75% 18.610us 6.03% 149.368us 49.789us 0.000us 0.00% 560.285us 186.762us 3
|
| 4060 |
+
aten::_scaled_dot_product_efficient_attention 0.84% 20.750us 5.28% 130.758us 43.586us 0.000us 0.00% 560.285us 186.762us 3
|
| 4061 |
+
aten::_efficient_attention_forward 1.24% 30.680us 3.47% 85.933us 28.644us 560.285us 85.99% 560.285us 186.762us 3
|
| 4062 |
+
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 560.285us 85.99% 560.285us 186.762us 3
|
| 4063 |
+
aten::contiguous 0.34% 8.310us 74.76% 1.851ms 205.718us 0.000us 0.00% 102.783us 11.420us 9
|
| 4064 |
+
aten::clone 0.93% 23.120us 74.43% 1.843ms 204.794us 0.000us 0.00% 102.783us 11.420us 9
|
| 4065 |
+
aten::copy_ 2.76% 68.243us 71.46% 1.770ms 196.615us 91.263us 14.01% 102.783us 11.420us 9
|
| 4066 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 91.263us 14.01% 91.263us 10.140us 9
|
| 4067 |
+
Activity Buffer Request 57.69% 1.429ms 57.69% 1.429ms 1.429ms 11.520us 1.77% 11.520us 11.520us 1
|
| 4068 |
+
aten::transpose 2.18% 53.884us 2.86% 70.837us 2.952us 0.000us 0.00% 0.000us 0.000us 24
|
| 4069 |
+
aten::as_strided 0.68% 16.953us 0.68% 16.953us 0.706us 0.000us 0.00% 0.000us 0.000us 24
|
| 4070 |
+
aten::empty_like 0.46% 11.381us 2.04% 50.492us 5.610us 0.000us 0.00% 0.000us 0.000us 9
|
| 4071 |
+
aten::empty 2.62% 64.842us 2.62% 64.842us 3.088us 0.000us 0.00% 0.000us 0.000us 21
|
| 4072 |
+
cudaLaunchKernel 11.97% 296.414us 11.97% 296.414us 24.701us 0.000us 0.00% 0.000us 0.000us 12
|
| 4073 |
+
cudaStreamIsCapturing 0.10% 2.540us 0.10% 2.540us 0.847us 0.000us 0.00% 0.000us 0.000us 3
|
| 4074 |
+
cudaFuncSetAttribute 0.13% 3.261us 0.13% 3.261us 1.087us 0.000us 0.00% 0.000us 0.000us 3
|
| 4075 |
+
cudaDeviceSynchronize 6.37% 157.857us 6.37% 157.857us 157.857us 0.000us 0.00% 0.000us 0.000us 1
|
| 4076 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4077 |
+
Self CPU time total: 2.476ms
|
| 4078 |
+
Self CUDA time total: 651.548us
|
| 4079 |
+
|
| 4080 |
+
|
| 4081 |
+
|
| 4082 |
+
======================================================================
|
| 4083 |
+
PROFILE TRACE: torch_mem_eff | flux_L448
|
| 4084 |
+
======================================================================
|
| 4085 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4086 |
+
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4087 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4088 |
+
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 811.582us 115.69% 811.582us 811.582us 1
|
| 4089 |
+
torch_mem_eff 10.28% 258.922us 90.15% 2.271ms 2.271ms 0.000us 0.00% 712.095us 712.095us 1
|
| 4090 |
+
aten::scaled_dot_product_attention 0.74% 18.760us 5.47% 137.886us 45.962us 0.000us 0.00% 611.487us 203.829us 3
|
| 4091 |
+
aten::_scaled_dot_product_efficient_attention 0.72% 18.189us 4.73% 119.126us 39.709us 0.000us 0.00% 611.487us 203.829us 3
|
| 4092 |
+
aten::_efficient_attention_forward 1.11% 28.033us 3.12% 78.704us 26.235us 611.487us 87.16% 611.487us 203.829us 3
|
| 4093 |
+
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 611.487us 87.16% 611.487us 203.829us 3
|
| 4094 |
+
aten::contiguous 0.29% 7.191us 72.68% 1.831ms 203.401us 0.000us 0.00% 100.608us 11.179us 9
|
| 4095 |
+
aten::clone 0.89% 22.393us 72.40% 1.823ms 202.602us 0.000us 0.00% 100.608us 11.179us 9
|
| 4096 |
+
aten::copy_ 2.57% 64.604us 69.47% 1.750ms 194.423us 90.048us 12.84% 100.608us 11.179us 9
|
| 4097 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 90.048us 12.84% 90.048us 10.005us 9
|
| 4098 |
+
Activity Buffer Request 58.13% 1.464ms 58.13% 1.464ms 1.464ms 10.560us 1.51% 10.560us 10.560us 1
|
| 4099 |
+
aten::transpose 1.95% 49.033us 2.60% 65.375us 2.724us 0.000us 0.00% 0.000us 0.000us 24
|
| 4100 |
+
aten::as_strided 0.65% 16.342us 0.65% 16.342us 0.681us 0.000us 0.00% 0.000us 0.000us 24
|
| 4101 |
+
aten::empty_like 0.51% 12.912us 2.03% 51.223us 5.691us 0.000us 0.00% 0.000us 0.000us 9
|
| 4102 |
+
aten::empty 2.50% 62.890us 2.50% 62.890us 2.995us 0.000us 0.00% 0.000us 0.000us 21
|
| 4103 |
+
cudaLaunchKernel 9.59% 241.441us 9.59% 241.441us 20.120us 0.000us 0.00% 0.000us 0.000us 12
|
| 4104 |
+
cudaStreamIsCapturing 0.09% 2.220us 0.09% 2.220us 0.740us 0.000us 0.00% 0.000us 0.000us 3
|
| 4105 |
+
cudaFuncSetAttribute 0.14% 3.650us 0.14% 3.650us 1.217us 0.000us 0.00% 0.000us 0.000us 3
|
| 4106 |
+
cudaDeviceSynchronize 9.85% 248.062us 9.85% 248.062us 248.062us 0.000us 0.00% 0.000us 0.000us 1
|
| 4107 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4108 |
+
Self CPU time total: 2.519ms
|
| 4109 |
+
Self CUDA time total: 701.535us
|
| 4110 |
+
|
| 4111 |
+
|
| 4112 |
+
|
| 4113 |
+
======================================================================
|
| 4114 |
+
PROFILE TRACE: torch_mem_eff | flux_L512
|
| 4115 |
+
======================================================================
|
| 4116 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4117 |
+
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4118 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4119 |
+
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 955.976us 112.33% 955.976us 955.976us 1
|
| 4120 |
+
torch_mem_eff 9.37% 248.255us 85.34% 2.262ms 2.262ms 0.000us 0.00% 865.703us 865.703us 1
|
| 4121 |
+
aten::scaled_dot_product_attention 0.68% 17.990us 5.29% 140.316us 46.772us 0.000us 0.00% 738.854us 246.285us 3
|
| 4122 |
+
aten::_scaled_dot_product_efficient_attention 0.72% 19.111us 4.61% 122.326us 40.775us 0.000us 0.00% 738.854us 246.285us 3
|
| 4123 |
+
aten::_efficient_attention_forward 1.10% 29.141us 2.98% 78.926us 26.309us 738.854us 86.81% 738.854us 246.285us 3
|
| 4124 |
+
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 738.854us 86.81% 738.854us 246.285us 3
|
| 4125 |
+
aten::contiguous 0.28% 7.521us 68.87% 1.825ms 202.832us 0.000us 0.00% 126.849us 14.094us 9
|
| 4126 |
+
aten::clone 0.86% 22.848us 68.58% 1.818ms 201.996us 0.000us 0.00% 126.849us 14.094us 9
|
| 4127 |
+
aten::copy_ 2.53% 66.983us 65.79% 1.744ms 193.757us 112.225us 13.19% 126.849us 14.094us 9
|
| 4128 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 112.225us 13.19% 112.225us 12.469us 9
|
| 4129 |
+
Activity Buffer Request 55.19% 1.463ms 55.19% 1.463ms 1.463ms 14.624us 1.72% 14.624us 14.624us 1
|
| 4130 |
+
aten::transpose 2.08% 55.231us 2.73% 72.342us 3.014us 0.000us 0.00% 0.000us 0.000us 24
|
| 4131 |
+
aten::as_strided 0.65% 17.111us 0.65% 17.111us 0.713us 0.000us 0.00% 0.000us 0.000us 24
|
| 4132 |
+
aten::empty_like 0.44% 11.730us 1.94% 51.302us 5.700us 0.000us 0.00% 0.000us 0.000us 9
|
| 4133 |
+
aten::empty 2.40% 63.653us 2.40% 63.653us 3.031us 0.000us 0.00% 0.000us 0.000us 21
|
| 4134 |
+
cudaLaunchKernel 8.85% 234.503us 8.85% 234.503us 19.542us 0.000us 0.00% 0.000us 0.000us 12
|
| 4135 |
+
cudaStreamIsCapturing 0.08% 2.150us 0.08% 2.150us 0.717us 0.000us 0.00% 0.000us 0.000us 3
|
| 4136 |
+
cudaFuncSetAttribute 0.11% 2.981us 0.11% 2.981us 0.994us 0.000us 0.00% 0.000us 0.000us 3
|
| 4137 |
+
cudaDeviceSynchronize 14.66% 388.669us 14.66% 388.669us 388.669us 0.000us 0.00% 0.000us 0.000us 1
|
| 4138 |
+
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4139 |
+
Self CPU time total: 2.651ms
|
| 4140 |
+
Self CUDA time total: 851.079us
|
| 4141 |
+
|
| 4142 |
+
|
| 4143 |
+
impl wl p50(ms) ok
|
| 4144 |
+
torch_mem_eff flux_L128 0.23 True
|
| 4145 |
+
torch_mem_eff flux_L256 0.26 True
|
| 4146 |
+
torch_mem_eff flux_L320 0.28 True
|
| 4147 |
+
torch_mem_eff flux_L384 0.28 True
|
| 4148 |
+
torch_mem_eff flux_L448 0.30 True
|
| 4149 |
+
torch_mem_eff flux_L512 0.34 True
|
| 4150 |
+
</pre></div>
|
| 4151 |
+
<div class="cell-artifacts">
|
| 4152 |
+
<h4>Artifacts:</h4>
|
| 4153 |
+
<a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>
|
| 4154 |
</div>
|
| 4155 |
</div>
|
| 4156 |
</div>
|
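The torch_mem_eff traces above come from the PyTorch profiler: the two torch_mem_eff header rows are the same record_function label seen from the CUDA and CPU sides, and the large "Activity Buffer Request" entries are, by all appearances, profiler bookkeeping rather than workload time, which is also why a top row's Self CUDA % can exceed 100%. A minimal sketch of how a table in this shape can be produced; this is an illustration, not necessarily the exact kernels-benchmark-tools harness, and the tensor shapes are placeholders rather than the flux_* workloads:

    # Minimal sketch: produce a profiler table shaped like the ones above.
    # Assumptions: a CUDA device is available; shapes are illustrative placeholders.
    import torch
    from torch.nn.attention import SDPBackend, sdpa_kernel
    from torch.profiler import ProfilerActivity, profile, record_function

    q = torch.randn(1, 24, 512, 128, device="cuda", dtype=torch.bfloat16)
    k, v = torch.randn_like(q), torch.randn_like(q)

    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
        with record_function("torch_mem_eff"):  # shows up as the named rows above
            with sdpa_kernel(SDPBackend.EFFICIENT_ATTENTION):  # pin the mem-efficient path
                for _ in range(3):  # "# of Calls" is 3 in the tables above
                    torch.nn.functional.scaled_dot_product_attention(q, k, v)
        torch.cuda.synchronize()

    print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=20))

Pinning SDPBackend.EFFICIENT_ATTENTION keeps the comparison honest; without it, scaled_dot_product_attention may dispatch to the flash or math backends depending on shape and dtype.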
flash_attn/impls/sage_attention.html
CHANGED

@@ -706,6 +706,29 @@
  white-space: pre-wrap;
  color: var(--text-primary);
}
+
+ .cell-stdout {
+   background: var(--bg-tertiary);
+   padding: 0.75rem;
+   border-radius: 1px;
+   font-family: inherit;
+   font-size: 0.9rem;
+   color: var(--text-primary);
+
+   /* key bits */
+   overflow: auto;   /* show scrollbars when needed */
+   max-width: 100%;  /* respects whatever layout width you give it */
+ }
+
+ .cell-stdout .stdout-text {
+   margin: 0;              /* reset pre default margin */
+   white-space: pre;       /* keep line breaks, NO wrapping */
+   display: inline-block;  /* shrink-to-content */
+   min-width: max-content; /* allow very long lines to define intrinsic width */
+   font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace;
+   tab-size: 2;
+ }
+
.cell-stderr {
  background: var(--bg-error);
  border-left: 2px solid var(--border-error);

@@ -3556,7 +3579,12 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
if(output){
  output.classList.remove('output-stale');
  let html='';
- if(data.stdout)
+ if (data.stdout) {
+   html += '<div class="cell-stdout"><pre class="stdout-text">'
+     + escapeHtml(data.stdout)
+     + '</pre></div>';
+ }
+
console.log('UV Logs:', data);
if(data.stderr) {
  // Split UV logs from regular stderr

@@ -3678,27 +3706,27 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
}
}

- // Live reload functionality (robust SSE handling)
- (function(){
- …
- })();
+ // // Live reload functionality (robust SSE handling)
+ // (function(){
+ //   if (!('EventSource' in window)) {
+ //     console.warn('SSE not supported in this browser');
+ //     return;
+ //   }
+ //   let source = new EventSource('/events');
+ //   let isOpen = false;
+ //   source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
+ //   source.onmessage = function(e){
+ //     const msg=(e.data||'').trim(); if(!msg) return;
+ //     console.log('SSE message:', msg);
+ //     if (msg==='reload' || msg==='incremental') { location.reload(); }
+ //     // Ignore 'loading' to avoid premature reload loops
+ //   };
+ //   source.onerror = function(e){
+ //     // Let EventSource auto-reconnect instead of forcing a reload
+ //     if (isOpen) console.warn('SSE error after open, retrying...', e);
+ //   };
+ //   window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
+ // })();


document.addEventListener('DOMContentLoaded', function() {

@@ -3829,21 +3857,21 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
<div class="system-info">
  <div class="system-info-header">Generated on:</div>
  <div class="system-info-content">
- Linux x86_64 | Linux-5.10.244-240.
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
  </div>
</div>

<div class="main-content">
<h1>SageAttention Implementation</h1>
<h2>SageAttention Benchmark (INT8 Quantized)</h2>
- <div class="cell
+ <div class="cell" id="cell-benchmark">
<div class="cell-header">
<span class="collapse-indicators">
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
- <span id="uv-indicator-benchmark" style="cursor:
+ <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
</span> |
- Cell: benchmark |
+ Cell: benchmark | 34.80s
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>

@@ -3855,14 +3883,14 @@ Cell: benchmark | 0.05s | FAILED
<span class="c1"># requires-python = ">=3.10"</span>
<span class="c1"># dependencies = [</span>
<span class="c1"># "numpy",</span>
- <span class="c1"># "torch",</span>
+ <span class="c1"># "torch==2.8.0",</span>
<span class="c1"># "kernels",</span>
<span class="c1"># "kernels-benchmark-tools",</span>
<span class="c1"># "sageattention",</span>
<span class="c1"># ]</span>
<span class="c1">#</span>
<span class="c1"># [tool.uv.sources]</span>
- <span class="c1"># kernels-benchmark-tools = { path = "
+ <span class="c1"># kernels-benchmark-tools = { path = "../../../../../tools", editable = true }</span>
<span class="c1"># ///</span>
<span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
<span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>

@@ -3937,9 +3965,80 @@ Cell: benchmark | 0.05s | FAILED
</div>
</div>
<div id="output-benchmark" class="cell-output">
- <div class="cell-
- …
+ <div class="cell-stdout"><pre class="stdout-text">impl wl p50(ms) ok
+ sage_int8_fp16 flux_L128 FAIL False
+ Error: module 'sage_attention_c88aae76123df82b' has no attribute 'fwd'
+ sage_int8_fp16 flux_L256 FAIL False
+ Error: module 'sage_attention_c88aae76123df82b' has no attribute 'fwd'
+ sage_int8_fp16 flux_L320 FAIL False
+ Error: module 'sage_attention_c88aae76123df82b' has no attribute 'fwd'
+ sage_int8_fp16 flux_L384 FAIL False
+ Error: module 'sage_attention_c88aae76123df82b' has no attribute 'fwd'
+ sage_int8_fp16 flux_L448 FAIL False
+ Error: module 'sage_attention_c88aae76123df82b' has no attribute 'fwd'
+ sage_int8_fp16 flux_L512 FAIL False
+ Error: module 'sage_attention_c88aae76123df82b' has no attribute 'fwd'
+ </pre></div>
+ <div class="uv-install-logs" id="uv-logs-benchmark">
+ <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
+ <div class="uv-logs-content" style="display: none;">
+ Building kernels-benchmark-tools @ file:///__w/kernels-benchmarks/kernels-benchmarks/tools
+ Downloading nvidia-cusolver-cu12 (255.1MiB)
+ Downloading nvidia-nvjitlink-cu12 (37.4MiB)
+ Downloading sympy (6.0MiB)
+ Downloading nvidia-cufile-cu12 (1.1MiB)
+ Downloading nvidia-cublas-cu12 (566.8MiB)
+ Downloading nvidia-cudnn-cu12 (674.0MiB)
+ Downloading matplotlib (8.3MiB)
+ Downloading nvidia-cufft-cu12 (184.2MiB)
+ Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
+ Downloading networkx (1.9MiB)
+ Downloading nvidia-nccl-cu12 (307.4MiB)
+ Downloading setuptools (1.1MiB)
+ Downloading kiwisolver (1.4MiB)
+ Downloading nvidia-cusparse-cu12 (274.9MiB)
+ Downloading nvidia-curand-cu12 (60.7MiB)
+ Downloading numpy (16.2MiB)
+ Downloading nvidia-cusparselt-cu12 (273.9MiB)
+ Downloading torch (846.9MiB)
+ Downloading triton (148.3MiB)
+ Downloading fonttools (4.7MiB)
+ Downloading pillow (6.7MiB)
+ Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
+ Downloading hf-xet (3.0MiB)
+ Downloading nvidia-cufile-cu12
+ Downloading kiwisolver
+ Downloading hf-xet
+ Downloading setuptools
+ Downloading networkx
+ Downloading fonttools
+ Downloading pillow
+ Built kernels-benchmark-tools @ file:///__w/kernels-benchmarks/kernels-benchmarks/tools
+ Downloading nvidia-cuda-cupti-cu12
+ Downloading matplotlib
+ Downloading numpy
+ Downloading sympy
+ Downloading nvidia-nvjitlink-cu12
+ Downloading nvidia-curand-cu12
+ Downloading nvidia-cuda-nvrtc-cu12
+ Downloading triton
+ Downloading nvidia-cufft-cu12
+ Downloading nvidia-cusolver-cu12
+ Downloading nvidia-cusparse-cu12
+ Downloading nvidia-cusparselt-cu12
+ Downloading nvidia-nccl-cu12
+ Downloading nvidia-cublas-cu12
+ Downloading nvidia-cudnn-cu12
+ Downloading torch
+ Installed 48 packages in 236ms
+ </div>
+ </div>
+ <div class="cell-stderr">Fetching 11 files: 0%| | 0/11 [00:00&lt;?, ?it/s]
+ Fetching 11 files: 73%|███████▎ | 8/11 [00:00&lt;00:00, 9.16it/s]
+ Fetching 11 files: 100%|██████████| 11/11 [00:00&lt;00:00, 12.59it/s]</div>
+ <div class="cell-artifacts">
+ <h4>Artifacts:</h4>
+ <a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>
</div>
</div>
</div>
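All six sage_int8_fp16 workloads above fail identically: the dynamically loaded extension sage_attention_c88aae76123df82b exposes no fwd symbol, which suggests the published kernel binary and the Python wrapper calling it are out of step. A hypothetical guard of this shape (not part of the benchmark harness) would surface such a mismatch once, together with the module's actual exports, instead of once per workload:

    # Hypothetical guard (not in kernels-benchmark-tools): fail fast when a
    # loaded kernel module lacks the expected entry point, and report what it
    # does export so the ABI mismatch is easy to diagnose.
    def require_symbol(mod, name: str):
        fn = getattr(mod, name, None)
        if fn is None:
            exports = [n for n in dir(mod) if not n.startswith("_")]
            raise AttributeError(
                f"{getattr(mod, '__name__', mod)!r} has no attribute {name!r}; "
                f"available exports: {exports}"
            )
        return fn

    # usage sketch: fwd = require_symbol(sage_module, "fwd") before the sweep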
flash_attn/impls/xformers.html
CHANGED

@@ -706,6 +706,29 @@
  white-space: pre-wrap;
  color: var(--text-primary);
}
+
+ .cell-stdout {
+   background: var(--bg-tertiary);
+   padding: 0.75rem;
+   border-radius: 1px;
+   font-family: inherit;
+   font-size: 0.9rem;
+   color: var(--text-primary);
+
+   /* key bits */
+   overflow: auto;   /* show scrollbars when needed */
+   max-width: 100%;  /* respects whatever layout width you give it */
+ }
+
+ .cell-stdout .stdout-text {
+   margin: 0;              /* reset pre default margin */
+   white-space: pre;       /* keep line breaks, NO wrapping */
+   display: inline-block;  /* shrink-to-content */
+   min-width: max-content; /* allow very long lines to define intrinsic width */
+   font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace;
+   tab-size: 2;
+ }
+
.cell-stderr {
  background: var(--bg-error);
  border-left: 2px solid var(--border-error);

@@ -3556,7 +3579,12 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
if(output){
  output.classList.remove('output-stale');
  let html='';
- if(data.stdout)
+ if (data.stdout) {
+   html += '<div class="cell-stdout"><pre class="stdout-text">'
+     + escapeHtml(data.stdout)
+     + '</pre></div>';
+ }
+
console.log('UV Logs:', data);
if(data.stderr) {
  // Split UV logs from regular stderr

@@ -3678,27 +3706,27 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
}
}

- // Live reload functionality (robust SSE handling)
- (function(){
- …
- })();
+ // // Live reload functionality (robust SSE handling)
+ // (function(){
+ //   if (!('EventSource' in window)) {
+ //     console.warn('SSE not supported in this browser');
+ //     return;
+ //   }
+ //   let source = new EventSource('/events');
+ //   let isOpen = false;
+ //   source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
+ //   source.onmessage = function(e){
+ //     const msg=(e.data||'').trim(); if(!msg) return;
+ //     console.log('SSE message:', msg);
+ //     if (msg==='reload' || msg==='incremental') { location.reload(); }
+ //     // Ignore 'loading' to avoid premature reload loops
+ //   };
+ //   source.onerror = function(e){
+ //     // Let EventSource auto-reconnect instead of forcing a reload
+ //     if (isOpen) console.warn('SSE error after open, retrying...', e);
+ //   };
+ //   window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
+ // })();


document.addEventListener('DOMContentLoaded', function() {

@@ -3829,21 +3857,21 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
<div class="system-info">
  <div class="system-info-header">Generated on:</div>
  <div class="system-info-content">
- Linux x86_64 | Linux-5.10.244-240.
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
  </div>
</div>

<div class="main-content">
<h1>xFormers Memory Efficient Attention</h1>
<h2>xFormers Benchmark</h2>
- <div class="cell
+ <div class="cell" id="cell-benchmark">
<div class="cell-header">
<span class="collapse-indicators">
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
- <span id="uv-indicator-benchmark" style="cursor:
+ <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
</span> |
- Cell: benchmark |
+ Cell: benchmark | 4.83s
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>

@@ -3855,13 +3883,13 @@ Cell: benchmark | 0.01s | FAILED
<span class="c1"># requires-python = ">=3.10"</span>
<span class="c1"># dependencies = [</span>
<span class="c1"># "numpy",</span>
- <span class="c1"># "torch",</span>
+ <span class="c1"># "torch==2.8.0",</span>
<span class="c1"># "kernels-benchmark-tools",</span>
<span class="c1"># "xformers",</span>
<span class="c1"># ]</span>
<span class="c1">#</span>
<span class="c1"># [tool.uv.sources]</span>
- <span class="c1"># kernels-benchmark-tools = { path = "
+ <span class="c1"># kernels-benchmark-tools = { path = "../../../../../tools", editable = true }</span>
<span class="c1"># ///</span>
<span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
<span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>

@@ -3926,9 +3954,169 @@ Cell: benchmark | 0.01s | FAILED
</div>
</div>
<div id="output-benchmark" class="cell-output">
- <div class="cell-
- …
+ <div class="cell-stdout"><pre class="stdout-text">
+ ======================================================================
+ PROFILE TRACE: xformers_meff | flux_L128
+ ======================================================================
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 506.718us 193.09% 506.718us 506.718us 1
+ xformers_meff 20.33% 479.463us 99.70% 2.351ms 2.351ms 0.000us 0.00% 351.872us 351.872us 1
+ xformers_flash3::flash_fwd 8.78% 206.960us 77.92% 1.837ms 612.487us 0.000us 0.00% 351.872us 117.291us 3
+ flash_attn_3::fwd 3.33% 78.433us 69.14% 1.631ms 543.500us 262.432us 100.00% 351.872us 117.291us 3
+ void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 262.432us 100.00% 262.432us 87.477us 3
+ Activity Buffer Request 61.85% 1.459ms 61.85% 1.459ms 1.459ms 89.440us 34.08% 89.440us 89.440us 1
+ aten::empty 1.44% 34.032us 1.44% 34.032us 5.672us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.62% 14.682us 0.62% 14.682us 4.894us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 1.89% 44.672us 1.89% 44.672us 14.891us 0.000us 0.00% 0.000us 0.000us 3
+ aten::reshape 0.50% 11.821us 1.45% 34.232us 5.705us 0.000us 0.00% 0.000us 0.000us 6
+ aten::view 0.95% 22.411us 0.95% 22.411us 3.735us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.30% 7.110us 0.30% 7.110us 7.110us 0.000us 0.00% 0.000us 0.000us 1
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Self CPU time total: 2.358ms
+ Self CUDA time total: 262.432us
+
+
+
+ ======================================================================
+ PROFILE TRACE: xformers_meff | flux_L256
+ ======================================================================
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 457.756us 155.59% 457.756us 457.756us 1
+ xformers_meff 14.84% 310.507us 99.07% 2.072ms 2.072ms 0.000us 0.00% 391.132us 391.132us 1
+ xformers_flash3::flash_fwd 7.41% 154.907us 83.06% 1.737ms 579.115us 0.000us 0.00% 391.132us 130.377us 3
+ flash_attn_3::fwd 2.73% 57.112us 75.65% 1.582ms 527.479us 294.205us 100.00% 391.132us 130.377us 3
+ void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 294.205us 100.00% 294.205us 98.068us 3
+ Activity Buffer Request 69.53% 1.454ms 69.53% 1.454ms 1.454ms 96.927us 32.95% 96.927us 96.927us 1
+ aten::empty 1.38% 28.932us 1.38% 28.932us 4.822us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.38% 7.960us 0.38% 7.960us 2.653us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 1.63% 34.022us 1.63% 34.022us 11.341us 0.000us 0.00% 0.000us 0.000us 3
+ aten::reshape 0.48% 10.060us 1.17% 24.410us 4.068us 0.000us 0.00% 0.000us 0.000us 6
+ aten::view 0.69% 14.350us 0.69% 14.350us 2.392us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.93% 19.421us 0.93% 19.421us 19.421us 0.000us 0.00% 0.000us 0.000us 1
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Self CPU time total: 2.092ms
+ Self CUDA time total: 294.205us
+
+
+
+ ======================================================================
+ PROFILE TRACE: xformers_meff | flux_L320
+ ======================================================================
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 455.327us 140.30% 455.327us 455.327us 1
+ xformers_meff 14.70% 303.895us 98.43% 2.034ms 2.034ms 0.000us 0.00% 429.791us 429.791us 1
+ xformers_flash3::flash_fwd 7.05% 145.707us 82.60% 1.707ms 568.998us 0.000us 0.00% 429.791us 143.264us 3
+ flash_attn_3::fwd 2.62% 54.152us 75.55% 1.561ms 520.429us 324.543us 100.00% 429.791us 143.264us 3
+ void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 324.543us 100.00% 324.543us 108.181us 3
+ Activity Buffer Request 69.54% 1.437ms 69.54% 1.437ms 1.437ms 105.248us 32.43% 105.248us 105.248us 1
+ aten::empty 1.47% 30.342us 1.47% 30.342us 5.057us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.27% 5.580us 0.27% 5.580us 1.860us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 1.65% 34.132us 1.65% 34.132us 11.377us 0.000us 0.00% 0.000us 0.000us 3
+ aten::reshape 0.42% 8.741us 1.13% 23.401us 3.900us 0.000us 0.00% 0.000us 0.000us 6
+ aten::view 0.71% 14.660us 0.71% 14.660us 2.443us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 1.57% 32.391us 1.57% 32.391us 32.391us 0.000us 0.00% 0.000us 0.000us 1
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Self CPU time total: 2.067ms
+ Self CUDA time total: 324.543us
+
+
+
+ ======================================================================
+ PROFILE TRACE: xformers_meff | flux_L384
+ ======================================================================
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 460.189us 141.34% 460.189us 460.189us 1
+ xformers_meff 13.29% 304.067us 98.75% 2.259ms 2.259ms 0.000us 0.00% 433.468us 433.468us 1
+ xformers_flash3::flash_fwd 6.63% 151.806us 84.43% 1.932ms 643.925us 0.000us 0.00% 433.468us 144.489us 3
+ flash_attn_3::fwd 2.38% 54.492us 77.79% 1.780ms 593.323us 325.597us 100.00% 433.468us 144.489us 3
+ void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 325.597us 100.00% 325.597us 108.532us 3
+ Activity Buffer Request 63.32% 1.449ms 63.32% 1.449ms 1.449ms 107.871us 33.13% 107.871us 107.871us 1
+ aten::empty 1.26% 28.813us 1.26% 28.813us 4.802us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.27% 6.140us 0.27% 6.140us 2.047us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 10.56% 241.573us 10.56% 241.573us 80.524us 0.000us 0.00% 0.000us 0.000us 3
+ aten::reshape 0.41% 9.348us 1.03% 23.589us 3.931us 0.000us 0.00% 0.000us 0.000us 6
+ aten::view 0.62% 14.241us 0.62% 14.241us 2.374us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 1.25% 28.691us 1.25% 28.691us 28.691us 0.000us 0.00% 0.000us 0.000us 1
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Self CPU time total: 2.288ms
+ Self CUDA time total: 325.597us
+
+
+
+ ======================================================================
+ PROFILE TRACE: xformers_meff | flux_L448
+ ======================================================================
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ xformers_meff 14.32% 335.208us 96.41% 2.256ms 2.256ms 0.000us 0.00% 650.207us 650.207us 1
+ xformers_flash3::flash_fwd 6.57% 153.746us 81.05% 1.897ms 632.294us 0.000us 0.00% 650.207us 216.736us 3
+ flash_attn_3::fwd 2.39% 56.024us 74.48% 1.743ms 581.045us 487.359us 100.00% 650.207us 216.736us 3
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 545.022us 111.83% 545.022us 545.022us 1
+ void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 487.359us 100.00% 487.359us 162.453us 3
+ Activity Buffer Request 62.65% 1.466ms 62.65% 1.466ms 1.466ms 162.848us 33.41% 162.848us 162.848us 1
+ aten::empty 1.29% 30.110us 1.29% 30.110us 5.018us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.25% 5.800us 0.25% 5.800us 1.933us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 7.91% 185.030us 7.91% 185.030us 61.677us 0.000us 0.00% 0.000us 0.000us 3
+ aten::reshape 0.42% 9.770us 1.04% 24.390us 4.065us 0.000us 0.00% 0.000us 0.000us 6
+ aten::view 0.62% 14.620us 0.62% 14.620us 2.437us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 3.59% 83.934us 3.59% 83.934us 83.934us 0.000us 0.00% 0.000us 0.000us 1
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Self CPU time total: 2.340ms
+ Self CUDA time total: 487.359us
+
+
+
+ ======================================================================
+ PROFILE TRACE: xformers_meff | flux_L512
+ ======================================================================
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ xformers_meff 13.07% 298.846us 95.47% 2.183ms 2.183ms 0.000us 0.00% 676.610us 676.610us 1
+ xformers_flash3::flash_fwd 6.50% 148.626us 81.42% 1.862ms 620.693us 0.000us 0.00% 676.610us 225.537us 3
+ flash_attn_3::fwd 2.33% 53.191us 74.93% 1.713ms 571.151us 505.889us 100.00% 676.610us 225.537us 3
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 520.769us 102.94% 520.769us 520.769us 1
+ void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 505.889us 100.00% 505.889us 168.630us 3
+ Activity Buffer Request 63.62% 1.455ms 63.62% 1.455ms 1.455ms 170.721us 33.75% 170.721us 170.721us 1
+ aten::empty 1.23% 28.092us 1.23% 28.092us 4.682us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.25% 5.790us 0.25% 5.790us 1.930us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 7.50% 171.540us 7.50% 171.540us 57.180us 0.000us 0.00% 0.000us 0.000us 3
+ aten::reshape 0.38% 8.590us 0.98% 22.470us 3.745us 0.000us 0.00% 0.000us 0.000us 6
+ aten::view 0.61% 13.880us 0.61% 13.880us 2.313us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 4.53% 103.496us 4.53% 103.496us 103.496us 0.000us 0.00% 0.000us 0.000us 1
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Self CPU time total: 2.287ms
+ Self CUDA time total: 505.889us
+
+
+ impl wl p50(ms) ok
+ xformers_meff flux_L128 0.20 True
+ xformers_meff flux_L256 0.21 True
+ xformers_meff flux_L320 0.22 True
+ xformers_meff flux_L384 0.22 True
+ xformers_meff flux_L448 0.28 True
+ xformers_meff flux_L512 0.27 True
+ </pre></div>
+ <div class="uv-install-logs" id="uv-logs-benchmark">
+ <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
+ <div class="uv-logs-content" style="display: none;">
+ Downloading xformers (111.8MiB)
+ Downloading xformers
+ Installed 1 package in 14ms
+ </div>
+ </div>
+ <div class="cell-artifacts">
+ <h4>Artifacts:</h4>
+ <a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>
</div>
</div>
</div>
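Both attention benchmarks above also persist their raw measurements to artifacts/benchmark/attn.jsonl, so the p50(ms) summary tables can be rebuilt offline. A sketch, assuming each record carries impl, wl.name, ok, and a lat_ms block with a p50 field (an assumption about the schema, not something this diff states):

    # Sketch: rebuild the "impl  wl  p50(ms)  ok" summary from a benchmark JSONL.
    # Assumes records with impl, wl.name, ok, and lat_ms.p50 fields.
    import json

    with open("artifacts/benchmark/attn.jsonl") as f:
        for line in f:
            r = json.loads(line)
            p50 = f'{r["lat_ms"]["p50"]:.2f}' if r.get("ok") else "FAIL"
            print(f'{r["impl"]:<16} {r["wl"]["name"]:<12} {p50:>8} {r.get("ok")}')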
flash_attn/results/artifacts/combine/latency.svg
CHANGED
Git LFS Details
flash_attn/results/cells/combine.py
CHANGED

@@ -1,319 +1,69 @@
 # /// script
 # requires-python = ">=3.10"
-# dependencies = [
-# "numpy",
-# "torch",
-# "kernels-benchmark-tools",
-# "matplotlib",
-# ]
-#
 # [tool.uv.sources]
-# kernels-benchmark-tools = {
 # ///
-import os
-import sys
-import json
-import csv
-from pathlib import Path
-import matplotlib as mpl
-import matplotlib.pyplot as plt
-import kernels_benchmark_tools as kbt
-
-# Keep text as text (not paths) so CSS can style fonts, size, etc.
-mpl.rcParams["svg.fonttype"] = "none"
-# Make ids deterministic across builds
-mpl.rcParams["svg.hashsalt"] = "latency-benchmark-combined"
-# Avoid auto-closed figures interfering with our tagging
-mpl.rcParams["figure.autolayout"] = True
-# Make background transparent
-mpl.rcParams["figure.facecolor"] = "none"
-mpl.rcParams["axes.facecolor"] = "none"
-mpl.rcParams["savefig.facecolor"] = "none"
-mpl.rcParams["savefig.edgecolor"] = "none"
-
-def _slugify(s: str) -> str:
-    s = (s or "").strip().lower()
-    keep = []
-    for ch in s:
-        if ch.isalnum():
-            keep.append(ch)
-        elif ch in (" ", "-", "_", "/", ".", ":"):
-            keep.append("-")
-        else:
-            keep.append("")
-    out = "".join(keep)
-    while "--" in out:
-        out = out.replace("--", "-")
-    return out.strip("-") or "unnamed"
-
-def _tag_current_figure(default_series_prefix="series"):
-    """Attach SVG ids (gid) to key artists so they can be targeted from CSS."""
-    fig = plt.gcf()
-    if fig is None:
-        return
-
-    # Tag the figure itself
-    fig.set_gid("figure--latency")
-
-    for ax_idx, ax in enumerate(fig.get_axes(), start=1):
-        ax.set_gid(f"axes--{ax_idx}")
-
-        # Axis labels & title
-        if ax.get_title():
-            for t in ax.texts:
-                if t.get_text() == ax.get_title():
-                    t.set_gid("title--main")
-        if ax.xaxis and ax.xaxis.get_label():
-            ax.xaxis.label.set_gid("label--x")
-        if ax.yaxis and ax.yaxis.get_label():
-            ax.yaxis.label.set_gid("label--y")
-
-        # Gridlines
-        for i, gl in enumerate(ax.get_xgridlines(), start=1):
-            gl.set_gid(f"grid-x--{i}")
-        for i, gl in enumerate(ax.get_ygridlines(), start=1):
-            gl.set_gid(f"grid-y--{i}")
-
-        # Legend block & entries
-        leg = ax.get_legend()
-        if leg is not None:
-            leg.set_gid("legend")
-            for i, txt in enumerate(leg.get_texts(), start=1):
-                label_slug = _slugify(txt.get_text())
-                txt.set_gid(f"legend-label--{label_slug or i}")
-
-        # Series (lines, patches)
-        # Lines
-        line_seen = {}
-        for ln in getattr(ax, "lines", []):
-            raw_label = ln.get_label() or ""
-            # Matplotlib uses labels beginning with "_" for non-legendable items
-            label = raw_label if not raw_label.startswith("_") else f"{default_series_prefix}"
-            slug = _slugify(label)
-            line_seen[slug] = line_seen.get(slug, 0) + 1
-            suffix = "" if line_seen[slug] == 1 else f"-{line_seen[slug]}"
-            ln.set_gid(f"series--{slug}{suffix}")
-
-        # Patches (bars, areas)
-        patch_seen = {}
-        for pt in getattr(ax, "patches", []):
-            label = getattr(pt, "get_label", lambda: "")() or f"{default_series_prefix}"
-            if isinstance(label, str) and label.startswith("_"):
-                label = default_series_prefix
-            slug = _slugify(label)
-            patch_seen[slug] = patch_seen.get(slug, 0) + 1
-            suffix = "" if patch_seen[slug] == 1 else f"-{patch_seen[slug]}"
-            pt.set_gid(f"series--{slug}{suffix}")
-
-def _postprocess_svg_add_classes(svg_path: Path):
-    """Add convenient CSS classes alongside ids (e.g., class='series grid grid-x')."""
-    try:
-        import xml.etree.ElementTree as ET
-        ET.register_namespace("", "http://www.w3.org/2000/svg")
-        tree = ET.parse(svg_path)
-        root = tree.getroot()
-        for el in root.iter():
-            el_id = el.attrib.get("id", "")
-            if not el_id:
-                continue
-            cls = []
-            if el_id.startswith("figure--"):
-                cls.append("figure")
-            elif el_id.startswith("axes--"):
-                cls.append("axes")
-            elif el_id.startswith("grid-x--"):
-                cls += ["grid", "grid-x"]
-            elif el_id.startswith("grid-y--"):
-                cls += ["grid", "grid-y"]
-            elif el_id.startswith("legend"):
-                cls.append("legend")
-            elif el_id.startswith("label--x"):
-                cls.append("xlabel")
-            elif el_id.startswith("label--y"):
-                cls.append("ylabel")
-            elif el_id.startswith("title--"):
-                cls.append("title")
-            elif el_id.startswith("series--"):
-                cls.append("series")
-            if cls:
-                # Preserve any existing class (unlikely from Matplotlib)
-                existing = el.attrib.get("class", "")
-                el.set("class", (existing + " " + " ".join(cls)).strip())
-        tree.write(svg_path, encoding="utf-8", xml_declaration=True)
-    except Exception as e:
-        print(f"✗ SVG postprocess (classes) skipped: {e}")
-
-# Monkey-patch savefig to force SVG & ensure tagging occurs even if kbt.viz saves internally.
-_orig_savefig = plt.savefig
-def _savefig_svg(fname, *args, **kwargs):
-    # Always save as SVG at a stable path for the artifact system
-    out = Path("latency.svg")
-    kwargs["format"] = "svg"
-    # Ensure everything we care about has ids before export
-    _tag_current_figure()
-    res = _orig_savefig(out, *args, **kwargs)
-    # Add helpful CSS classes on top of ids
-    _postprocess_svg_add_classes(out)
-    print(f"✓ Combined visualization saved as {out}")
-    return res
-
-plt.savefig = _savefig_svg  # apply patch
-
-# Capture close calls in case kbt.viz() closes figures before we re-save
-_orig_close = plt.close
-_last_closed = {"fig": None}
-def _capture_close(arg=None):
-    try:
-        if hasattr(arg, "savefig"):  # looks like a Figure
-            _last_closed["fig"] = arg
-        else:
-            _last_closed["fig"] = plt.gcf()
-    finally:
-        return _orig_close(arg)
-plt.close = _capture_close
-
-# --- Locate benchmark artifacts --------------------------------------------------
-cache_dirs = {
-    "Flash (PyTorch SDPA)": os.environ.get('UVNOTE_FILE_FLASH_ATTENTION_BENCHMARK'),
-    "MemEff (PyTorch SDPA)": os.environ.get('UVNOTE_FILE_MEM_EFFICIENT_ATTENTION_BENCHMARK'),
-    "Flash Attn 2": os.environ.get('UVNOTE_FILE_FLASH_ATTN2_BENCHMARK'),
-    "xFormers": os.environ.get('UVNOTE_FILE_XFORMERS_BENCHMARK'),
-    "SageAttention": os.environ.get('UVNOTE_FILE_SAGE_ATTENTION_BENCHMARK'),
-    "Compiled (default)": os.environ.get('UVNOTE_FILE_COMPILED_VARIANTS_BENCHMARK_DEFAULT'),
-    "Compiled (max-autotune)": os.environ.get('UVNOTE_FILE_COMPILED_VARIANTS_BENCHMARK_MAX_AUTOTUNE'),
-    "HF Kernels Flash Attn": os.environ.get('UVNOTE_FILE_HF_KERNELS_FLASH_ATTN_BENCHMARK'),
-    "HF Kernels Flash Attn3": os.environ.get('UVNOTE_FILE_HF_KERNELS_FLASH_ATTN3_BENCHMARK'),
 }

-print()

 file_mapping = {
-    "Flash (PyTorch SDPA)": "attn.jsonl",
-    "MemEff (PyTorch SDPA)": "attn.jsonl",
-    "Flash Attn 2": "attn.jsonl",
-    "xFormers": "attn.jsonl",
-    "SageAttention": "attn.jsonl",
     "Compiled (default)": "attn_default.jsonl",
     "Compiled (max-autotune)": "attn_max_autotune.jsonl",
-    "HF Kernels Flash Attn": "attn.jsonl",
-    "HF Kernels Flash Attn3": "attn.jsonl",
 }

 all_paths = []
-for name, cache_dir in cache_dirs.items():
     if cache_dir:
-        path = Path(cache_dir) / file_mapping[name]
         if path.exists() and path.stat().st_size > 0:
             all_paths.append(str(path))
             print(f"✓ Found {name}: {path}")
         else:
-            print(f"⊘
     else:
-        print(f"✗
-print()

 if not all_paths:
     print("ERROR: No benchmark data files found!")
-    plt.savefig = _orig_savefig
-    plt.close = _orig_close
     sys.exit(1)

-#

 try:
-    kbt.viz(all_paths)
-    # Safety net: if kbt.viz didn't save, save now.
-    # if not Path("latency.svg").exists():
-    #     _tag_current_figure()
-    #     plt.savefig("latency.svg")

-    print(
-except Exception as e:
-    print(f"✗ Visualization failed: {e}")
 finally:
-    # Clean up patches to avoid side effects in later cells
     plt.savefig = _orig_savefig
-    plt.close = _orig_close
-
-print()
-print("ANALYSIS COMPLETE")
-print(f"Total implementations analyzed: {len(all_paths)}")
-print(f"\nImplementations included:")
-for name, cache_dir in cache_dirs.items():
-    if cache_dir:
-        path = Path(cache_dir) / file_mapping[name]
-        if path.exists() and path.stat().st_size > 0:
-            print(f"  ✓ {name}")
-
-# Collect all benchmark data and export to CSV
-all_data = {}
-for name, cache_dir in cache_dirs.items():
-    if cache_dir:
-        path = Path(cache_dir) / file_mapping[name]
-        if path.exists() and path.stat().st_size > 0:
-            with open(path, 'r') as f:
-                records = [json.loads(line) for line in f]
-            all_data[name] = records
-
-# Export to CSV
-csv_path = Path("latency.csv")
-with open(csv_path, 'w', newline='') as csvfile:
-    writer = csv.writer(csvfile)
-
-    # Write header
-    header = ["Implementation", "Impl ID", "Workload", "Batch", "Seq Length", "Heads", "Head Dim", "Dtype",
-              "Mean (ms)", "P10 (ms)", "P50 (ms)", "P90 (ms)", "Reps",
-              # "Compile (ms)",
-              "Peak Mem (MB)", "Backend", "Family"]
-    writer.writerow(header)
-
-    # Write data rows
-    for impl_name, records in all_data.items():
-        for record in records:
-            wl = record.get('wl', {})
-            lat = record.get('lat_ms', {})
-            tags = record.get('tags', {})
-
-            row = [
-                impl_name,
-                record.get('impl', ''),
-                wl.get('name', ''),
-                wl.get('batch', ''),
-                wl.get('seq_len', ''),
-                wl.get('heads', ''),
-                wl.get('head_dim', ''),
-                wl.get('dtype', ''),
-                lat.get('mean', ''),
-                lat.get('p10', ''),
-                lat.get('p50', ''),
-                lat.get('p90', ''),
-                lat.get('reps', ''),
-                # record.get('compile_ms', ''),
-                round(record.get('peak_bytes', 0) / 1024 / 1024, 2) if record.get('peak_bytes') else '',
-                tags.get('backend', ''),
-                tags.get('family', ''),
-            ]
-            writer.writerow(row)
-
-print(f"✓ CSV export complete: {csv_path}")
-print(f"Total implementations: {len(all_data)}")
-print(f"Total records: {sum(len(records) for records in all_data.values())}")
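For reference, the deleted _slugify helper maps arbitrary series labels to CSS-safe SVG ids. A condensed restatement of the same logic, with example outputs (this is an annotation, not part of the commit):

# Condensed re-statement of the removed _slugify, behavior unchanged.
def _slugify(s: str) -> str:
    s = (s or "").strip().lower()
    out = "".join(ch if ch.isalnum() else "-" if ch in " -_/.:" else "" for ch in s)
    while "--" in out:
        out = out.replace("--", "-")
    return out.strip("-") or "unnamed"

assert _slugify("Flash (PyTorch SDPA)") == "flash-pytorch-sdpa"  # -> gid "series--flash-pytorch-sdpa"
assert _slugify("grid/x:1") == "grid-x-1"
assert _slugify("") == "unnamed"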
 # /// script
 # requires-python = ">=3.10"
+# dependencies = ["torch", "kernels-benchmark-tools", "matplotlib"]
 # [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
 # ///
+from kernels_benchmark_tools.core.visuals import generate_combined_results
+
+# Note: Flash attention has multiple implementations with different output files
+# Some use attn.jsonl, compiled variants use attn_default.jsonl and attn_max_autotune.jsonl
+cache_env_map = {
+    "Flash (PyTorch SDPA)": "UVNOTE_FILE_FLASH_ATTENTION_BENCHMARK",
+    "MemEff (PyTorch SDPA)": "UVNOTE_FILE_MEM_EFFICIENT_ATTENTION_BENCHMARK",
+    "xFormers": "UVNOTE_FILE_XFORMERS_BENCHMARK",
+    "Compiled (default)": "UVNOTE_FILE_COMPILED_VARIANTS_BENCHMARK_DEFAULT",
+    "Compiled (max-autotune)": "UVNOTE_FILE_COMPILED_VARIANTS_BENCHMARK_MAX_AUTOTUNE",
+    "HF Kernels Flash Attn": "UVNOTE_FILE_HF_KERNELS_FLASH_ATTN_BENCHMARK",
+    "HF Kernels Flash Attn3": "UVNOTE_FILE_HF_KERNELS_FLASH_ATTN3_BENCHMARK",
 }

+# For flash attention, we need custom file mapping
+import os
+from pathlib import Path

 file_mapping = {
     "Compiled (default)": "attn_default.jsonl",
     "Compiled (max-autotune)": "attn_max_autotune.jsonl",
 }

+# Collect paths with custom file names for compiled variants
 all_paths = []
+for name, env_var in cache_env_map.items():
+    cache_dir = os.environ.get(env_var)
     if cache_dir:
+        filename = file_mapping.get(name, "attn.jsonl")
+        path = Path(cache_dir) / filename
         if path.exists() and path.stat().st_size > 0:
             all_paths.append(str(path))
             print(f"✓ Found {name}: {path}")
         else:
+            print(f"⊘ Skipped {name}: {path}")
     else:
+        print(f"✗ Missing {name}")

 if not all_paths:
     print("ERROR: No benchmark data files found!")
+    import sys
     sys.exit(1)

+# Use the simplified visualization
+from kernels_benchmark_tools.core import tools
+from kernels_benchmark_tools.core.visuals import setup_svg_matplotlib, create_svg_with_tagging
+
+setup_svg_matplotlib()
+_orig_savefig, _orig_close = create_svg_with_tagging("latency.svg", "flash-attention")

 try:
+    print("\nCOMBINED BENCHMARK SUMMARY\n")
+    tools.summarize(all_paths)

+    print("\nGENERATING COMBINED VISUALIZATION\n")
+    tools.viz(all_paths)

+    import matplotlib.pyplot as plt
+    plt.savefig("latency.svg")
+    print("✓ SVG visualization ready!")
 finally:
     plt.savefig = _orig_savefig
+    plt.close = _orig_close
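The new script keeps the same patch-and-restore idiom the old one implemented by hand: create_svg_with_tagging hands back the original plt.savefig/plt.close so the finally block can undo the patch. A minimal sketch of that idiom (illustrative only, not the kernels-benchmark-tools source):

from pathlib import Path
import matplotlib.pyplot as plt

def patch_savefig(out_path: str):
    # Remember the originals so the caller can restore them in a finally block.
    orig_savefig, orig_close = plt.savefig, plt.close

    def savefig_svg(fname, *args, **kwargs):
        kwargs["format"] = "svg"  # force SVG regardless of the requested name
        return orig_savefig(Path(out_path), *args, **kwargs)

    plt.savefig = savefig_svg
    return orig_savefig, orig_close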
flash_attn/results/combined_results.html
CHANGED

The diff for this file is too large to render. See raw diff.

layer_norm/impls/artifacts/benchmark/ln.jsonl
ADDED

@@ -0,0 +1,8 @@
+{"ts": "2025-10-23T17:21:54Z", "run": "f94366cee8c44bb99e36bc03ca51ad49", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "llama_S512_D4096", "batch": 1, "seq_len": 512, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.039122000089264475, "p50": 0.04020200003651553, "p90": 0.04062199991494708, "mean": 0.040302199977304554, "iqr": 0.00047999992602854036, "raw_times": [0.04020200003651553, 0.04142299985687714, 0.04062199991494708, 0.04014199998891854, 0.039122000089264475], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.049882999974215636, "peak_bytes": 37765120, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015716552734375, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null}
+{"ts": "2025-10-23T17:21:54Z", "run": "f94366cee8c44bb99e36bc03ca51ad49", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "llama_S512_D8192", "batch": 1, "seq_len": 512, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.03869200008921325, "p50": 0.039361000062854146, "p90": 0.03952199995183037, "mean": 0.039353600004687905, "iqr": 0.0002899998889915878, "raw_times": [0.03923200006283878, 0.03996099985670298, 0.03952199995183037, 0.039361000062854146, 0.03869200008921325], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04323200005273975, "peak_bytes": 75530240, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00157928466796875, "mse": 1.150369644165039e-05, "ref": "layer_norm_fp32"}, "err": null}
+{"ts": "2025-10-23T17:21:54Z", "run": "f94366cee8c44bb99e36bc03ca51ad49", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "llama_S1024_D4096", "batch": 1, "seq_len": 1024, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.038322000136759016, "p50": 0.039080999840734876, "p90": 0.03983200008406129, "mean": 0.03918759998668975, "iqr": 0.0012000000424450263, "raw_times": [0.038322000136759016, 0.038632000041616266, 0.03983200008406129, 0.039080999840734876, 0.04007099983027729], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04126199996790092, "peak_bytes": 75513856, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0016021728515625, "mse": 1.1682510375976562e-05, "ref": "layer_norm_fp32"}, "err": null}
+{"ts": "2025-10-23T17:21:54Z", "run": "f94366cee8c44bb99e36bc03ca51ad49", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "llama_S1024_D8192", "batch": 1, "seq_len": 1024, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.038531999962287955, "p50": 0.03957200010518136, "p90": 0.040011999999478576, "mean": 0.040755799955149996, "iqr": 0.0013210001270635985, "raw_times": [0.04697199983638711, 0.03957200010518136, 0.040011999999478576, 0.03869099987241498, 0.038531999962287955], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04860299986830796, "peak_bytes": 151027712, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00156402587890625, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null}
+{"ts": "2025-10-23T17:21:54Z", "run": "f94366cee8c44bb99e36bc03ca51ad49", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "llama_S2048_D4096", "batch": 1, "seq_len": 2048, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.03818100003627478, "p50": 0.039942000057635596, "p90": 0.04086199987796135, "mean": 0.044605999983104994, "iqr": 0.0025399997412023367, "raw_times": [0.06572299980689422, 0.04086199987796135, 0.03818100003627478, 0.039942000057635596, 0.038322000136759016], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.046752999878663104, "peak_bytes": 151011328, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00156402587890625, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null}
+{"ts": "2025-10-23T17:21:54Z", "run": "f94366cee8c44bb99e36bc03ca51ad49", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "llama_S2048_D8192", "batch": 1, "seq_len": 2048, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.046271999963209964, "p50": 0.046712000084880856, "p90": 0.0469120000161638, "mean": 0.04688640001404565, "iqr": 0.00020900006347801536, "raw_times": [0.046712000084880856, 0.04783300005328783, 0.046271999963209964, 0.046702999952685786, 0.0469120000161638], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.049573000069358386, "peak_bytes": 302022656, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00152587890625, "mse": 1.1146068572998047e-05, "ref": "layer_norm_fp32"}, "err": null}
+{"ts": "2025-10-23T17:21:54Z", "run": "f94366cee8c44bb99e36bc03ca51ad49", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "llama_S4096_D4096", "batch": 1, "seq_len": 4096, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.044022000110999215, "p50": 0.04556199996841315, "p90": 0.045742000111204106, "mean": 0.04578840002977813, "iqr": 0.000619000047663576, "raw_times": [0.04512300006354053, 0.04556199996841315, 0.045742000111204106, 0.044022000110999215, 0.04849299989473366], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04771299995809386, "peak_bytes": 302006272, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null}
+{"ts": "2025-10-23T17:21:54Z", "run": "f94366cee8c44bb99e36bc03ca51ad49", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "llama_S4096_D8192", "batch": 1, "seq_len": 4096, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.20399000004545087, "p50": 0.20584999992934172, "p90": 0.20648999998229556, "mean": 0.20627999997486768, "iqr": 0.0007099999947968172, "raw_times": [0.20399000004545087, 0.2092899999297515, 0.20577999998749874, 0.20648999998229556, 0.20584999992934172], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.20653999990827288, "peak_bytes": 604012544, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00152587890625, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null}
layer_norm/impls/cells/benchmark.py
CHANGED

@@ -2,13 +2,13 @@
 # requires-python = ">=3.10"
 # dependencies = [
 # "numpy",
-# "torch",
+# "torch==2.8.0",
 # "kernels",
 # "kernels-benchmark-tools",
 # ]
 #
 # [tool.uv.sources]
-# kernels-benchmark-tools = { path = "
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
 # ///
 import torch
 from kernels import get_kernel
layer_norm/impls/hf_kernels_layer_norm.html
CHANGED

@@ -706,6 +706,29 @@
 white-space: pre-wrap;
 color: var(--text-primary);
 }
+
+.cell-stdout {
+  background: var(--bg-tertiary);
+  padding: 0.75rem;
+  border-radius: 1px;
+  font-family: inherit;
+  font-size: 0.9rem;
+  color: var(--text-primary);
+
+  /* key bits */
+  overflow: auto;       /* show scrollbars when needed */
+  max-width: 100%;      /* respects whatever layout width you give it */
+}
+
+.cell-stdout .stdout-text {
+  margin: 0;              /* reset pre default margin */
+  white-space: pre;       /* keep line breaks, NO wrapping */
+  display: inline-block;  /* shrink-to-content */
+  min-width: max-content; /* allow very long lines to define intrinsic width */
+  font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace;
+  tab-size: 2;
+}
+
 .cell-stderr {
 background: var(--bg-error);
 border-left: 2px solid var(--border-error);

@@ -3556,7 +3579,12 @@
 if(output){
 output.classList.remove('output-stale');
 let html='';
-if(data.stdout)
+if (data.stdout) {
+  html += '<div class="cell-stdout"><pre class="stdout-text">'
+       + escapeHtml(data.stdout)
+       + '</pre></div>';
+}
+
 console.log('UV Logs:', data);
 if(data.stderr) {
 // Split UV logs from regular stderr

@@ -3678,27 +3706,27 @@
 }
 }

-// Live reload functionality (robust SSE handling)
-(function(){
-  if (!('EventSource' in window)) {
-    console.warn('SSE not supported in this browser');
-    return;
-  }
-  let source = new EventSource('/events');
-  let isOpen = false;
-  source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
-  source.onmessage = function(e){
-    const msg=(e.data||'').trim(); if(!msg) return;
-    console.log('SSE message:', msg);
-    if (msg==='reload' || msg==='incremental') { location.reload(); }
-    // Ignore 'loading' to avoid premature reload loops
-  };
-  source.onerror = function(e){
-    // Let EventSource auto-reconnect instead of forcing a reload
-    if (isOpen) console.warn('SSE error after open, retrying...', e);
-  };
-  window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
-})();
+// // Live reload functionality (robust SSE handling)
+// (function(){
+//   if (!('EventSource' in window)) {
+//     console.warn('SSE not supported in this browser');
+//     return;
+//   }
+//   let source = new EventSource('/events');
+//   let isOpen = false;
+//   source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
+//   source.onmessage = function(e){
+//     const msg=(e.data||'').trim(); if(!msg) return;
+//     console.log('SSE message:', msg);
+//     if (msg==='reload' || msg==='incremental') { location.reload(); }
+//     // Ignore 'loading' to avoid premature reload loops
+//   };
+//   source.onerror = function(e){
+//     // Let EventSource auto-reconnect instead of forcing a reload
+//     if (isOpen) console.warn('SSE error after open, retrying...', e);
+//   };
+//   window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
+// })();

 document.addEventListener('DOMContentLoaded', function() {

@@ -3829,7 +3857,7 @@
 <div class="system-info">
 <div class="system-info-header">Generated on:</div>
 <div class="system-info-content">
-Linux x86_64 | Linux-5.10.244-240.
+Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
 </div>
 </div>

@@ -3838,14 +3866,14 @@
 <h1>HF Kernels LayerNorm Implementation</h1>
 <p>Based on kernels-community <code>layer-norm</code> kernel.</p>
 <h2>LayerNorm Benchmark (HF Kernels)</h2>
-<div class="cell
+<div class="cell" id="cell-benchmark">
 <div class="cell-header">
 <span class="collapse-indicators">
 <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
-<span id="uv-indicator-benchmark" style="cursor:
+<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
 </span> |
-Cell: benchmark |
+Cell: benchmark | 5.52s
 | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>

@@ -3856,13 +3884,13 @@ Cell: benchmark | 0.05s | FAILED
 <span class="c1"># requires-python = ">=3.10"</span>
 <span class="c1"># dependencies = [</span>
 <span class="c1"># "numpy",</span>
-<span class="c1"># "torch",</span>
+<span class="c1"># "torch==2.8.0",</span>
 <span class="c1"># "kernels",</span>
 <span class="c1"># "kernels-benchmark-tools",</span>
 <span class="c1"># ]</span>
 <span class="c1">#</span>
 <span class="c1"># [tool.uv.sources]</span>
-<span class="c1"># kernels-benchmark-tools = { path = "
+<span class="c1"># kernels-benchmark-tools = { path = "../../../../../tools", editable = true }</span>
 <span class="c1"># ///</span>
 <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
 <span class="kn">from</span><span class="w"> </span><span class="nn">kernels</span><span class="w"> </span><span class="kn">import</span> <span class="n">get_kernel</span>

@@ -3920,9 +3948,28 @@ Cell: benchmark | 0.05s | FAILED
 </div>
 </div>
 <div id="output-benchmark" class="cell-output">
-<div class="cell-
-
-
+<div class="cell-stdout"><pre class="stdout-text">impl                   wl                 p50(ms)  ok
+hf_kernels_layer_norm  llama_S1024_D4096  0.04     False
+hf_kernels_layer_norm  llama_S1024_D8192  0.04     False
+hf_kernels_layer_norm  llama_S2048_D4096  0.04     False
+hf_kernels_layer_norm  llama_S2048_D8192  0.05     False
+hf_kernels_layer_norm  llama_S4096_D4096  0.05     False
+hf_kernels_layer_norm  llama_S4096_D8192  0.21     False
+hf_kernels_layer_norm  llama_S512_D4096   0.04     False
+hf_kernels_layer_norm  llama_S512_D8192   0.04     False
+</pre></div>
+<div class="uv-install-logs" id="uv-logs-benchmark">
+<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
+<div class="uv-logs-content" style="display: none;">
+Installed 10 packages in 16ms
+</div>
+</div>
+<div class="cell-stderr">Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]
+Fetching 4 files:  50%|█████     | 2/4 [00:01<00:01,  1.02it/s]
+Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00,  2.04it/s]</div>
+<div class="cell-artifacts">
+<h4>Artifacts:</h4>
+<a href="artifacts/benchmark/ln.jsonl" class="artifact" target="_blank">ln.jsonl</a>
 </div>
 </div>
 </div>
layer_norm/impls/torch_layer_norm.html
CHANGED

@@ -706,6 +706,29 @@
 white-space: pre-wrap;
 color: var(--text-primary);
 }
+
+.cell-stdout {
+  background: var(--bg-tertiary);
+  padding: 0.75rem;
+  border-radius: 1px;
+  font-family: inherit;
+  font-size: 0.9rem;
+  color: var(--text-primary);
+
+  /* key bits */
+  overflow: auto;       /* show scrollbars when needed */
+  max-width: 100%;      /* respects whatever layout width you give it */
+}
+
+.cell-stdout .stdout-text {
+  margin: 0;              /* reset pre default margin */
+  white-space: pre;       /* keep line breaks, NO wrapping */
+  display: inline-block;  /* shrink-to-content */
+  min-width: max-content; /* allow very long lines to define intrinsic width */
+  font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace;
+  tab-size: 2;
+}
+
 .cell-stderr {
 background: var(--bg-error);
 border-left: 2px solid var(--border-error);

@@ -3556,7 +3579,12 @@
 if(output){
 output.classList.remove('output-stale');
 let html='';
-if(data.stdout)
+if (data.stdout) {
+  html += '<div class="cell-stdout"><pre class="stdout-text">'
+       + escapeHtml(data.stdout)
+       + '</pre></div>';
+}
+
 console.log('UV Logs:', data);
 if(data.stderr) {
 // Split UV logs from regular stderr

@@ -3678,27 +3706,27 @@
 }
 }

-// Live reload functionality (robust SSE handling)
-(function(){
-  if (!('EventSource' in window)) {
-    console.warn('SSE not supported in this browser');
-    return;
-  }
-  let source = new EventSource('/events');
-  let isOpen = false;
-  source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
-  source.onmessage = function(e){
-    const msg=(e.data||'').trim(); if(!msg) return;
-    console.log('SSE message:', msg);
-    if (msg==='reload' || msg==='incremental') { location.reload(); }
-    // Ignore 'loading' to avoid premature reload loops
-  };
-  source.onerror = function(e){
-    // Let EventSource auto-reconnect instead of forcing a reload
-    if (isOpen) console.warn('SSE error after open, retrying...', e);
-  };
-  window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
-})();
+// // Live reload functionality (robust SSE handling)
+// (function(){
+//   if (!('EventSource' in window)) {
+//     console.warn('SSE not supported in this browser');
+//     return;
+//   }
+//   let source = new EventSource('/events');
+//   let isOpen = false;
+//   source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
+//   source.onmessage = function(e){
+//     const msg=(e.data||'').trim(); if(!msg) return;
+//     console.log('SSE message:', msg);
+//     if (msg==='reload' || msg==='incremental') { location.reload(); }
+//     // Ignore 'loading' to avoid premature reload loops
+//   };
+//   source.onerror = function(e){
+//     // Let EventSource auto-reconnect instead of forcing a reload
+//     if (isOpen) console.warn('SSE error after open, retrying...', e);
+//   };
+//   window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
+// })();

 document.addEventListener('DOMContentLoaded', function() {

@@ -3829,7 +3857,7 @@
 <div class="system-info">
 <div class="system-info-header">Generated on:</div>
 <div class="system-info-content">
-Linux x86_64 | Linux-5.10.244-240.
+Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
 </div>
 </div>

@@ -3844,7 +3872,7 @@
 <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
 <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> |
-Cell: nv | 0.
+Cell: nv | 0.21s
 | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
 <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>

@@ -3859,7 +3887,7 @@ Cell: nv | 0.22s
 </div>
 </div>
 <div id="output-nv" class="cell-output">
-<div class="cell-stdout">
+<div class="cell-stdout"><pre class="stdout-text">Thu Oct 23 17:20:58 2025
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+

@@ -3868,7 +3896,7 @@ Cell: nv | 0.22s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A
+| N/A   35C    P0             70W /  350W |       0MiB /  46068MiB |     26%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+

@@ -3880,19 +3908,19 @@
 |  No running processes found                                                             |
 +-----------------------------------------------------------------------------------------+

-</div>
+</pre></div>
 </div>
 </div>

 <h2>LayerNorm Benchmark (PyTorch)</h2>
-<div class="cell
+<div class="cell" id="cell-benchmark">
 <div class="cell-header">
 <span class="collapse-indicators">
 <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
-<span id="uv-indicator-benchmark" style="cursor:
+<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
 </span> |
-Cell: benchmark |
+Cell: benchmark | 4.50s
 | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>

@@ -3903,12 +3931,12 @@ Cell: benchmark | 0.01s | FAILED
 <span class="c1"># requires-python = ">=3.10"</span>
 <span class="c1"># dependencies = [</span>
 <span class="c1"># "numpy",</span>
-<span class="c1"># "torch",</span>
+<span class="c1"># "torch==2.8.0",</span>
 <span class="c1"># "kernels-benchmark-tools",</span>
 <span class="c1"># ]</span>
 <span class="c1">#</span>
 <span class="c1"># [tool.uv.sources]</span>
-<span class="c1"># kernels-benchmark-tools = { path = "
+<span class="c1"># kernels-benchmark-tools = { path = "../../../../../tools", editable = true }</span>
 <span class="c1"># ///</span>
 <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
 <span class="kn">import</span><span class="w"> </span><span class="nn">kernels_benchmark_tools</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="nn">kbt</span>

@@ -3946,9 +3974,25 @@ Cell: benchmark | 0.01s | FAILED
 </div>
 </div>
 <div id="output-benchmark" class="cell-output">
-<div class="cell-
-
-
+<div class="cell-stdout"><pre class="stdout-text">impl              wl                 p50(ms)  ok
+torch_layer_norm  llama_S1024_D4096  0.03     False
+torch_layer_norm  llama_S1024_D8192  0.03     False
+torch_layer_norm  llama_S2048_D4096  0.03     False
+torch_layer_norm  llama_S2048_D8192  0.05     False
+torch_layer_norm  llama_S4096_D4096  0.04     False
+torch_layer_norm  llama_S4096_D8192  0.20     False
+torch_layer_norm  llama_S512_D4096   0.03     False
+torch_layer_norm  llama_S512_D8192   0.03     False
+</pre></div>
+<div class="uv-install-logs" id="uv-logs-benchmark">
+<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
+<div class="uv-logs-content" style="display: none;">
+Installed 37 packages in 245ms
+</div>
+</div>
+<div class="cell-artifacts">
+<h4>Artifacts:</h4>
+<a href="artifacts/benchmark/ln.jsonl" class="artifact" target="_blank">ln.jsonl</a>
 </div>
 </div>
 </div>
layer_norm/results/artifacts/combine/latency.svg
ADDED

Git LFS Details

layer_norm/results/cells/combine.py
ADDED

@@ -0,0 +1,19 @@
+# /// script
+# requires-python = ">=3.10"
+# dependencies = ["torch", "kernels-benchmark-tools", "matplotlib"]
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+from kernels_benchmark_tools.core.visuals import generate_combined_results
+
+cache_env_map = {
+    "Torch LayerNorm": "UVNOTE_FILE_TORCH_LAYER_NORM_BENCHMARK",
+    "HF Kernels LayerNorm": "UVNOTE_FILE_HF_KERNELS_LAYER_NORM_BENCHMARK",
+}
+
+generate_combined_results(
+    cache_env_map=cache_env_map,
+    output_filename="ln.jsonl",
+    svg_filename="latency.svg",
+    figure_id="layernorm"
+)
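generate_combined_results replaces the boilerplate the flash_attn combine script still spells out by hand. Presumably it wraps the same collect-summarize-viz sequence shown there; a rough functional equivalent under that assumption (an illustration, not the library source):

import os
from pathlib import Path
import matplotlib.pyplot as plt
from kernels_benchmark_tools.core import tools
from kernels_benchmark_tools.core.visuals import setup_svg_matplotlib, create_svg_with_tagging

def combine(cache_env_map, output_filename, svg_filename, figure_id):
    # Resolve artifact paths from the UVNOTE_FILE_* environment variables.
    paths = [
        str(Path(d) / output_filename)
        for d in (os.environ.get(v) for v in cache_env_map.values())
        if d and (Path(d) / output_filename).exists()
    ]
    setup_svg_matplotlib()
    orig_savefig, orig_close = create_svg_with_tagging(svg_filename, figure_id)
    try:
        tools.summarize(paths)   # p50 table on stdout
        tools.viz(paths)         # combined latency figure
        plt.savefig(svg_filename)
    finally:
        plt.savefig, plt.close = orig_savefig, orig_close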
layer_norm/results/combined_results.html
ADDED

The diff for this file is too large to render. See raw diff.