Spaces:

kernels-community
/

kernels-benchmarks

Running

App Files Files Community

drbh HF Staff committed on Oct 23

Commit

81fff32

verified ·

1 Parent(s): aa3ac98

Upload folder using huggingface_hub

Browse files

Files changed (31) hide show

activation/impls/artifacts/benchmark/activation.jsonl +3 -0
activation/impls/cells/benchmark.py +2 -2
activation/impls/compiled_swiglu.html +172 -35
activation/impls/hf_kernels_swiglu.html +139 -35
activation/impls/torch_swiglu.html +138 -34
activation/results/artifacts/combine/latency.svg +3 -0
activation/results/cells/combine.py +27 -0
activation/results/combined_results.html +0 -0
flash_attn/impls/artifacts/benchmark/attn.jsonl +6 -6
flash_attn/impls/artifacts/benchmark_default/attn_default.jsonl +6 -6
flash_attn/impls/artifacts/benchmark_max_autotune/attn_max_autotune.jsonl +6 -6
flash_attn/impls/cells/benchmark.py +11 -13
flash_attn/impls/cells/benchmark_default.py +2 -2
flash_attn/impls/cells/benchmark_max_autotune.py +2 -2
flash_attn/impls/compiled_variants.html +336 -31
flash_attn/impls/flash_attention.html +262 -34
flash_attn/impls/hf_kernels_flash_attn.html +214 -29
flash_attn/impls/hf_kernels_flash_attn3.html +203 -30
flash_attn/impls/mem_efficient_attention.html +252 -30
flash_attn/impls/sage_attention.html +130 -31
flash_attn/impls/xformers.html +219 -31
flash_attn/results/artifacts/combine/latency.svg +2 -2
flash_attn/results/cells/combine.py +39 -289
flash_attn/results/combined_results.html +0 -0
layer_norm/impls/artifacts/benchmark/ln.jsonl +8 -0
layer_norm/impls/cells/benchmark.py +2 -2
layer_norm/impls/hf_kernels_layer_norm.html +78 -31
layer_norm/impls/torch_layer_norm.html +79 -35
layer_norm/results/artifacts/combine/latency.svg +3 -0
layer_norm/results/cells/combine.py +19 -0
layer_norm/results/combined_results.html +0 -0

activation/impls/artifacts/benchmark/activation.jsonl ADDED Viewed

	@@ -0,0 +1,3 @@

+{"ts": "2025-10-23T17:22:01Z", "run": "2868ab5dc1ce4d49ac015295dd5ab8d5", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "triton", "compile": "none"}, "wl": {"name": "llama_T512_D4096", "num_tokens": 512, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.023811000119167147, "p50": 0.024261000135084032, "p90": 0.024421000034635654, "mean": 0.024255200014522416, "iqr": 0.00023000006876827683, "raw_times": [0.024261000135084032, 0.023811000119167147, 0.024190999965867377, 0.024591999817857868, 0.024421000034635654], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03041099989786744, "peak_bytes": 46139392, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.01, "atol": 0.1, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "relmax": 0.0, "ref": "swiglu_bfloat16"}, "err": null}
+{"ts": "2025-10-23T17:22:01Z", "run": "2868ab5dc1ce4d49ac015295dd5ab8d5", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "triton", "compile": "none"}, "wl": {"name": "llama_T512_D8192", "num_tokens": 512, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.030561000130546745, "p50": 0.031221000199366244, "p90": 0.031622000051356736, "mean": 0.031125600116865826, "iqr": 0.001030000021273736, "raw_times": [0.030561000130546745, 0.031221000199366244, 0.030592000030083, 0.031622000051356736, 0.031632000172976404], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03244200001972786, "peak_bytes": 92276736, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.01, "atol": 0.1, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "relmax": 0.0, "ref": "swiglu_bfloat16"}, "err": null}
+{"ts": "2025-10-23T17:22:01Z", "run": "2868ab5dc1ce4d49ac015295dd5ab8d5", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "triton", "compile": "none"}, "wl": {"name": "llama_T512_D11008", "num_tokens": 512, "hidden_dim": 11008, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0339219998295448, "p50": 0.03464199994596129, "p90": 0.0347420000252896, "mean": 0.03469179991952842, "iqr": 0.00024100017981254496, "raw_times": [0.0339219998295448, 0.0347420000252896, 0.03464199994596129, 0.03565199995136936, 0.034500999845477054], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03648099982456188, "peak_bytes": 124520448, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.01, "atol": 0.1, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "relmax": 0.0, "ref": "swiglu_bfloat16"}, "err": null}

activation/impls/cells/benchmark.py CHANGED Viewed

@@ -2,13 +2,13 @@
 # requires-python = ">=3.10"
 # dependencies = [
 #     "numpy",
-#     "torch",
 #     "kernels-benchmark-tools",
 #     "kernels",
 # ]
 #
 # [tool.uv.sources]
-# kernels-benchmark-tools = { path = "/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools", editable = true }
 # ///
 import torch
 import sys

 # requires-python = ">=3.10"
 # dependencies = [
 #     "numpy",
+#     "torch==2.8.0",
 #     "kernels-benchmark-tools",
 #     "kernels",
 # ]
 #
 # [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
 # ///
 import torch
 import sys

activation/impls/compiled_swiglu.html CHANGED Viewed

@@ -706,6 +706,29 @@
             white-space: pre-wrap;
             color: var(--text-primary);
         }
         .cell-stderr {
             background: var(--bg-error);
             border-left: 2px solid var(--border-error);
@@ -3556,7 +3579,12 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
                 if(output){
                     output.classList.remove('output-stale');
                     let html='';
-                    if(data.stdout) html+='<div class="cell-stdout">'+escapeHtml(data.stdout)+'</div>';
                     console.log('UV Logs:', data);
                     if(data.stderr) {
                         // Split UV logs from regular stderr
@@ -3678,27 +3706,27 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
             }
         }
-        // Live reload functionality (robust SSE handling)
-        (function(){
-            if (!('EventSource' in window)) {
-                console.warn('SSE not supported in this browser');
-                return;
-            }
-            let source = new EventSource('/events');
-            let isOpen = false;
-            source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
-            source.onmessage = function(e){
-                const msg=(e.data||'').trim(); if(!msg) return;
-                console.log('SSE message:', msg);
-                if (msg==='reload' || msg==='incremental') { location.reload(); }
-                // Ignore 'loading' to avoid premature reload loops
-            };
-            source.onerror = function(e){
-                // Let EventSource auto-reconnect instead of forcing a reload
-                if (isOpen) console.warn('SSE error after open, retrying...', e);
-            };
-            window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
-        })();
         document.addEventListener('DOMContentLoaded', function() {
@@ -3829,7 +3857,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
     <div class="system-info">
         <div class="system-info-header">Generated on:</div>
         <div class="system-info-content">
-            Linux x86_64 | Linux-5.10.244-240.965.amzn2.x86_64-x86_64-with-glibc2.35
         </div>
     </div>
@@ -3843,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> |
-Cell: nv | 0.25s
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
 <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3859,7 +3887,7 @@ Cell: nv | 0.25s
 </div>
 </div>
 <div id="output-nv" class="cell-output">
-<div class="cell-stdout">Wed Oct 22 08:58:23 2025
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+
@@ -3868,7 +3896,7 @@ Cell: nv | 0.25s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   26C    P8             22W /  350W |       0MiB /  46068MiB |      0%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
@@ -3880,19 +3908,19 @@ Cell: nv | 0.25s
 |  No running processes found                                                             |
 +-----------------------------------------------------------------------------------------+
-</div>
 </div>
 </div>
 <h2>SwiGLU Benchmark (torch.compile)</h2>
-<div class="cell cell-failed" id="cell-benchmark">
 <div class="cell-header">
 <span class="collapse-indicators">
 <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
- <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> |
-Cell: benchmark | 0.05s | FAILED
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3904,12 +3932,12 @@ Cell: benchmark | 0.05s | FAILED
 <span class="c1"># requires-python = &quot;&gt;=3.10&quot;</span>
 <span class="c1"># dependencies = [</span>
 <span class="c1">#     &quot;numpy&quot;,</span>
-<span class="c1">#     &quot;torch&quot;,</span>
 <span class="c1">#     &quot;kernels-benchmark-tools&quot;,</span>
 <span class="c1"># ]</span>
 <span class="c1">#</span>
 <span class="c1"># [tool.uv.sources]</span>
-<span class="c1"># kernels-benchmark-tools = { path = &quot;/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools&quot;, editable = true }</span>
 <span class="c1"># ///</span>
 <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
 <span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
@@ -3967,9 +3995,118 @@ Cell: benchmark | 0.05s | FAILED
 </div>
 </div>
 <div id="output-benchmark" class="cell-output">
-<div class="cell-stderr">  × Failed to resolve script requirement
-  ╰─▶ Distribution not found at:
-      file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
 </div>
 </div>
 </div>

             white-space: pre-wrap;
             color: var(--text-primary);
         }
+        .cell-stdout {
+            background: var(--bg-tertiary);
+            padding: 0.75rem;
+            border-radius: 1px;
+            font-family: inherit;
+            font-size: 0.9rem;
+            color: var(--text-primary);
+            /* key bits */
+            overflow: auto;          /* show scrollbars when needed */
+            max-width: 100%;         /* respects whatever layout width you give it */
+        }
+        .cell-stdout .stdout-text {
+            margin: 0;               /* reset pre default margin */
+            white-space: pre;        /* keep line breaks, NO wrapping */
+            display: inline-block;   /* shrink-to-content */
+            min-width: max-content;  /* allow very long lines to define intrinsic width */
+            font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace;
+            tab-size: 2;
+        }
         .cell-stderr {
             background: var(--bg-error);
             border-left: 2px solid var(--border-error);
                 if(output){
                     output.classList.remove('output-stale');
                     let html='';
+                    if (data.stdout) {
+                    html += '<div class="cell-stdout"><pre class="stdout-text">'
+                        + escapeHtml(data.stdout)
+                        + '</pre></div>';
+                    }
                     console.log('UV Logs:', data);
                     if(data.stderr) {
                         // Split UV logs from regular stderr
             }
         }
+        // // Live reload functionality (robust SSE handling)
+        // (function(){
+        //     if (!('EventSource' in window)) {
+        //         console.warn('SSE not supported in this browser');
+        //         return;
+        //     }
+        //     let source = new EventSource('/events');
+        //     let isOpen = false;
+        //     source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
+        //     source.onmessage = function(e){
+        //         const msg=(e.data||'').trim(); if(!msg) return;
+        //         console.log('SSE message:', msg);
+        //         if (msg==='reload' || msg==='incremental') { location.reload(); }
+        //         // Ignore 'loading' to avoid premature reload loops
+        //     };
+        //     source.onerror = function(e){
+        //         // Let EventSource auto-reconnect instead of forcing a reload
+        //         if (isOpen) console.warn('SSE error after open, retrying...', e);
+        //     };
+        //     window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
+        // })();
         document.addEventListener('DOMContentLoaded', function() {
     <div class="system-info">
         <div class="system-info-header">Generated on:</div>
         <div class="system-info-content">
+            Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
         </div>
     </div>
 <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> |
+Cell: nv | 0.23s
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
 <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
 </div>
 </div>
 <div id="output-nv" class="cell-output">
+<div class="cell-stdout"><pre class="stdout-text">Thu Oct 23 17:21:49 2025
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
+| N/A   37C    P0             80W /  350W |       0MiB /  46068MiB |     13%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 |  No running processes found                                                             |
 +-----------------------------------------------------------------------------------------+
+</pre></div>
 </div>
 </div>
 <h2>SwiGLU Benchmark (torch.compile)</h2>
+<div class="cell" id="cell-benchmark">
 <div class="cell-header">
 <span class="collapse-indicators">
 <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
+ <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
 </span> |
+Cell: benchmark | 14.79s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 <span class="c1"># requires-python = &quot;&gt;=3.10&quot;</span>
 <span class="c1"># dependencies = [</span>
 <span class="c1">#     &quot;numpy&quot;,</span>
+<span class="c1">#     &quot;torch==2.8.0&quot;,</span>
 <span class="c1">#     &quot;kernels-benchmark-tools&quot;,</span>
 <span class="c1"># ]</span>
 <span class="c1">#</span>
 <span class="c1"># [tool.uv.sources]</span>
+<span class="c1"># kernels-benchmark-tools = { path = &quot;../../../../../tools&quot;, editable = true }</span>
 <span class="c1"># ///</span>
 <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
 <span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
 </div>
 </div>
 <div id="output-benchmark" class="cell-output">
+<div class="cell-stdout"><pre class="stdout-text">Running SwiGLU benchmarks on cuda with bfloat16
+Testing 3 workloads
+======================================================================
+PROFILE TRACE: compiled_swiglu_max_autotune | llama_T512_D4096
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                           compiled_swiglu_max_autotune         0.00%       0.000us         0.00%       0.000us       0.000us       1.851ms      5297.74%       1.851ms     925.622us             2
+                           compiled_swiglu_max_autotune         0.10%     159.779us        99.99%     166.375ms     166.375ms       0.000us         0.00%      38.816us      38.816us             1
+                             Torch-Compiled Region: 0/1         1.45%       2.415ms        99.86%     166.157ms      55.386ms      11.007us        31.50%      38.816us      12.939us             3
+                                   aten::_foreach_copy_         0.02%      39.542us         0.05%      87.165us      29.055us      21.600us        61.81%      21.600us       7.200us             3
+void at::native::(anonymous namespace)::multi_tensor...         0.00%       0.000us         0.00%       0.000us       0.000us      21.600us        61.81%      21.600us       7.200us             3
+                    CUDAGraphNode.record (dynamo_timed)         0.00%       0.000us         0.00%       0.000us       0.000us      20.673us        59.16%      20.673us      20.673us             1
+                            triton_poi_fused_mul_silu_0         0.00%       0.000us         0.00%       0.000us       0.000us      11.007us        31.50%      11.007us       3.669us             3
+                                Activity Buffer Request         0.86%       1.424ms         0.86%       1.424ms       1.424ms       3.872us        11.08%       3.872us       3.872us             1
+                    CUDAGraphNode.record (dynamo_timed)        96.87%     161.185ms        97.39%     162.045ms     162.045ms       0.000us         0.00%       2.337us       2.337us             1
+                                            aten::fill_         0.02%      34.251us         0.05%      74.934us      37.467us       2.337us         6.69%       2.337us       1.168us             2
+void at::native::vectorized_elementwise_kernel&lt;2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       2.337us         6.69%       2.337us       1.168us             2
+                               TorchDynamo Cache Lookup         0.03%      57.633us         0.03%      57.633us      19.211us       0.000us         0.00%       0.000us       0.000us             3
+                                      Pregraph bytecode         0.01%      12.280us         0.01%      12.280us       4.093us       0.000us         0.00%       0.000us       0.000us             3
+                 AOTDispatcher Runtime Wrapper Prologue         0.01%      21.352us         0.01%      21.352us       7.117us       0.000us         0.00%       0.000us       0.000us             3
+                                  cudaDeviceSynchronize         0.07%     111.205us         0.07%     111.205us      18.534us       0.000us         0.00%       0.000us       0.000us             6
+                                  cudaStreamIsCapturing         0.01%      10.600us         0.01%      10.600us       0.815us       0.000us         0.00%       0.000us       0.000us            13
+                               cudaEventRecordWithFlags         0.00%       4.751us         0.00%       4.751us       1.584us       0.000us         0.00%       0.000us       0.000us             3
+                                    cudaStreamWaitEvent         0.00%       4.550us         0.00%       4.550us       1.517us       0.000us         0.00%       0.000us       0.000us             3
+                                    aten::empty_strided         0.01%      14.680us         0.01%      14.680us       4.893us       0.000us         0.00%       0.000us       0.000us             3
+                                       cudaLaunchKernel         0.05%      88.306us         0.05%      88.306us      17.661us       0.000us         0.00%       0.000us       0.000us             5
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+Self CPU time total: 166.389ms
+Self CUDA time total: 34.944us
+======================================================================
+PROFILE TRACE: compiled_swiglu_max_autotune | llama_T512_D8192
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                           compiled_swiglu_max_autotune         0.00%       0.000us         0.00%       0.000us       0.000us       1.882ms      2857.54%       1.882ms     940.918us             2
+                           compiled_swiglu_max_autotune         0.08%     131.855us        99.99%     174.569ms     174.569ms       0.000us         0.00%      72.799us      72.799us             1
+                             Torch-Compiled Region: 0/3         1.26%       2.204ms        99.89%     174.392ms      58.131ms      18.240us        27.70%      72.799us      24.266us             3
+                                   aten::_foreach_copy_         0.02%      39.114us         0.05%      88.345us      29.448us      45.247us        68.71%      45.247us      15.082us             3
+void at::native::(anonymous namespace)::multi_tensor...         0.00%       0.000us         0.00%       0.000us       0.000us      45.247us        68.71%      45.247us      15.082us             3
+                    CUDAGraphNode.record (dynamo_timed)         0.00%       0.000us         0.00%       0.000us       0.000us      19.904us        30.22%      19.904us      19.904us             1
+                            triton_poi_fused_mul_silu_0         0.00%       0.000us         0.00%       0.000us       0.000us      18.240us        27.70%      18.240us       6.080us             3
+                                Activity Buffer Request         0.83%       1.441ms         0.83%       1.441ms       1.441ms       6.944us        10.54%       6.944us       6.944us             1
+                    CUDAGraphNode.record (dynamo_timed)        96.65%     168.746ms        97.67%     170.521ms     170.521ms       0.000us         0.00%       2.368us       2.368us             1
+                                            aten::fill_         0.02%      36.482us         0.04%      78.354us      39.177us       2.368us         3.60%       2.368us       1.184us             2
+void at::native::vectorized_elementwise_kernel&lt;2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       2.368us         3.60%       2.368us       1.184us             2
+                               TorchDynamo Cache Lookup         0.03%      45.013us         0.03%      45.013us      15.004us       0.000us         0.00%       0.000us       0.000us             3
+                                      Pregraph bytecode         0.01%       9.190us         0.01%       9.190us       3.063us       0.000us         0.00%       0.000us       0.000us             3
+                 AOTDispatcher Runtime Wrapper Prologue         0.01%      17.071us         0.01%      17.071us       5.690us       0.000us         0.00%       0.000us       0.000us             3
+                                  cudaDeviceSynchronize         0.04%      76.533us         0.04%      76.533us      12.755us       0.000us         0.00%       0.000us       0.000us             6
+                                  cudaStreamIsCapturing         0.01%       9.681us         0.01%       9.681us       0.745us       0.000us         0.00%       0.000us       0.000us            13
+                               cudaEventRecordWithFlags         0.00%       3.672us         0.00%       3.672us       1.224us       0.000us         0.00%       0.000us       0.000us             3
+                                    cudaStreamWaitEvent         0.00%       3.040us         0.00%       3.040us       1.013us       0.000us         0.00%       0.000us       0.000us             3
+                                    aten::empty_strided         0.01%      12.061us         0.01%      12.061us       4.020us       0.000us         0.00%       0.000us       0.000us             3
+                                       cudaLaunchKernel         0.05%      91.103us         0.05%      91.103us      18.221us       0.000us         0.00%       0.000us       0.000us             5
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+Self CPU time total: 174.590ms
+Self CUDA time total: 65.855us
+======================================================================
+PROFILE TRACE: compiled_swiglu_max_autotune | llama_T512_D11008
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                           compiled_swiglu_max_autotune         0.00%       0.000us         0.00%       0.000us       0.000us       1.863ms      1771.89%       1.863ms     931.590us             2
+                           compiled_swiglu_max_autotune         0.07%     121.234us        99.99%     174.986ms     174.986ms       0.000us         0.00%     113.760us     113.760us             1
+                             Torch-Compiled Region: 0/5         1.21%       2.117ms        99.90%     174.826ms      58.275ms      24.864us        23.65%     113.760us      37.920us             3
+                                   aten::_foreach_copy_         0.02%      36.152us         0.05%      83.124us      27.708us      78.144us        74.32%      78.144us      26.048us             3
+void at::native::(anonymous namespace)::multi_tensor...         0.00%       0.000us         0.00%       0.000us       0.000us      78.144us        74.32%      78.144us      26.048us             3
+                            triton_poi_fused_mul_silu_0         0.00%       0.000us         0.00%       0.000us       0.000us      24.864us        23.65%      24.864us       8.288us             3
+                    CUDAGraphNode.record (dynamo_timed)         0.00%       0.000us         0.00%       0.000us       0.000us      19.776us        18.81%      19.776us      19.776us             1
+                                Activity Buffer Request         0.77%       1.349ms         0.77%       1.349ms       1.349ms       8.608us         8.19%       8.608us       8.608us             1
+                    CUDAGraphNode.record (dynamo_timed)        96.23%     168.408ms        97.80%     171.145ms     171.145ms       0.000us         0.00%       2.144us       2.144us             1
+                                            aten::fill_         0.02%      32.121us         0.04%      72.933us      36.467us       2.144us         2.04%       2.144us       1.072us             2
+void at::native::vectorized_elementwise_kernel&lt;2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       2.144us         2.04%       2.144us       1.072us             2
+                               TorchDynamo Cache Lookup         0.02%      38.274us         0.02%      38.274us      12.758us       0.000us         0.00%       0.000us       0.000us             3
+                                      Pregraph bytecode         0.01%       9.421us         0.01%       9.421us       3.140us       0.000us         0.00%       0.000us       0.000us             3
+                 AOTDispatcher Runtime Wrapper Prologue         0.01%      14.201us         0.01%      14.201us       4.734us       0.000us         0.00%       0.000us       0.000us             3
+                                  cudaDeviceSynchronize         0.04%      73.664us         0.04%      73.664us      12.277us       0.000us         0.00%       0.000us       0.000us             6
+                                  cudaStreamIsCapturing         0.01%       9.722us         0.01%       9.722us       0.748us       0.000us         0.00%       0.000us       0.000us            13
+                               cudaEventRecordWithFlags         0.00%       3.409us         0.00%       3.409us       1.136us       0.000us         0.00%       0.000us       0.000us             3
+                                    cudaStreamWaitEvent         0.00%       2.910us         0.00%       2.910us       0.970us       0.000us         0.00%       0.000us       0.000us             3
+                                    aten::empty_strided         0.01%      11.600us         0.01%      11.600us       3.867us       0.000us         0.00%       0.000us       0.000us             3
+                                       cudaLaunchKernel         0.05%      87.784us         0.05%      87.784us      17.557us       0.000us         0.00%       0.000us       0.000us             5
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+Self CPU time total: 175.003ms
+Self CUDA time total: 105.152us
+impl                     wl                  p50(ms)  ok
+compiled_swiglu_max_autotune llama_T512_D11008      0.11  True
+compiled_swiglu_max_autotune llama_T512_D4096       0.10  True
+compiled_swiglu_max_autotune llama_T512_D8192       0.11  True
+</pre></div>
+<div class="uv-install-logs" id="uv-logs-benchmark">
+<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
+<div class="uv-logs-content" style="display: none;">
+Installed 37 packages in 247ms
+</div>
+</div>
+<div class="cell-artifacts">
+<h4>Artifacts:</h4>
+<a href="artifacts/benchmark/activation.jsonl" class="artifact" target="_blank">activation.jsonl</a>
 </div>
 </div>
 </div>

activation/impls/hf_kernels_swiglu.html CHANGED Viewed

@@ -706,6 +706,29 @@
             white-space: pre-wrap;
             color: var(--text-primary);
         }
         .cell-stderr {
             background: var(--bg-error);
             border-left: 2px solid var(--border-error);
@@ -3556,7 +3579,12 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
                 if(output){
                     output.classList.remove('output-stale');
                     let html='';
-                    if(data.stdout) html+='<div class="cell-stdout">'+escapeHtml(data.stdout)+'</div>';
                     console.log('UV Logs:', data);
                     if(data.stderr) {
                         // Split UV logs from regular stderr
@@ -3678,27 +3706,27 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
             }
         }
-        // Live reload functionality (robust SSE handling)
-        (function(){
-            if (!('EventSource' in window)) {
-                console.warn('SSE not supported in this browser');
-                return;
-            }
-            let source = new EventSource('/events');
-            let isOpen = false;
-            source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
-            source.onmessage = function(e){
-                const msg=(e.data||'').trim(); if(!msg) return;
-                console.log('SSE message:', msg);
-                if (msg==='reload' || msg==='incremental') { location.reload(); }
-                // Ignore 'loading' to avoid premature reload loops
-            };
-            source.onerror = function(e){
-                // Let EventSource auto-reconnect instead of forcing a reload
-                if (isOpen) console.warn('SSE error after open, retrying...', e);
-            };
-            window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
-        })();
         document.addEventListener('DOMContentLoaded', function() {
@@ -3829,7 +3857,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
     <div class="system-info">
         <div class="system-info-header">Generated on:</div>
         <div class="system-info-content">
-            Linux x86_64 | Linux-5.10.244-240.965.amzn2.x86_64-x86_64-with-glibc2.35
         </div>
     </div>
@@ -3843,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> |
-Cell: nv | 0.25s
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
 <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3859,7 +3887,7 @@ Cell: nv | 0.25s
 </div>
 </div>
 <div id="output-nv" class="cell-output">
-<div class="cell-stdout">Wed Oct 22 08:58:23 2025
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+
@@ -3868,7 +3896,7 @@ Cell: nv | 0.25s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   26C    P8             22W /  350W |       0MiB /  46068MiB |      0%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
@@ -3880,19 +3908,19 @@ Cell: nv | 0.25s
 |  No running processes found                                                             |
 +-----------------------------------------------------------------------------------------+
-</div>
 </div>
 </div>
 <h2>SwiGLU Benchmark</h2>
-<div class="cell cell-failed" id="cell-benchmark">
 <div class="cell-header">
 <span class="collapse-indicators">
 <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
- <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> |
-Cell: benchmark | 0.01s | FAILED
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3904,13 +3932,13 @@ Cell: benchmark | 0.01s | FAILED
 <span class="c1"># requires-python = &quot;&gt;=3.10&quot;</span>
 <span class="c1"># dependencies = [</span>
 <span class="c1">#     &quot;numpy&quot;,</span>
-<span class="c1">#     &quot;torch&quot;,</span>
 <span class="c1">#     &quot;kernels-benchmark-tools&quot;,</span>
 <span class="c1">#     &quot;kernels&quot;,</span>
 <span class="c1"># ]</span>
 <span class="c1">#</span>
 <span class="c1"># [tool.uv.sources]</span>
-<span class="c1"># kernels-benchmark-tools = { path = &quot;/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools&quot;, editable = true }</span>
 <span class="c1"># ///</span>
 <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
 <span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
@@ -3970,9 +3998,85 @@ Cell: benchmark | 0.01s | FAILED
 </div>
 </div>
 <div id="output-benchmark" class="cell-output">
-<div class="cell-stderr">  × Failed to resolve script requirement
-  ╰─▶ Distribution not found at:
-      file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
 </div>
 </div>
 </div>

             white-space: pre-wrap;
             color: var(--text-primary);
         }
+        .cell-stdout {
+            background: var(--bg-tertiary);
+            padding: 0.75rem;
+            border-radius: 1px;
+            font-family: inherit;
+            font-size: 0.9rem;
+            color: var(--text-primary);
+            /* key bits */
+            overflow: auto;          /* show scrollbars when needed */
+            max-width: 100%;         /* respects whatever layout width you give it */
+        }
+        .cell-stdout .stdout-text {
+            margin: 0;               /* reset pre default margin */
+            white-space: pre;        /* keep line breaks, NO wrapping */
+            display: inline-block;   /* shrink-to-content */
+            min-width: max-content;  /* allow very long lines to define intrinsic width */
+            font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace;
+            tab-size: 2;
+        }
         .cell-stderr {
             background: var(--bg-error);
             border-left: 2px solid var(--border-error);
                 if(output){
                     output.classList.remove('output-stale');
                     let html='';
+                    if (data.stdout) {
+                    html += '<div class="cell-stdout"><pre class="stdout-text">'
+                        + escapeHtml(data.stdout)
+                        + '</pre></div>';
+                    }
                     console.log('UV Logs:', data);
                     if(data.stderr) {
                         // Split UV logs from regular stderr
             }
         }
+        // // Live reload functionality (robust SSE handling)
+        // (function(){
+        //     if (!('EventSource' in window)) {
+        //         console.warn('SSE not supported in this browser');
+        //         return;
+        //     }
+        //     let source = new EventSource('/events');
+        //     let isOpen = false;
+        //     source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
+        //     source.onmessage = function(e){
+        //         const msg=(e.data||'').trim(); if(!msg) return;
+        //         console.log('SSE message:', msg);
+        //         if (msg==='reload' || msg==='incremental') { location.reload(); }
+        //         // Ignore 'loading' to avoid premature reload loops
+        //     };
+        //     source.onerror = function(e){
+        //         // Let EventSource auto-reconnect instead of forcing a reload
+        //         if (isOpen) console.warn('SSE error after open, retrying...', e);
+        //     };
+        //     window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
+        // })();
         document.addEventListener('DOMContentLoaded', function() {
     <div class="system-info">
         <div class="system-info-header">Generated on:</div>
         <div class="system-info-content">
+            Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
         </div>
     </div>
 <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> |
+Cell: nv | 0.23s
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
 <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
 </div>
 </div>
 <div id="output-nv" class="cell-output">
+<div class="cell-stdout"><pre class="stdout-text">Thu Oct 23 17:21:49 2025
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
+| N/A   37C    P0             80W /  350W |       0MiB /  46068MiB |     13%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 |  No running processes found                                                             |
 +-----------------------------------------------------------------------------------------+
+</pre></div>
 </div>
 </div>
 <h2>SwiGLU Benchmark</h2>
+<div class="cell" id="cell-benchmark">
 <div class="cell-header">
 <span class="collapse-indicators">
 <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
+ <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
 </span> |
+Cell: benchmark | 4.10s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 <span class="c1"># requires-python = &quot;&gt;=3.10&quot;</span>
 <span class="c1"># dependencies = [</span>
 <span class="c1">#     &quot;numpy&quot;,</span>
+<span class="c1">#     &quot;torch==2.8.0&quot;,</span>
 <span class="c1">#     &quot;kernels-benchmark-tools&quot;,</span>
 <span class="c1">#     &quot;kernels&quot;,</span>
 <span class="c1"># ]</span>
 <span class="c1">#</span>
 <span class="c1"># [tool.uv.sources]</span>
+<span class="c1"># kernels-benchmark-tools = { path = &quot;../../../../../tools&quot;, editable = true }</span>
 <span class="c1"># ///</span>
 <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
 <span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
 </div>
 </div>
 <div id="output-benchmark" class="cell-output">
+<div class="cell-stdout"><pre class="stdout-text">Running SwiGLU benchmarks on cuda with bfloat16
+Testing 3 workloads
+======================================================================
+PROFILE TRACE: hf_kernels_swiglu | llama_T512_D4096
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      77.600us       379.50%      77.600us      77.600us             1
+                                      hf_kernels_swiglu         9.39%     165.439us        99.61%       1.754ms       1.754ms       0.000us         0.00%      27.360us      27.360us             1
+                      _activation_beeaae6::silu_and_mul         1.24%      21.822us        87.35%       1.539ms     512.861us      20.448us       100.00%      27.360us       9.120us             3
+void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us      20.448us       100.00%      20.448us       6.816us             3
+                                Activity Buffer Request        83.94%       1.478ms        83.94%       1.478ms       1.478ms       6.912us        33.80%       6.912us       6.912us             1
+                                            aten::empty         2.87%      50.462us         2.87%      50.462us      16.821us       0.000us         0.00%       0.000us       0.000us             3
+                                       cudaLaunchKernel         2.17%      38.291us         2.17%      38.291us      12.764us       0.000us         0.00%       0.000us       0.000us             3
+                                  cudaDeviceSynchronize         0.39%       6.830us         0.39%       6.830us       6.830us       0.000us         0.00%       0.000us       0.000us             1
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+Self CPU time total: 1.761ms
+Self CUDA time total: 20.448us
+======================================================================
+PROFILE TRACE: hf_kernels_swiglu | llama_T512_D8192
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      70.720us       154.22%      70.720us      70.720us             1
+                                      hf_kernels_swiglu         5.60%      88.845us        99.68%       1.581ms       1.581ms       0.000us         0.00%      69.152us      69.152us             1
+                      _activation_beeaae6::silu_and_mul         1.32%      20.881us        92.90%       1.474ms     491.244us      45.856us       100.00%      69.152us      23.051us             3
+void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us      45.856us       100.00%      45.856us      15.285us             3
+                                Activity Buffer Request        89.94%       1.427ms        89.94%       1.427ms       1.427ms      23.296us        50.80%      23.296us      23.296us             1
+                                            aten::empty         1.18%      18.690us         1.18%      18.690us       6.230us       0.000us         0.00%       0.000us       0.000us             3
+                                       cudaLaunchKernel         1.64%      25.971us         1.64%      25.971us       8.657us       0.000us         0.00%       0.000us       0.000us             3
+                                  cudaDeviceSynchronize         0.32%       5.141us         0.32%       5.141us       5.141us       0.000us         0.00%       0.000us       0.000us             1
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+Self CPU time total: 1.586ms
+Self CUDA time total: 45.856us
+======================================================================
+PROFILE TRACE: hf_kernels_swiglu | llama_T512_D11008
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                      hf_kernels_swiglu         5.54%      88.883us        99.68%       1.600ms       1.600ms       0.000us         0.00%     123.326us     123.326us             1
+                      _activation_beeaae6::silu_and_mul         1.34%      21.482us        92.90%       1.491ms     497.111us      75.967us       100.00%     123.326us      41.109us             3
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      81.632us       107.46%      81.632us      81.632us             1
+void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us      75.967us       100.00%      75.967us      25.322us             3
+                                Activity Buffer Request        89.90%       1.443ms        89.90%       1.443ms       1.443ms      47.359us        62.34%      47.359us      47.359us             1
+                                            aten::empty         1.25%      19.991us         1.25%      19.991us       6.664us       0.000us         0.00%       0.000us       0.000us             3
+                                       cudaLaunchKernel         1.65%      26.561us         1.65%      26.561us       8.854us       0.000us         0.00%       0.000us       0.000us             3
+                                  cudaDeviceSynchronize         0.32%       5.170us         0.32%       5.170us       5.170us       0.000us         0.00%       0.000us       0.000us             1
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+Self CPU time total: 1.605ms
+Self CUDA time total: 75.967us
+impl                     wl                  p50(ms)  ok
+hf_kernels_swiglu        llama_T512_D11008      0.03  True
+hf_kernels_swiglu        llama_T512_D4096       0.02  True
+hf_kernels_swiglu        llama_T512_D8192       0.03  True
+</pre></div>
+<div class="uv-install-logs" id="uv-logs-benchmark">
+<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
+<div class="uv-logs-content" style="display: none;">
+Installed 10 packages in 14ms
+</div>
+</div>
+<div class="cell-stderr">Fetching 7 files:   0%|          | 0/7 [00:00&lt;?, ?it/s]
+Fetching 7 files:  71%|███████▏  | 5/7 [00:00&lt;00:00, 14.56it/s]
+Fetching 7 files: 100%|██████████| 7/7 [00:00&lt;00:00, 20.37it/s]</div>
+<div class="cell-artifacts">
+<h4>Artifacts:</h4>
+<a href="artifacts/benchmark/activation.jsonl" class="artifact" target="_blank">activation.jsonl</a>
 </div>
 </div>
 </div>

activation/impls/torch_swiglu.html CHANGED Viewed

@@ -706,6 +706,29 @@
             white-space: pre-wrap;
             color: var(--text-primary);
         }
         .cell-stderr {
             background: var(--bg-error);
             border-left: 2px solid var(--border-error);
@@ -3556,7 +3579,12 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
                 if(output){
                     output.classList.remove('output-stale');
                     let html='';
-                    if(data.stdout) html+='<div class="cell-stdout">'+escapeHtml(data.stdout)+'</div>';
                     console.log('UV Logs:', data);
                     if(data.stderr) {
                         // Split UV logs from regular stderr
@@ -3678,27 +3706,27 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
             }
         }
-        // Live reload functionality (robust SSE handling)
-        (function(){
-            if (!('EventSource' in window)) {
-                console.warn('SSE not supported in this browser');
-                return;
-            }
-            let source = new EventSource('/events');
-            let isOpen = false;
-            source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
-            source.onmessage = function(e){
-                const msg=(e.data||'').trim(); if(!msg) return;
-                console.log('SSE message:', msg);
-                if (msg==='reload' || msg==='incremental') { location.reload(); }
-                // Ignore 'loading' to avoid premature reload loops
-            };
-            source.onerror = function(e){
-                // Let EventSource auto-reconnect instead of forcing a reload
-                if (isOpen) console.warn('SSE error after open, retrying...', e);
-            };
-            window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
-        })();
         document.addEventListener('DOMContentLoaded', function() {
@@ -3829,7 +3857,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
     <div class="system-info">
         <div class="system-info-header">Generated on:</div>
         <div class="system-info-content">
-            Linux x86_64 | Linux-5.10.244-240.965.amzn2.x86_64-x86_64-with-glibc2.35
         </div>
     </div>
@@ -3843,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> |
-Cell: nv | 0.25s
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
 <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3859,7 +3887,7 @@ Cell: nv | 0.25s
 </div>
 </div>
 <div id="output-nv" class="cell-output">
-<div class="cell-stdout">Wed Oct 22 08:58:23 2025
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+
@@ -3868,7 +3896,7 @@ Cell: nv | 0.25s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   26C    P8             22W /  350W |       0MiB /  46068MiB |      0%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
@@ -3880,19 +3908,19 @@ Cell: nv | 0.25s
 |  No running processes found                                                             |
 +-----------------------------------------------------------------------------------------+
-</div>
 </div>
 </div>
 <h2>SwiGLU Benchmark (PyTorch Native)</h2>
-<div class="cell cell-failed" id="cell-benchmark">
 <div class="cell-header">
 <span class="collapse-indicators">
 <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> |
-Cell: benchmark | 0.02s | FAILED
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3904,12 +3932,12 @@ Cell: benchmark | 0.02s | FAILED
 <span class="c1"># requires-python = &quot;&gt;=3.10&quot;</span>
 <span class="c1"># dependencies = [</span>
 <span class="c1">#     &quot;numpy&quot;,</span>
-<span class="c1">#     &quot;torch&quot;,</span>
 <span class="c1">#     &quot;kernels-benchmark-tools&quot;,</span>
 <span class="c1"># ]</span>
 <span class="c1">#</span>
 <span class="c1"># [tool.uv.sources]</span>
-<span class="c1"># kernels-benchmark-tools = { path = &quot;/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools&quot;, editable = true }</span>
 <span class="c1"># ///</span>
 <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
 <span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
@@ -3966,9 +3994,85 @@ Cell: benchmark | 0.02s | FAILED
 </div>
 </div>
 <div id="output-benchmark" class="cell-output">
-<div class="cell-stderr">  × Failed to resolve script requirement
-  ╰─▶ Distribution not found at:
-      file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
 </div>
 </div>
 </div>

             white-space: pre-wrap;
             color: var(--text-primary);
         }
+        .cell-stdout {
+            background: var(--bg-tertiary);
+            padding: 0.75rem;
+            border-radius: 1px;
+            font-family: inherit;
+            font-size: 0.9rem;
+            color: var(--text-primary);
+            /* key bits */
+            overflow: auto;          /* show scrollbars when needed */
+            max-width: 100%;         /* respects whatever layout width you give it */
+        }
+        .cell-stdout .stdout-text {
+            margin: 0;               /* reset pre default margin */
+            white-space: pre;        /* keep line breaks, NO wrapping */
+            display: inline-block;   /* shrink-to-content */
+            min-width: max-content;  /* allow very long lines to define intrinsic width */
+            font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace;
+            tab-size: 2;
+        }
         .cell-stderr {
             background: var(--bg-error);
             border-left: 2px solid var(--border-error);
                 if(output){
                     output.classList.remove('output-stale');
                     let html='';
+                    if (data.stdout) {
+                    html += '<div class="cell-stdout"><pre class="stdout-text">'
+                        + escapeHtml(data.stdout)
+                        + '</pre></div>';
+                    }
                     console.log('UV Logs:', data);
                     if(data.stderr) {
                         // Split UV logs from regular stderr
             }
         }
+        // // Live reload functionality (robust SSE handling)
+        // (function(){
+        //     if (!('EventSource' in window)) {
+        //         console.warn('SSE not supported in this browser');
+        //         return;
+        //     }
+        //     let source = new EventSource('/events');
+        //     let isOpen = false;
+        //     source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
+        //     source.onmessage = function(e){
+        //         const msg=(e.data||'').trim(); if(!msg) return;
+        //         console.log('SSE message:', msg);
+        //         if (msg==='reload' || msg==='incremental') { location.reload(); }
+        //         // Ignore 'loading' to avoid premature reload loops
+        //     };
+        //     source.onerror = function(e){
+        //         // Let EventSource auto-reconnect instead of forcing a reload
+        //         if (isOpen) console.warn('SSE error after open, retrying...', e);
+        //     };
+        //     window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
+        // })();
         document.addEventListener('DOMContentLoaded', function() {
     <div class="system-info">
         <div class="system-info-header">Generated on:</div>
         <div class="system-info-content">
+            Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
         </div>
     </div>
 <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> |
+Cell: nv | 0.23s
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
 <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
 </div>
 </div>
 <div id="output-nv" class="cell-output">
+<div class="cell-stdout"><pre class="stdout-text">Thu Oct 23 17:21:49 2025
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
+| N/A   37C    P0             80W /  350W |       0MiB /  46068MiB |     13%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 |  No running processes found                                                             |
 +-----------------------------------------------------------------------------------------+
+</pre></div>
 </div>
 </div>
 <h2>SwiGLU Benchmark (PyTorch Native)</h2>
+<div class="cell" id="cell-benchmark">
 <div class="cell-header">
 <span class="collapse-indicators">
 <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> |
+Cell: benchmark | 3.41s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 <span class="c1"># requires-python = &quot;&gt;=3.10&quot;</span>
 <span class="c1"># dependencies = [</span>
 <span class="c1">#     &quot;numpy&quot;,</span>
+<span class="c1">#     &quot;torch==2.8.0&quot;,</span>
 <span class="c1">#     &quot;kernels-benchmark-tools&quot;,</span>
 <span class="c1"># ]</span>
 <span class="c1">#</span>
 <span class="c1"># [tool.uv.sources]</span>
+<span class="c1"># kernels-benchmark-tools = { path = &quot;../../../../../tools&quot;, editable = true }</span>
 <span class="c1"># ///</span>
 <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
 <span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
 </div>
 </div>
 <div id="output-benchmark" class="cell-output">
+<div class="cell-stdout"><pre class="stdout-text">Running SwiGLU benchmarks on cuda with bfloat16
+Testing 3 workloads
+======================================================================
+PROFILE TRACE: torch_swiglu | llama_T512_D4096
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                           torch_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us     170.400us       513.50%     170.400us     170.400us             1
+                                           torch_swiglu        10.35%     190.189us        99.61%       1.830ms       1.830ms       0.000us         0.00%      39.104us      39.104us             1
+                                             aten::silu         3.11%      57.064us        83.35%       1.532ms     510.522us      17.280us        52.07%      23.200us       7.733us             3
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      17.280us        52.07%      17.280us       5.760us             3
+                                              aten::mul         2.20%      40.433us         3.25%      59.723us      19.908us      15.904us        47.93%      15.904us       5.301us             3
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      15.904us        47.93%      15.904us       5.301us             3
+                                Activity Buffer Request        77.87%       1.431ms        77.87%       1.431ms       1.431ms       5.920us        17.84%       5.920us       5.920us             1
+                                            aten::slice         2.14%      39.352us         2.66%      48.892us       8.149us       0.000us         0.00%       0.000us       0.000us             6
+                                       aten::as_strided         0.52%       9.540us         0.52%       9.540us       1.590us       0.000us         0.00%       0.000us       0.000us             6
+                                       cudaLaunchKernel         3.42%      62.871us         3.42%      62.871us      10.479us       0.000us         0.00%       0.000us       0.000us             6
+                                  cudaDeviceSynchronize         0.39%       7.170us         0.39%       7.170us       7.170us       0.000us         0.00%       0.000us       0.000us             1
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+Self CPU time total: 1.838ms
+Self CUDA time total: 33.184us
+======================================================================
+PROFILE TRACE: torch_swiglu | llama_T512_D8192
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                           torch_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us     144.478us       207.68%     144.478us     144.478us             1
+                                           torch_swiglu         6.51%     109.976us        99.67%       1.683ms       1.683ms       0.000us         0.00%      87.038us      87.038us             1
+                                             aten::silu         2.61%      44.013us        89.15%       1.506ms     501.918us      36.351us        52.25%      53.823us      17.941us             3
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      36.351us        52.25%      36.351us      12.117us             3
+                                              aten::mul         1.57%      26.450us         2.46%      41.521us      13.840us      33.215us        47.75%      33.215us      11.072us             3
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      33.215us        47.75%      33.215us      11.072us             3
+                                Activity Buffer Request        84.91%       1.434ms        84.91%       1.434ms       1.434ms      17.472us        25.12%      17.472us      17.472us             1
+                                            aten::slice         1.23%      20.821us         1.55%      26.141us       4.357us       0.000us         0.00%       0.000us       0.000us             6
+                                       aten::as_strided         0.31%       5.320us         0.31%       5.320us       0.887us       0.000us         0.00%       0.000us       0.000us             6
+                                       cudaLaunchKernel         2.52%      42.602us         2.52%      42.602us       7.100us       0.000us         0.00%       0.000us       0.000us             6
+                                  cudaDeviceSynchronize         0.33%       5.630us         0.33%       5.630us       5.630us       0.000us         0.00%       0.000us       0.000us             1
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+Self CPU time total: 1.689ms
+Self CUDA time total: 69.566us
+======================================================================
+PROFILE TRACE: torch_swiglu | llama_T512_D11008
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                           torch_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us     147.999us       151.09%     147.999us     147.999us             1
+                                           torch_swiglu         7.64%     131.036us        99.70%       1.710ms       1.710ms       0.000us         0.00%     124.063us     124.063us             1
+                                             aten::silu         2.56%      43.903us        88.06%       1.510ms     503.475us      50.015us        51.06%      76.127us      25.376us             3
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      50.015us        51.06%      50.015us      16.672us             3
+                                              aten::mul         1.50%      25.771us         2.43%      41.641us      13.880us      47.936us        48.94%      47.936us      15.979us             3
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      47.936us        48.94%      47.936us      15.979us             3
+                                Activity Buffer Request        83.94%       1.440ms        83.94%       1.440ms       1.440ms      26.112us        26.66%      26.112us      26.112us             1
+                                            aten::slice         1.28%      22.003us         1.58%      27.082us       4.514us       0.000us         0.00%       0.000us       0.000us             6
+                                       aten::as_strided         0.30%       5.079us         0.30%       5.079us       0.846us       0.000us         0.00%       0.000us       0.000us             6
+                                       cudaLaunchKernel         2.48%      42.561us         2.48%      42.561us       7.093us       0.000us         0.00%       0.000us       0.000us             6
+                                  cudaDeviceSynchronize         0.30%       5.120us         0.30%       5.120us       5.120us       0.000us         0.00%       0.000us       0.000us             1
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+Self CPU time total: 1.715ms
+Self CUDA time total: 97.951us
+impl                     wl                  p50(ms)  ok
+torch_swiglu             llama_T512_D11008      0.05  True
+torch_swiglu             llama_T512_D4096       0.04  True
+torch_swiglu             llama_T512_D8192       0.05  True
+</pre></div>
+<div class="cell-artifacts">
+<h4>Artifacts:</h4>
+<a href="artifacts/benchmark/activation.jsonl" class="artifact" target="_blank">activation.jsonl</a>
 </div>
 </div>
 </div>

activation/results/artifacts/combine/latency.svg ADDED Viewed

Git LFS Details

SHA256: 1d1e9eae17f133adc5891c297d9d75eafd2f519d8bef5ddcb971d9333606511e
Pointer size: 130 Bytes
Size of remote file: 15.6 kB

activation/results/cells/combine.py ADDED Viewed

	@@ -0,0 +1,27 @@

+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "numpy",
+#     "torch==2.8.0",
+#     "kernels-benchmark-tools",
+#     "matplotlib",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+from kernels_benchmark_tools.core.visuals import generate_combined_results
+# Map display names to uvnote environment variables
+cache_env_map = {
+    "HF Kernels SwiGLU": "UVNOTE_FILE_HF_KERNELS_SWIGLU_BENCHMARK",
+    "PyTorch SwiGLU": "UVNOTE_FILE_TORCH_SWIGLU_BENCHMARK",
+    "Compiled SwiGLU": "UVNOTE_FILE_COMPILED_SWIGLU_BENCHMARK",
+}
+# Generate combined results with visualization
+generate_combined_results(
+    cache_env_map=cache_env_map,
+    output_filename="activation.jsonl",
+    svg_filename="latency.svg"
+)

activation/results/combined_results.html ADDED Viewed

The diff for this file is too large to render. See raw diff

flash_attn/impls/artifacts/benchmark/attn.jsonl CHANGED Viewed

@@ -1,6 +1,6 @@
-{"ts": "2025-10-02T16:08:21Z", "run": "4862bb56aac04f66908d2a97924104e2", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.3389439880847931, "p50": 0.3461120128631592, "p90": 0.3461120128631592, "mean": 0.3452928066253662, "reps": 5, "warmup": 2}, "compile_ms": 0.9463679790496826, "peak_bytes": 87425024, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000362396240234375, "mse": 2.9206275939941406e-06, "ref": "sdpa_math_fp32"}, "err": null}
-{"ts": "2025-10-02T16:08:21Z", "run": "4862bb56aac04f66908d2a97924104e2", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.40959998965263367, "p50": 0.41280001401901245, "p90": 0.41286399960517883, "mean": 0.41234560012817384, "reps": 5, "warmup": 2}, "compile_ms": 0.34329599142074585, "peak_bytes": 95027200, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.8908252716064453e-06, "ref": "sdpa_math_fp32"}, "err": null}
-{"ts": "2025-10-02T16:08:21Z", "run": "4862bb56aac04f66908d2a97924104e2", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.4310399889945984, "p50": 0.4331519901752472, "p90": 0.4362240135669708, "mean": 0.4366208016872406, "reps": 5, "warmup": 2}, "compile_ms": 0.35942399501800537, "peak_bytes": 99680256, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003566741943359375, "mse": 2.8759241104125977e-06, "ref": "sdpa_math_fp32"}, "err": null}
-{"ts": "2025-10-02T16:08:21Z", "run": "4862bb56aac04f66908d2a97924104e2", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.4359680116176605, "p50": 0.44361600279808044, "p90": 0.447488009929657, "mean": 0.4450624048709869, "reps": 5, "warmup": 2}, "compile_ms": 0.3678080141544342, "peak_bytes": 104726528, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003604888916015625, "mse": 2.8759241104125977e-06, "ref": "sdpa_math_fp32"}, "err": null}
-{"ts": "2025-10-02T16:08:21Z", "run": "4862bb56aac04f66908d2a97924104e2", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.4711039960384369, "p50": 0.47513601183891296, "p90": 0.4763199985027313, "mean": 0.4750400006771088, "reps": 5, "warmup": 2}, "compile_ms": 0.40857601165771484, "peak_bytes": 108855296, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.86102294921875e-06, "ref": "sdpa_math_fp32"}, "err": null}
-{"ts": "2025-10-02T16:08:21Z", "run": "4862bb56aac04f66908d2a97924104e2", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.49663999676704407, "p50": 0.4997119903564453, "p90": 0.5038080215454102, "mean": 0.5009407997131348, "reps": 5, "warmup": 2}, "compile_ms": 0.43724799156188965, "peak_bytes": 114425856, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003566741943359375, "mse": 2.8908252716064453e-06, "ref": "sdpa_math_fp32"}, "err": null}

+{"ts": "2025-10-23T17:22:14Z", "run": "1b435099e2fc4712a8a21c79100926bd", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.17766900009519304, "p50": 0.1805790000162233, "p90": 0.1809689999845432, "mean": 0.18065700000988727, "iqr": 0.0005199999577598646, "raw_times": [0.17766900009519304, 0.18044900002678332, 0.1836189999266935, 0.1805790000162233, 0.1809689999845432], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.18813999986377894, "peak_bytes": 87425024, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.8014183044433594e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-23T17:22:14Z", "run": "1b435099e2fc4712a8a21c79100926bd", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2040300000771822, "p50": 0.208629999860932, "p90": 0.20883999991383462, "mean": 0.2071937999517104, "iqr": 0.004771000021719374, "raw_times": [0.208629999860932, 0.2104000000144879, 0.20883999991383462, 0.20406899989211524, 0.2040300000771822], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2113399998506793, "peak_bytes": 95027200, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000347137451171875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-23T17:22:15Z", "run": "1b435099e2fc4712a8a21c79100926bd", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21214000003055844, "p50": 0.22414099998968595, "p90": 0.22725099984199915, "mean": 0.22296499996627972, "iqr": 0.014549999832524918, "raw_times": [0.22414099998968595, 0.21270100000947423, 0.23859199995968083, 0.21214000003055844, 0.22725099984199915], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.215200000184268, "peak_bytes": 99680256, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000347137451171875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-23T17:22:15Z", "run": "1b435099e2fc4712a8a21c79100926bd", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2172510000946204, "p50": 0.21762999995189602, "p90": 0.229150999984995, "mean": 0.22471280003628635, "iqr": 0.011839999842777615, "raw_times": [0.2172510000946204, 0.229150999984995, 0.21762999995189602, 0.24222100000770297, 0.21731100014221738], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22035099982531392, "peak_bytes": 104726528, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-23T17:22:15Z", "run": "1b435099e2fc4712a8a21c79100926bd", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2690430001166533, "p50": 0.2719639999213541, "p90": 0.2809840000281838, "mean": 0.27520160006133665, "iqr": 0.011710999842762249, "raw_times": [0.2719639999213541, 0.26927300018542155, 0.2690430001166533, 0.2809840000281838, 0.2847440000550705], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.26890300000559364, "peak_bytes": 108855296, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-23T17:22:15Z", "run": "1b435099e2fc4712a8a21c79100926bd", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.27566299991121923, "p50": 0.2808829999594309, "p90": 0.29306400006134936, "mean": 0.2846773999863217, "iqr": 0.01699100016594457, "raw_times": [0.29306400006134936, 0.2760729998954048, 0.2808829999594309, 0.29770400010420417, 0.27566299991121923], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2820939998855465, "peak_bytes": 114425856, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}

flash_attn/impls/artifacts/benchmark_default/attn_default.jsonl CHANGED Viewed

@@ -1,6 +1,6 @@
-{"ts": "2025-10-02T16:11:55Z", "run": "e9857dd2d39d4b40a6c91c0fdad82b00", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.3563520014286041, "p50": 0.35942399501800537, "p90": 0.3624959886074066, "mean": 0.3856383919715881, "reps": 5, "warmup": 2}, "compile_ms": 2383.33544921875, "peak_bytes": 87425024, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.771615982055664e-06, "ref": "sdpa_math_fp32"}, "err": null}
-{"ts": "2025-10-02T16:11:55Z", "run": "e9857dd2d39d4b40a6c91c0fdad82b00", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.4926080107688904, "p50": 0.49663999676704407, "p90": 0.5017600059509277, "mean": 0.4982912003993988, "reps": 5, "warmup": 2}, "compile_ms": 76.60860443115234, "peak_bytes": 95027200, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003414154052734375, "mse": 2.726912498474121e-06, "ref": "sdpa_math_fp32"}, "err": null}
-{"ts": "2025-10-02T16:11:55Z", "run": "e9857dd2d39d4b40a6c91c0fdad82b00", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.5335040092468262, "p50": 0.5366079807281494, "p90": 0.5386239886283875, "mean": 0.5369919896125793, "reps": 5, "warmup": 2}, "compile_ms": 74.49088287353516, "peak_bytes": 99876864, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003414154052734375, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
-{"ts": "2025-10-02T16:11:55Z", "run": "e9857dd2d39d4b40a6c91c0fdad82b00", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.5775359869003296, "p50": 0.5868800282478333, "p90": 0.5877760052680969, "mean": 0.5841408014297486, "reps": 5, "warmup": 2}, "compile_ms": 72.97433471679688, "peak_bytes": 104726528, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
-{"ts": "2025-10-02T16:11:56Z", "run": "e9857dd2d39d4b40a6c91c0fdad82b00", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.6072319746017456, "p50": 0.6113280057907104, "p90": 0.6144000291824341, "mean": 0.6184704065322876, "reps": 5, "warmup": 2}, "compile_ms": 215.12498474121094, "peak_bytes": 108855296, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
-{"ts": "2025-10-02T16:11:56Z", "run": "e9857dd2d39d4b40a6c91c0fdad82b00", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.6399999856948853, "p50": 0.6430720090866089, "p90": 0.6430720090866089, "mean": 0.6428672075271606, "reps": 5, "warmup": 2}, "compile_ms": 71.8028793334961, "peak_bytes": 114425856, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.771615982055664e-06, "ref": "sdpa_math_fp32"}, "err": null}

+{"ts": "2025-10-23T17:21:26Z", "run": "ab04a13d0bc147c5aeb6a1c30c52fb98", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.19892000000254484, "p50": 0.20128900018789864, "p90": 0.20218000008753734, "mean": 0.20126180006627692, "iqr": 0.0013400001535046613, "raw_times": [0.19892000000254484, 0.20083999993403268, 0.20128900018789864, 0.2030800001193711, 0.20218000008753734], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2210709999417304, "peak_bytes": 152174592, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.8014183044433594e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-23T17:21:26Z", "run": "ab04a13d0bc147c5aeb6a1c30c52fb98", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.229150999984995, "p50": 0.22967100017012854, "p90": 0.23078200001691584, "mean": 0.23312540001825255, "iqr": 0.0012210000477352878, "raw_times": [0.23078200001691584, 0.22956099996918056, 0.22967100017012854, 0.2464619999500428, 0.229150999984995], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2712529999371327, "peak_bytes": 163971072, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000347137451171875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-23T17:21:27Z", "run": "ab04a13d0bc147c5aeb6a1c30c52fb98", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2344019999327429, "p50": 0.23504099999627215, "p90": 0.23719199998595286, "mean": 0.23960979997355025, "iqr": 0.0026000000161729986, "raw_times": [0.2568219999830035, 0.23719199998595286, 0.2344019999327429, 0.23459199996977986, 0.23504099999627215], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.24443200004498067, "peak_bytes": 167116800, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000347137451171875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-23T17:21:27Z", "run": "ab04a13d0bc147c5aeb6a1c30c52fb98", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.23659099997530575, "p50": 0.23880100002315885, "p90": 0.23884200004431477, "mean": 0.23843920002946106, "iqr": 0.0014209999790182337, "raw_times": [0.23880100002315885, 0.2405410000392294, 0.23742100006529654, 0.23884200004431477, 0.23659099997530575], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.25097200000345765, "peak_bytes": 169345024, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-23T17:21:27Z", "run": "ab04a13d0bc147c5aeb6a1c30c52fb98", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": null, "compile_ms": null, "peak_bytes": null, "ok": false, "absmax": null, "corr": {}, "err": {"type": "FailOnRecompileLimitHit", "msg": "recompile_limit reached with one_graph=True. Excessive recompilations can degrade performance due to the compilation overhead of each recompilation. To monitor recompilations, enable TORCH_LOGS=recompiles. If recompilations are expected, consider increasing torch._dynamo.config.cache_size_limit to an appropriate value."}}
+{"ts": "2025-10-23T17:21:27Z", "run": "ab04a13d0bc147c5aeb6a1c30c52fb98", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": null, "compile_ms": null, "peak_bytes": null, "ok": false, "absmax": null, "corr": {}, "err": {"type": "FailOnRecompileLimitHit", "msg": "recompile_limit reached with one_graph=True. Excessive recompilations can degrade performance due to the compilation overhead of each recompilation. To monitor recompilations, enable TORCH_LOGS=recompiles. If recompilations are expected, consider increasing torch._dynamo.config.cache_size_limit to an appropriate value."}}

flash_attn/impls/artifacts/benchmark_max_autotune/attn_max_autotune.jsonl CHANGED Viewed

@@ -1,6 +1,6 @@
-{"ts": "2025-10-02T16:11:08Z", "run": "06f2face3c924e1b89a35a0fb568d4b1", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.3665919899940491, "p50": 0.3768320083618164, "p90": 0.41171199083328247, "mean": 0.40020479559898375, "reps": 5, "warmup": 2}, "compile_ms": 2910.97705078125, "peak_bytes": 85722112, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.771615982055664e-06, "ref": "sdpa_math_fp32"}, "err": null}
-{"ts": "2025-10-02T16:11:08Z", "run": "06f2face3c924e1b89a35a0fb568d4b1", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.5160959959030151, "p50": 0.5489599704742432, "p90": 0.5631359815597534, "mean": 0.5535807967185974, "reps": 5, "warmup": 2}, "compile_ms": 85.84806060791016, "peak_bytes": 97387520, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003414154052734375, "mse": 2.726912498474121e-06, "ref": "sdpa_math_fp32"}, "err": null}
-{"ts": "2025-10-02T16:11:09Z", "run": "06f2face3c924e1b89a35a0fb568d4b1", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.562175989151001, "p50": 0.6144000291824341, "p90": 0.6318079829216003, "mean": 0.6143999934196472, "reps": 5, "warmup": 2}, "compile_ms": 82.77401733398438, "peak_bytes": 99746816, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003414154052734375, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
-{"ts": "2025-10-02T16:11:09Z", "run": "06f2face3c924e1b89a35a0fb568d4b1", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.6512640118598938, "p50": 0.6584320068359375, "p90": 0.6799359917640686, "mean": 0.6754495978355408, "reps": 5, "warmup": 2}, "compile_ms": 81.94969940185547, "peak_bytes": 101843968, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
-{"ts": "2025-10-02T16:11:09Z", "run": "06f2face3c924e1b89a35a0fb568d4b1", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.6973119974136353, "p50": 0.7014080286026001, "p90": 0.7229440212249756, "mean": 0.7210752129554748, "reps": 5, "warmup": 2}, "compile_ms": 81.1141128540039, "peak_bytes": 103810048, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
-{"ts": "2025-10-02T16:11:10Z", "run": "06f2face3c924e1b89a35a0fb568d4b1", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.7485439777374268, "p50": 0.7557439804077148, "p90": 0.7710719704627991, "mean": 0.7735359907150269, "reps": 5, "warmup": 2}, "compile_ms": 767.1397094726562, "peak_bytes": 106562560, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.771615982055664e-06, "ref": "sdpa_math_fp32"}, "err": null}

+{"ts": "2025-10-23T17:21:09Z", "run": "328ee5954ea34c76830340b8d2ddded5", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.18720899993240891, "p50": 0.19000999986928946, "p90": 0.1910489997953846, "mean": 0.1901993999581464, "iqr": 0.002129999757016776, "raw_times": [0.19381000015528116, 0.1889190000383678, 0.1910489997953846, 0.19000999986928946, 0.18720899993240891], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.3206559999853198, "peak_bytes": 143131648, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.8014183044433594e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-23T17:21:11Z", "run": "328ee5954ea34c76830340b8d2ddded5", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.1962300000286632, "p50": 0.19820000011350203, "p90": 0.20246899998710433, "mean": 0.19939980002163793, "iqr": 0.00514900011694408, "raw_times": [0.19731999987016025, 0.19820000011350203, 0.20246899998710433, 0.1962300000286632, 0.20278000010875985], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.30226499984564725, "peak_bytes": 147850240, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000347137451171875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-23T17:21:13Z", "run": "328ee5954ea34c76830340b8d2ddded5", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2074609999453969, "p50": 0.20947000007254246, "p90": 0.21126000001459033, "mean": 0.2095743999689148, "iqr": 0.0030890000743966084, "raw_times": [0.2115099998718506, 0.21126000001459033, 0.20947000007254246, 0.2074609999453969, 0.20817099994019372], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.31840599990573537, "peak_bytes": 150209536, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000347137451171875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-23T17:21:15Z", "run": "328ee5954ea34c76830340b8d2ddded5", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21225999989837874, "p50": 0.21317000005183218, "p90": 0.21427999990919488, "mean": 0.21412619998955051, "iqr": 0.0015599998732795939, "raw_times": [0.21225999989837874, 0.21317000005183218, 0.21427999990919488, 0.2127200000359153, 0.21820100005243148], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.3197160001491284, "peak_bytes": 152568832, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-23T17:21:15Z", "run": "328ee5954ea34c76830340b8d2ddded5", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": null, "compile_ms": null, "peak_bytes": null, "ok": false, "absmax": null, "corr": {}, "err": {"type": "FailOnRecompileLimitHit", "msg": "recompile_limit reached with one_graph=True. Excessive recompilations can degrade performance due to the compilation overhead of each recompilation. To monitor recompilations, enable TORCH_LOGS=recompiles. If recompilations are expected, consider increasing torch._dynamo.config.cache_size_limit to an appropriate value."}}
+{"ts": "2025-10-23T17:21:15Z", "run": "328ee5954ea34c76830340b8d2ddded5", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": null, "compile_ms": null, "peak_bytes": null, "ok": false, "absmax": null, "corr": {}, "err": {"type": "FailOnRecompileLimitHit", "msg": "recompile_limit reached with one_graph=True. Excessive recompilations can degrade performance due to the compilation overhead of each recompilation. To monitor recompilations, enable TORCH_LOGS=recompiles. If recompilations are expected, consider increasing torch._dynamo.config.cache_size_limit to an appropriate value."}}

flash_attn/impls/cells/benchmark.py CHANGED Viewed

@@ -2,38 +2,36 @@
 # requires-python = ">=3.10"
 # dependencies = [
 #     "numpy",
-#     "torch",
 #     "kernels-benchmark-tools",
-#     "xformers",
 # ]
 #
 # [tool.uv.sources]
-# kernels-benchmark-tools = { path = "/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools", editable = true }
 # ///
 import torch
 import sys
 import os
 import kernels_benchmark_tools as kbt
-import xformers.ops as xops
-def xformers_attention(q, k, v):
-    """xFormers memory efficient attention"""
-    # xFormers expects [batch, seq_len, heads, head_dim]
-    return xops.memory_efficient_attention(q, k, v)
 kbt.add(
-    "xformers_meff",
-    xformers_attention,
-    tags={"family": "xformers", "backend": "memory_efficient", "compile": "none"},
 )
 if __name__ == "__main__":
     device = "cuda" if torch.cuda.is_available() else "cpu"
     dtype = "float32" if device == "cpu" else "bfloat16"
-    # Flux-like workloads
     base = 1024 if device == "cuda" else 512
     flux_sizes = (
         [128, 256, 320, 384, 448, 512] if device == "cuda" else [64, 128, 192, 256]

 # requires-python = ">=3.10"
 # dependencies = [
 #     "numpy",
+#     "torch==2.8.0",
 #     "kernels-benchmark-tools",
 # ]
 #
 # [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
 # ///
 import torch
 import sys
 import os
 import kernels_benchmark_tools as kbt
+def torch_flash(q, k, v):
+    qt, kt, vt = (x.transpose(1, 2).contiguous() for x in (q, k, v))
+    with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.FLASH_ATTENTION):
+        o = torch.nn.functional.scaled_dot_product_attention(qt, kt, vt)
+    return o.transpose(1, 2).contiguous()
 kbt.add(
+    "torch_flash_ma",
+    torch_flash,
+    tags={"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"},
 )
 if __name__ == "__main__":
     device = "cuda" if torch.cuda.is_available() else "cpu"
     dtype = "float32" if device == "cpu" else "bfloat16"
+    # Flux-like workloads scaled down for CPU testing
     base = 1024 if device == "cuda" else 512
     flux_sizes = (
         [128, 256, 320, 384, 448, 512] if device == "cuda" else [64, 128, 192, 256]

flash_attn/impls/cells/benchmark_default.py CHANGED Viewed

@@ -2,12 +2,12 @@
 # requires-python = ">=3.10"
 # dependencies = [
 #     "numpy",
-#     "torch",
 #     "kernels-benchmark-tools",
 # ]
 #
 # [tool.uv.sources]
-# kernels-benchmark-tools = { path = "/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools", editable = true }
 # ///
 import torch
 import sys

 # requires-python = ">=3.10"
 # dependencies = [
 #     "numpy",
+#     "torch==2.8.0",
 #     "kernels-benchmark-tools",
 # ]
 #
 # [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
 # ///
 import torch
 import sys

flash_attn/impls/cells/benchmark_max_autotune.py CHANGED Viewed

@@ -2,12 +2,12 @@
 # requires-python = ">=3.10"
 # dependencies = [
 #     "numpy",
-#     "torch",
 #     "kernels-benchmark-tools",
 # ]
 #
 # [tool.uv.sources]
-# kernels-benchmark-tools = { git = "https://github.com/drbh/kernels-benchmark-tools.git", branch = "main" }
 # ///
 import torch
 import sys

 # requires-python = ">=3.10"
 # dependencies = [
 #     "numpy",
+#     "torch==2.8.0",
 #     "kernels-benchmark-tools",
 # ]
 #
 # [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
 # ///
 import torch
 import sys

flash_attn/impls/compiled_variants.html CHANGED Viewed

@@ -706,6 +706,29 @@
             white-space: pre-wrap;
             color: var(--text-primary);
         }
         .cell-stderr {
             background: var(--bg-error);
             border-left: 2px solid var(--border-error);
@@ -3556,7 +3579,12 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
                 if(output){
                     output.classList.remove('output-stale');
                     let html='';
-                    if(data.stdout) html+='<div class="cell-stdout">'+escapeHtml(data.stdout)+'</div>';
                     console.log('UV Logs:', data);
                     if(data.stderr) {
                         // Split UV logs from regular stderr
@@ -3678,27 +3706,27 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
             }
         }
-        // Live reload functionality (robust SSE handling)
-        (function(){
-            if (!('EventSource' in window)) {
-                console.warn('SSE not supported in this browser');
-                return;
-            }
-            let source = new EventSource('/events');
-            let isOpen = false;
-            source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
-            source.onmessage = function(e){
-                const msg=(e.data||'').trim(); if(!msg) return;
-                console.log('SSE message:', msg);
-                if (msg==='reload' || msg==='incremental') { location.reload(); }
-                // Ignore 'loading' to avoid premature reload loops
-            };
-            source.onerror = function(e){
-                // Let EventSource auto-reconnect instead of forcing a reload
-                if (isOpen) console.warn('SSE error after open, retrying...', e);
-            };
-            window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
-        })();
         document.addEventListener('DOMContentLoaded', function() {
@@ -3829,7 +3857,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
     <div class="system-info">
         <div class="system-info-header">Generated on:</div>
         <div class="system-info-content">
-            Linux x86_64 | Linux-5.10.244-240.965.amzn2.x86_64-x86_64-with-glibc2.35
         </div>
     </div>
@@ -3837,14 +3865,14 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
         <h1>Torch Compile Variants!</h1>
 <p>This file benchmarks Flash Attention with different torch.compile modes.</p>
 <h2>Flash Attention with torch.compile(mode="default")</h2>
-<div class="cell cell-failed" id="cell-benchmark_default">
 <div class="cell-header">
 <span class="collapse-indicators">
 <span onclick="toggleCode('benchmark_default')" style="cursor: pointer;">▼ code</span>
 <span onclick="toggleOutput('benchmark_default')" style="cursor: pointer;">▼ output</span>
- <span id="uv-indicator-benchmark_default" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> |
-Cell: benchmark_default | 0.02s | FAILED
  | <button class="run-btn" onclick="runCell('benchmark_default')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark_default')">Copy</button>
 <a href="cells/benchmark_default.py" target="_blank" class="raw-btn">Raw</a>
@@ -3856,12 +3884,12 @@ Cell: benchmark_default | 0.02s | FAILED
 <span class="c1"># requires-python = &quot;&gt;=3.10&quot;</span>
 <span class="c1"># dependencies = [</span>
 <span class="c1">#     &quot;numpy&quot;,</span>
-<span class="c1">#     &quot;torch&quot;,</span>
 <span class="c1">#     &quot;kernels-benchmark-tools&quot;,</span>
 <span class="c1"># ]</span>
 <span class="c1">#</span>
 <span class="c1"># [tool.uv.sources]</span>
-<span class="c1"># kernels-benchmark-tools = { path = &quot;/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools&quot;, editable = true }</span>
 <span class="c1"># ///</span>
 <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
 <span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
@@ -3929,14 +3957,291 @@ Cell: benchmark_default | 0.02s | FAILED
 </div>
 </div>
 <div id="output-benchmark_default" class="cell-output">
-<div class="cell-stderr">  × Failed to resolve script requirement
-  ╰─▶ Distribution not found at:
-      file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
 </div>
 </div>
 </div>
 <h2>Flash Attention with torch.compile(mode="max-autotune")</h2>
     </div>
 </body>

             white-space: pre-wrap;
             color: var(--text-primary);
         }
+        .cell-stdout {
+            background: var(--bg-tertiary);
+            padding: 0.75rem;
+            border-radius: 1px;
+            font-family: inherit;
+            font-size: 0.9rem;
+            color: var(--text-primary);
+            /* key bits */
+            overflow: auto;          /* show scrollbars when needed */
+            max-width: 100%;         /* respects whatever layout width you give it */
+        }
+        .cell-stdout .stdout-text {
+            margin: 0;               /* reset pre default margin */
+            white-space: pre;        /* keep line breaks, NO wrapping */
+            display: inline-block;   /* shrink-to-content */
+            min-width: max-content;  /* allow very long lines to define intrinsic width */
+            font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace;
+            tab-size: 2;
+        }
         .cell-stderr {
             background: var(--bg-error);
             border-left: 2px solid var(--border-error);
                 if(output){
                     output.classList.remove('output-stale');
                     let html='';
+                    if (data.stdout) {
+                    html += '<div class="cell-stdout"><pre class="stdout-text">'
+                        + escapeHtml(data.stdout)
+                        + '</pre></div>';
+                    }
                     console.log('UV Logs:', data);
                     if(data.stderr) {
                         // Split UV logs from regular stderr
             }
         }
+        // // Live reload functionality (robust SSE handling)
+        // (function(){
+        //     if (!('EventSource' in window)) {
+        //         console.warn('SSE not supported in this browser');
+        //         return;
+        //     }
+        //     let source = new EventSource('/events');
+        //     let isOpen = false;
+        //     source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
+        //     source.onmessage = function(e){
+        //         const msg=(e.data||'').trim(); if(!msg) return;
+        //         console.log('SSE message:', msg);
+        //         if (msg==='reload' || msg==='incremental') { location.reload(); }
+        //         // Ignore 'loading' to avoid premature reload loops
+        //     };
+        //     source.onerror = function(e){
+        //         // Let EventSource auto-reconnect instead of forcing a reload
+        //         if (isOpen) console.warn('SSE error after open, retrying...', e);
+        //     };
+        //     window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
+        // })();
         document.addEventListener('DOMContentLoaded', function() {
     <div class="system-info">
         <div class="system-info-header">Generated on:</div>
         <div class="system-info-content">
+            Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
         </div>
     </div>
         <h1>Torch Compile Variants!</h1>
 <p>This file benchmarks Flash Attention with different torch.compile modes.</p>
 <h2>Flash Attention with torch.compile(mode="default")</h2>
+<div class="cell" id="cell-benchmark_default">
 <div class="cell-header">
 <span class="collapse-indicators">
 <span onclick="toggleCode('benchmark_default')" style="cursor: pointer;">▼ code</span>
 <span onclick="toggleOutput('benchmark_default')" style="cursor: pointer;">▼ output</span>
+ <span id="uv-indicator-benchmark_default" onclick="toggleUvLogsFromHeader('benchmark_default')" style="cursor: pointer;">▶ uv-logs</span>
 </span> |
+Cell: benchmark_default | 12.08s
  | <button class="run-btn" onclick="runCell('benchmark_default')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark_default')">Copy</button>
 <a href="cells/benchmark_default.py" target="_blank" class="raw-btn">Raw</a>
 <span class="c1"># requires-python = &quot;&gt;=3.10&quot;</span>
 <span class="c1"># dependencies = [</span>
 <span class="c1">#     &quot;numpy&quot;,</span>
+<span class="c1">#     &quot;torch==2.8.0&quot;,</span>
 <span class="c1">#     &quot;kernels-benchmark-tools&quot;,</span>
 <span class="c1"># ]</span>
 <span class="c1">#</span>
 <span class="c1"># [tool.uv.sources]</span>
+<span class="c1"># kernels-benchmark-tools = { path = &quot;../../../../../tools&quot;, editable = true }</span>
 <span class="c1"># ///</span>
 <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
 <span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
 </div>
 </div>
 <div id="output-benchmark_default" class="cell-output">
+<div class="cell-stdout"><pre class="stdout-text">
+======================================================================
+PROFILE TRACE: torch_flash_compiled_default | flux_L128
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                           torch_flash_compiled_default         0.00%       0.000us         0.00%       0.000us       0.000us     967.332us       298.12%     967.332us     967.332us             1
+                           torch_flash_compiled_default         5.37%     154.798us        99.77%       2.878ms       2.878ms       0.000us         0.00%     324.481us     324.481us             1
+                             Torch-Compiled Region: 0/1        20.96%     604.478us        92.49%       2.668ms     889.236us       0.000us         0.00%     324.481us     108.160us             3
+              aten::_scaled_dot_product_flash_attention         1.54%      44.432us         8.35%     240.853us      80.284us       0.000us         0.00%     276.257us      92.086us             3
+                         aten::_flash_attention_forward         1.64%      47.371us         5.29%     152.657us      50.886us     276.257us        85.14%     276.257us      92.086us             3
+void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us     276.257us        85.14%     276.257us      92.086us             3
+triton_poi_fused__scaled_dot_product_flash_attention...         3.50%     100.807us         6.04%     174.309us      19.368us      36.704us        11.31%      36.704us       4.078us             9
+triton_poi_fused__scaled_dot_product_flash_attention...         0.00%       0.000us         0.00%       0.000us       0.000us      36.704us        11.31%      36.704us       4.078us             9
+                               triton_poi_fused_clone_1         1.27%      36.672us         2.17%      62.583us      20.861us      11.520us         3.55%      11.520us       3.840us             3
+                               triton_poi_fused_clone_1         0.00%       0.000us         0.00%       0.000us       0.000us      11.520us         3.55%      11.520us       3.840us             3
+                               TorchDynamo Cache Lookup         1.91%      55.093us         1.91%      55.093us      18.364us       0.000us         0.00%       0.000us       0.000us             3
+                                      Pregraph bytecode         0.36%      10.400us         0.36%      10.400us       3.467us       0.000us         0.00%       0.000us       0.000us             3
+                 AOTDispatcher Runtime Wrapper Prologue         0.70%      20.280us         0.70%      20.280us       6.760us       0.000us         0.00%       0.000us       0.000us             3
+                                Activity Buffer Request        53.91%       1.555ms        53.91%       1.555ms       1.555ms       0.000us         0.00%       0.000us       0.000us             1
+                                         cuLaunchKernel         3.45%      99.413us         3.45%      99.413us       8.284us       0.000us         0.00%       0.000us       0.000us            12
+                                        aten::transpose         1.19%      34.395us         1.52%      43.764us       3.647us       0.000us         0.00%       0.000us       0.000us            12
+                                       aten::as_strided         0.32%       9.369us         0.32%       9.369us       0.781us       0.000us         0.00%       0.000us       0.000us            12
+                                       aten::empty_like         0.44%      12.621us         1.20%      34.732us      11.577us       0.000us         0.00%       0.000us       0.000us             3
+                                    aten::empty_strided         0.77%      22.111us         0.77%      22.111us       7.370us       0.000us         0.00%       0.000us       0.000us             3
+                                            aten::empty         1.24%      35.841us         1.24%      35.841us       2.987us       0.000us         0.00%       0.000us       0.000us            12
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+Self CPU time total: 2.884ms
+Self CUDA time total: 324.481us
+======================================================================
+PROFILE TRACE: torch_flash_compiled_default | flux_L256
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                           torch_flash_compiled_default         0.00%       0.000us         0.00%       0.000us       0.000us     834.378us       233.60%     834.378us     834.378us             1
+                           torch_flash_compiled_default         4.04%      97.294us        99.68%       2.400ms       2.400ms       0.000us         0.00%     357.190us     357.190us             1
+                             Torch-Compiled Region: 0/3        19.97%     480.803us        94.43%       2.274ms     757.987us       0.000us         0.00%     357.190us     119.063us             3
+              aten::_scaled_dot_product_flash_attention         1.08%      25.983us         7.33%     176.640us      58.880us       0.000us         0.00%     300.165us     100.055us             3
+                         aten::_flash_attention_forward         1.50%      36.164us         5.01%     120.717us      40.239us     300.165us        84.04%     300.165us     100.055us             3
+void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us     300.165us        84.04%     300.165us     100.055us             3
+triton_poi_fused__scaled_dot_product_flash_attention...         3.30%      79.496us         6.27%     150.937us      16.771us      40.161us        11.24%      40.161us       4.462us             9
+triton_poi_fused__scaled_dot_product_flash_attention...         0.00%       0.000us         0.00%       0.000us       0.000us      40.161us        11.24%      40.161us       4.462us             9
+                               triton_poi_fused_clone_1         2.33%      56.123us         3.38%      81.404us      27.135us      16.864us         4.72%      16.864us       5.621us             3
+                               triton_poi_fused_clone_1         0.00%       0.000us         0.00%       0.000us       0.000us      16.864us         4.72%      16.864us       5.621us             3
+                               TorchDynamo Cache Lookup         1.21%      29.133us         1.21%      29.133us       9.711us       0.000us         0.00%       0.000us       0.000us             3
+                                      Pregraph bytecode         0.32%       7.730us         0.32%       7.730us       2.577us       0.000us         0.00%       0.000us       0.000us             3
+                 AOTDispatcher Runtime Wrapper Prologue         0.49%      11.750us         0.49%      11.750us       3.917us       0.000us         0.00%       0.000us       0.000us             3
+                                Activity Buffer Request        56.67%       1.365ms        56.67%       1.365ms       1.365ms       0.000us         0.00%       0.000us       0.000us             1
+                                         cuLaunchKernel         4.02%      96.722us         4.02%      96.722us       8.060us       0.000us         0.00%       0.000us       0.000us            12
+                                        aten::transpose         0.90%      21.580us         1.24%      29.940us       2.495us       0.000us         0.00%       0.000us       0.000us            12
+                                       aten::as_strided         0.35%       8.360us         0.35%       8.360us       0.697us       0.000us         0.00%       0.000us       0.000us            12
+                                       aten::empty_like         0.27%       6.480us         1.00%      23.971us       7.990us       0.000us         0.00%       0.000us       0.000us             3
+                                    aten::empty_strided         0.73%      17.491us         0.73%      17.491us       5.830us       0.000us         0.00%       0.000us       0.000us             3
+                                            aten::empty         1.24%      29.800us         1.24%      29.800us       2.483us       0.000us         0.00%       0.000us       0.000us            12
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+Self CPU time total: 2.408ms
+Self CUDA time total: 357.190us
+======================================================================
+PROFILE TRACE: torch_flash_compiled_default | flux_L320
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                           torch_flash_compiled_default         0.00%       0.000us         0.00%       0.000us       0.000us     876.295us       230.02%     876.295us     876.295us             1
+                           torch_flash_compiled_default         3.99%      99.235us        99.67%       2.477ms       2.477ms       0.000us         0.00%     380.963us     380.963us             1
+                             Torch-Compiled Region: 0/5        19.71%     489.623us        94.50%       2.348ms     782.708us       0.000us         0.00%     380.963us     126.988us             3
+              aten::_scaled_dot_product_flash_attention         1.15%      28.583us         7.58%     188.458us      62.819us       0.000us         0.00%     323.107us     107.702us             3
+                         aten::_flash_attention_forward         1.61%      40.110us         5.06%     125.615us      41.872us     323.107us        84.81%     323.107us     107.702us             3
+void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us     323.107us        84.81%     323.107us     107.702us             3
+triton_poi_fused__scaled_dot_product_flash_attention...         3.47%      86.344us         6.19%     153.807us      17.090us      44.448us        11.67%      44.448us       4.939us             9
+triton_poi_fused__scaled_dot_product_flash_attention...         0.00%       0.000us         0.00%       0.000us       0.000us      44.448us        11.67%      44.448us       4.939us             9
+                               triton_poi_fused_clone_1         1.44%      35.902us         2.40%      59.634us      19.878us      13.408us         3.52%      13.408us       4.469us             3
+                               triton_poi_fused_clone_1         0.00%       0.000us         0.00%       0.000us       0.000us      13.408us         3.52%      13.408us       4.469us             3
+                               TorchDynamo Cache Lookup         1.18%      29.223us         1.18%      29.223us       9.741us       0.000us         0.00%       0.000us       0.000us             3
+                                      Pregraph bytecode         0.30%       7.450us         0.30%       7.450us       2.483us       0.000us         0.00%       0.000us       0.000us             3
+                 AOTDispatcher Runtime Wrapper Prologue         0.46%      11.502us         0.46%      11.502us       3.834us       0.000us         0.00%       0.000us       0.000us             3
+                                Activity Buffer Request        57.86%       1.438ms        57.86%       1.438ms       1.438ms       0.000us         0.00%       0.000us       0.000us             1
+                                         cuLaunchKernel         3.67%      91.195us         3.67%      91.195us       7.600us       0.000us         0.00%       0.000us       0.000us            12
+                                        aten::transpose         0.95%      23.681us         1.38%      34.260us       2.855us       0.000us         0.00%       0.000us       0.000us            12
+                                       aten::as_strided         0.43%      10.579us         0.43%      10.579us       0.882us       0.000us         0.00%       0.000us       0.000us            12
+                                       aten::empty_like         0.27%       6.811us         0.93%      23.051us       7.684us       0.000us         0.00%       0.000us       0.000us             3
+                                    aten::empty_strided         0.65%      16.240us         0.65%      16.240us       5.413us       0.000us         0.00%       0.000us       0.000us             3
+                                            aten::empty         1.30%      32.232us         1.30%      32.232us       2.686us       0.000us         0.00%       0.000us       0.000us            12
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+Self CPU time total: 2.485ms
+Self CUDA time total: 380.963us
+======================================================================
+PROFILE TRACE: torch_flash_compiled_default | flux_L384
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                           torch_flash_compiled_default         0.00%       0.000us         0.00%       0.000us       0.000us     900.385us       224.95%     900.385us     900.385us             1
+                           torch_flash_compiled_default         3.56%     101.756us        99.74%       2.848ms       2.848ms       0.000us         0.00%     400.258us     400.258us             1
+                             Torch-Compiled Region: 0/7        18.27%     521.655us        95.19%       2.718ms     906.103us       0.000us         0.00%     400.258us     133.419us             3
+              aten::_scaled_dot_product_flash_attention         0.99%      28.253us         6.33%     180.729us      60.243us       0.000us         0.00%     336.352us     112.117us             3
+                         aten::_flash_attention_forward         1.29%      36.890us         4.19%     119.565us      39.855us     336.352us        84.03%     336.352us     112.117us             3
+void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us     336.352us        84.03%     336.352us     112.117us             3
+triton_poi_fused__scaled_dot_product_flash_attention...         3.07%      87.777us        16.12%     460.302us      51.145us      49.985us        12.49%      49.985us       5.554us             9
+triton_poi_fused__scaled_dot_product_flash_attention...         0.00%       0.000us         0.00%       0.000us       0.000us      49.985us        12.49%      49.985us       5.554us             9
+                               triton_poi_fused_clone_1         1.24%      35.330us         2.05%      58.492us      19.497us      13.921us         3.48%      13.921us       4.640us             3
+                               triton_poi_fused_clone_1         0.00%       0.000us         0.00%       0.000us       0.000us      13.921us         3.48%      13.921us       4.640us             3
+                               TorchDynamo Cache Lookup         0.99%      28.213us         0.99%      28.213us       9.404us       0.000us         0.00%       0.000us       0.000us             3
+                                      Pregraph bytecode         0.25%       7.170us         0.25%       7.170us       2.390us       0.000us         0.00%       0.000us       0.000us             3
+                 AOTDispatcher Runtime Wrapper Prologue         0.43%      12.361us         0.43%      12.361us       4.120us       0.000us         0.00%       0.000us       0.000us             3
+                                Activity Buffer Request        51.74%       1.478ms        51.74%       1.478ms       1.478ms       0.000us         0.00%       0.000us       0.000us             1
+                                         cuLaunchKernel        13.86%     395.687us        13.86%     395.687us      32.974us       0.000us         0.00%       0.000us       0.000us            12
+                                        aten::transpose         0.83%      23.691us         1.15%      32.911us       2.743us       0.000us         0.00%       0.000us       0.000us            12
+                                       aten::as_strided         0.32%       9.220us         0.32%       9.220us       0.768us       0.000us         0.00%       0.000us       0.000us            12
+                                       aten::empty_like         0.23%       6.600us         0.78%      22.311us       7.437us       0.000us         0.00%       0.000us       0.000us             3
+                                    aten::empty_strided         0.55%      15.711us         0.55%      15.711us       5.237us       0.000us         0.00%       0.000us       0.000us             3
+                                            aten::empty         1.03%      29.502us         1.03%      29.502us       2.459us       0.000us         0.00%       0.000us       0.000us            12
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+Self CPU time total: 2.856ms
+Self CUDA time total: 400.258us
+impl                     wl                  p50(ms)  ok
+torch_flash_compiled_default flux_L128              0.20  True
+torch_flash_compiled_default flux_L256              0.23  True
+torch_flash_compiled_default flux_L320              0.24  True
+torch_flash_compiled_default flux_L384              0.24  True
+torch_flash_compiled_default flux_L448             FAIL  False
+  Error: recompile_limit reached with one_graph=True. Excessive recompilations can degrade performance due to the compilation overhead of each recompilation. To monitor recompilations, enable TORCH_LOGS=recompiles. If recompilations are expected, consider increasing torch._dynamo.config.cache_size_limit to an appropriate value.
+torch_flash_compiled_default flux_L512             FAIL  False
+  Error: recompile_limit reached with one_graph=True. Excessive recompilations can degrade performance due to the compilation overhead of each recompilation. To monitor recompilations, enable TORCH_LOGS=recompiles. If recompilations are expected, consider increasing torch._dynamo.config.cache_size_limit to an appropriate value.
+</pre></div>
+<div class="uv-install-logs" id="uv-logs-benchmark_default">
+<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
+<div class="uv-logs-content" style="display: none;">
+Installed 37 packages in 247ms
+</div>
+</div>
+<div class="cell-stderr">W1023 17:21:27.942000 6833 torch/_dynamo/convert_frame.py:1016] [0/8] torch._dynamo hit config.recompile_limit (8)
+W1023 17:21:27.942000 6833 torch/_dynamo/convert_frame.py:1016] [0/8]    function: &#x27;torch_flash_base&#x27; (/__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cells/benchmark_default.py:18)
+W1023 17:21:27.942000 6833 torch/_dynamo/convert_frame.py:1016] [0/8]    last reason: 0/7: GLOBAL_STATE changed: num_threads
+W1023 17:21:27.942000 6833 torch/_dynamo/convert_frame.py:1016] [0/8] To log all recompilation reasons, use TORCH_LOGS=&quot;recompiles&quot;.
+W1023 17:21:27.942000 6833 torch/_dynamo/convert_frame.py:1016] [0/8] To diagnose recompilation issues, see https://pytorch.org/docs/main/torch.compiler_troubleshooting.html.
+W1023 17:21:27.948000 6833 torch/_dynamo/convert_frame.py:1016] [0/9] torch._dynamo hit config.recompile_limit (8)
+W1023 17:21:27.948000 6833 torch/_dynamo/convert_frame.py:1016] [0/9]    function: &#x27;torch_flash_base&#x27; (/__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cells/benchmark_default.py:18)
+W1023 17:21:27.948000 6833 torch/_dynamo/convert_frame.py:1016] [0/9]    last reason: 0/7: GLOBAL_STATE changed: num_threads
+W1023 17:21:27.948000 6833 torch/_dynamo/convert_frame.py:1016] [0/9] To log all recompilation reasons, use TORCH_LOGS=&quot;recompiles&quot;.
+W1023 17:21:27.948000 6833 torch/_dynamo/convert_frame.py:1016] [0/9] To diagnose recompilation issues, see https://pytorch.org/docs/main/torch.compiler_troubleshooting.html.</div>
+<div class="cell-artifacts">
+<h4>Artifacts:</h4>
+<a href="artifacts/benchmark_default/attn_default.jsonl" class="artifact" target="_blank">attn_default.jsonl</a>
 </div>
 </div>
 </div>
 <h2>Flash Attention with torch.compile(mode="max-autotune")</h2>
+<div class="cell" id="cell-benchmark_max_autotune">
+<div class="cell-header">
+<span class="collapse-indicators">
+<span onclick="toggleCode('benchmark_max_autotune')" style="cursor: pointer;">▼ code</span>
+<span onclick="toggleOutput('benchmark_max_autotune')" style="cursor: pointer;">▼ output</span>
+ <span id="uv-indicator-benchmark_max_autotune" onclick="toggleUvLogsFromHeader('benchmark_max_autotune')" style="cursor: pointer;">▶ uv-logs</span>
+</span> |
+Cell: benchmark_max_autotune | 18.98s
+ | <button class="run-btn" onclick="runCell('benchmark_max_autotune')">▶ run</button>
+<button class="copy-btn" onclick="copyCell('benchmark_max_autotune')">Copy</button>
+<a href="cells/benchmark_max_autotune.py" target="_blank" class="raw-btn">Raw</a>
+<a href="https://github.com/huggingface/kernels-uvnotes/blob/main/flash_attn/impls/compiled_variants.md" target="_blank" class="github-btn">GitHub</a>
+</div>
+<div id="code-benchmark_max_autotune" class="cell-code" data-lines="70">
+<div class="code-wrap">
+<div class="highlight"><pre><span></span><span class="c1"># /// script</span>
+<span class="c1"># requires-python = &quot;&gt;=3.10&quot;</span>
+<span class="c1"># dependencies = [</span>
+<span class="c1">#     &quot;numpy&quot;,</span>
+<span class="c1">#     &quot;torch==2.8.0&quot;,</span>
+<span class="c1">#     &quot;kernels-benchmark-tools&quot;,</span>
+<span class="c1"># ]</span>
+<span class="c1">#</span>
+<span class="c1"># [tool.uv.sources]</span>
+<span class="c1"># kernels-benchmark-tools = { path = &quot;../../../../../tools&quot;, editable = true }</span>
+<span class="c1"># ///</span>
+<span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
+<span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
+<span class="kn">import</span><span class="w"> </span><span class="nn">os</span>
+<span class="kn">import</span><span class="w"> </span><span class="nn">kernels_benchmark_tools</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="nn">kbt</span>
+<span class="k">def</span><span class="w"> </span><span class="nf">torch_flash_base</span><span class="p">(</span><span class="n">q</span><span class="p">,</span> <span class="n">k</span><span class="p">,</span> <span class="n">v</span><span class="p">):</span>
+    <span class="n">qt</span><span class="p">,</span> <span class="n">kt</span><span class="p">,</span> <span class="n">vt</span> <span class="o">=</span> <span class="p">(</span><span class="n">x</span><span class="o">.</span><span class="n">transpose</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">)</span><span class="o">.</span><span class="n">contiguous</span><span class="p">()</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="p">(</span><span class="n">q</span><span class="p">,</span> <span class="n">k</span><span class="p">,</span> <span class="n">v</span><span class="p">))</span>
+    <span class="k">with</span> <span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">attention</span><span class="o">.</span><span class="n">sdpa_kernel</span><span class="p">(</span><span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">attention</span><span class="o">.</span><span class="n">SDPBackend</span><span class="o">.</span><span class="n">FLASH_ATTENTION</span><span class="p">):</span>
+        <span class="n">o</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">functional</span><span class="o">.</span><span class="n">scaled_dot_product_attention</span><span class="p">(</span><span class="n">qt</span><span class="p">,</span> <span class="n">kt</span><span class="p">,</span> <span class="n">vt</span><span class="p">)</span>
+    <span class="k">return</span> <span class="n">o</span><span class="o">.</span><span class="n">transpose</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">)</span><span class="o">.</span><span class="n">contiguous</span><span class="p">()</span>
+<span class="c1"># Compile with max-autotune mode</span>
+<span class="n">compiled_flash_max_autotune</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">compile</span><span class="p">(</span><span class="n">torch_flash_base</span><span class="p">,</span> <span class="n">mode</span><span class="o">=</span><span class="s2">&quot;max-autotune&quot;</span><span class="p">,</span> <span class="n">fullgraph</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">dynamic</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span>
+<span class="n">kbt</span><span class="o">.</span><span class="n">add</span><span class="p">(</span>
+    <span class="s2">&quot;torch_flash_compiled_max_autotune&quot;</span><span class="p">,</span>
+    <span class="n">compiled_flash_max_autotune</span><span class="p">,</span>
+    <span class="n">tags</span><span class="o">=</span><span class="p">{</span><span class="s2">&quot;family&quot;</span><span class="p">:</span> <span class="s2">&quot;torch-sdpa&quot;</span><span class="p">,</span> <span class="s2">&quot;backend&quot;</span><span class="p">:</span> <span class="s2">&quot;FLASH&quot;</span><span class="p">,</span> <span class="s2">&quot;compile&quot;</span><span class="p">:</span> <span class="s2">&quot;max-autotune&quot;</span><span class="p">},</span>
+<span class="p">)</span>
+<span class="k">if</span> <span class="vm">__name__</span> <span class="o">==</span> <span class="s2">&quot;__main__&quot;</span><span class="p">:</span>
+    <span class="n">device</span> <span class="o">=</span> <span class="s2">&quot;cuda&quot;</span> <span class="k">if</span> <span class="n">torch</span><span class="o">.</span><span class="n">cuda</span><span class="o">.</span><span class="n">is_available</span><span class="p">()</span> <span class="k">else</span> <span class="s2">&quot;cpu&quot;</span>
+    <span class="n">dtype</span> <span class="o">=</span> <span class="s2">&quot;float32&quot;</span> <span class="k">if</span> <span class="n">device</span> <span class="o">==</span> <span class="s2">&quot;cpu&quot;</span> <span class="k">else</span> <span class="s2">&quot;bfloat16&quot;</span>
+    <span class="c1"># Flux-like workloads</span>
+    <span class="n">base</span> <span class="o">=</span> <span class="mi">1024</span> <span class="k">if</span> <span class="n">device</span> <span class="o">==</span> <span class="s2">&quot;cuda&quot;</span> <span class="k">else</span> <span class="mi">512</span>
+    <span class="n">flux_sizes</span> <span class="o">=</span> <span class="p">(</span>
+        <span class="p">[</span><span class="mi">128</span><span class="p">,</span> <span class="mi">256</span><span class="p">,</span> <span class="mi">320</span><span class="p">,</span> <span class="mi">384</span><span class="p">,</span> <span class="mi">448</span><span class="p">,</span> <span class="mi">512</span><span class="p">]</span> <span class="k">if</span> <span class="n">device</span> <span class="o">==</span> <span class="s2">&quot;cuda&quot;</span> <span class="k">else</span> <span class="p">[</span><span class="mi">64</span><span class="p">,</span> <span class="mi">128</span><span class="p">,</span> <span class="mi">192</span><span class="p">,</span> <span class="mi">256</span><span class="p">]</span>
+    <span class="p">)</span>
+    <span class="n">heads</span> <span class="o">=</span> <span class="mi">24</span> <span class="k">if</span> <span class="n">device</span> <span class="o">==</span> <span class="s2">&quot;cuda&quot;</span> <span class="k">else</span> <span class="mi">8</span>
+    <span class="n">head_dim</span> <span class="o">=</span> <span class="mi">128</span> <span class="k">if</span> <span class="n">device</span> <span class="o">==</span> <span class="s2">&quot;cuda&quot;</span> <span class="k">else</span> <span class="mi">64</span>
+    <span class="n">wl</span> <span class="o">=</span> <span class="p">[]</span>
+    <span class="k">for</span> <span class="n">L</span> <span class="ow">in</span> <span class="n">flux_sizes</span><span class="p">:</span>
+        <span class="n">wl</span><span class="o">.</span><span class="n">append</span><span class="p">(</span>
+            <span class="p">{</span>
+                <span class="s2">&quot;name&quot;</span><span class="p">:</span> <span class="sa">f</span><span class="s2">&quot;flux_L</span><span class="si">{</span><span class="n">L</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">,</span>
+                <span class="s2">&quot;batch&quot;</span><span class="p">:</span> <span class="mi">1</span><span class="p">,</span>
+                <span class="s2">&quot;seq_len&quot;</span><span class="p">:</span> <span class="n">base</span> <span class="o">+</span> <span class="n">L</span><span class="p">,</span>
+                <span class="s2">&quot;heads&quot;</span><span class="p">:</span> <span class="n">heads</span><span class="p">,</span>
+                <span class="s2">&quot;head_dim&quot;</span><span class="p">:</span> <span class="n">head_dim</span><span class="p">,</span>
+                <span class="s2">&quot;dtype&quot;</span><span class="p">:</span> <span class="n">dtype</span><span class="p">,</span>
+                <span class="s2">&quot;device&quot;</span><span class="p">:</span> <span class="n">device</span><span class="p">,</span>
+                <span class="s2">&quot;seed&quot;</span><span class="p">:</span> <span class="mi">0</span><span class="p">,</span>
+            <span class="p">}</span>
+        <span class="p">)</span>
+    <span class="n">kbt</span><span class="o">.</span><span class="n">run</span><span class="p">(</span>
+        <span class="n">wl</span><span class="p">,</span>
+        <span class="n">jsonl</span><span class="o">=</span><span class="s2">&quot;attn_max_autotune.jsonl&quot;</span><span class="p">,</span>
+        <span class="n">reps</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span>
+        <span class="n">warmup</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span>
+        <span class="n">gen</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">gen_qkv</span><span class="p">,</span>
+        <span class="n">ref</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">ref_math</span><span class="p">,</span>
+        <span class="n">cmp</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">cmp_allclose</span><span class="p">,</span>
+    <span class="p">)</span>
+    <span class="n">kbt</span><span class="o">.</span><span class="n">summarize</span><span class="p">([</span><span class="s2">&quot;attn_max_autotune.jsonl&quot;</span><span class="p">])</span>
+</pre></div>
+<div class="code-line-highlight" id="line-highlight-benchmark_max_autotune"></div>
+</div>
+</div>
+<div id="output-benchmark_max_autotune" class="cell-output">
+<div class="cell-stdout"><pre class="stdout-text">impl                     wl                  p50(ms)  ok
+torch_flash_compiled_max_autotune flux_L128              0.19  True
+torch_flash_compiled_max_autotune flux_L256              0.20  True
+torch_flash_compiled_max_autotune flux_L320              0.21  True
+torch_flash_compiled_max_autotune flux_L384              0.21  True
+torch_flash_compiled_max_autotune flux_L448             FAIL  False
+  Error: recompile_limit reached with one_graph=True. Excessive recompilations can degrade performance due to the compilation overhead of each recompilation. To monitor recompilations, enable TORCH_LOGS=recompiles. If recompilations are expected, consider increasing torch._dynamo.config.cache_size_limit to an appropriate value.
+torch_flash_compiled_max_autotune flux_L512             FAIL  False
+  Error: recompile_limit reached with one_graph=True. Excessive recompilations can degrade performance due to the compilation overhead of each recompilation. To monitor recompilations, enable TORCH_LOGS=recompiles. If recompilations are expected, consider increasing torch._dynamo.config.cache_size_limit to an appropriate value.
+</pre></div>
+<div class="uv-install-logs" id="uv-logs-benchmark_max_autotune">
+<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
+<div class="uv-logs-content" style="display: none;">
+Installed 37 packages in 208ms
+</div>
+</div>
+<div class="cell-stderr">W1023 17:21:15.860000 6116 torch/_dynamo/convert_frame.py:1016] [0/8] torch._dynamo hit config.recompile_limit (8)
+W1023 17:21:15.860000 6116 torch/_dynamo/convert_frame.py:1016] [0/8]    function: &#x27;torch_flash_base&#x27; (/__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cells/benchmark_max_autotune.py:18)
+W1023 17:21:15.860000 6116 torch/_dynamo/convert_frame.py:1016] [0/8]    last reason: 0/7: GLOBAL_STATE changed: num_threads
+W1023 17:21:15.860000 6116 torch/_dynamo/convert_frame.py:1016] [0/8] To log all recompilation reasons, use TORCH_LOGS=&quot;recompiles&quot;.
+W1023 17:21:15.860000 6116 torch/_dynamo/convert_frame.py:1016] [0/8] To diagnose recompilation issues, see https://pytorch.org/docs/main/torch.compiler_troubleshooting.html.
+W1023 17:21:15.866000 6116 torch/_dynamo/convert_frame.py:1016] [0/9] torch._dynamo hit config.recompile_limit (8)
+W1023 17:21:15.866000 6116 torch/_dynamo/convert_frame.py:1016] [0/9]    function: &#x27;torch_flash_base&#x27; (/__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cells/benchmark_max_autotune.py:18)
+W1023 17:21:15.866000 6116 torch/_dynamo/convert_frame.py:1016] [0/9]    last reason: 0/7: GLOBAL_STATE changed: num_threads
+W1023 17:21:15.866000 6116 torch/_dynamo/convert_frame.py:1016] [0/9] To log all recompilation reasons, use TORCH_LOGS=&quot;recompiles&quot;.
+W1023 17:21:15.866000 6116 torch/_dynamo/convert_frame.py:1016] [0/9] To diagnose recompilation issues, see https://pytorch.org/docs/main/torch.compiler_troubleshooting.html.</div>
+<div class="cell-artifacts">
+<h4>Artifacts:</h4>
+<a href="artifacts/benchmark_max_autotune/attn_max_autotune.jsonl" class="artifact" target="_blank">attn_max_autotune.jsonl</a>
+</div>
+</div>
+</div>
     </div>
 </body>

flash_attn/impls/flash_attention.html CHANGED Viewed

@@ -706,6 +706,29 @@
             white-space: pre-wrap;
             color: var(--text-primary);
         }
         .cell-stderr {
             background: var(--bg-error);
             border-left: 2px solid var(--border-error);
@@ -3556,7 +3579,12 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
                 if(output){
                     output.classList.remove('output-stale');
                     let html='';
-                    if(data.stdout) html+='<div class="cell-stdout">'+escapeHtml(data.stdout)+'</div>';
                     console.log('UV Logs:', data);
                     if(data.stderr) {
                         // Split UV logs from regular stderr
@@ -3678,27 +3706,27 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
             }
         }
-        // Live reload functionality (robust SSE handling)
-        (function(){
-            if (!('EventSource' in window)) {
-                console.warn('SSE not supported in this browser');
-                return;
-            }
-            let source = new EventSource('/events');
-            let isOpen = false;
-            source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
-            source.onmessage = function(e){
-                const msg=(e.data||'').trim(); if(!msg) return;
-                console.log('SSE message:', msg);
-                if (msg==='reload' || msg==='incremental') { location.reload(); }
-                // Ignore 'loading' to avoid premature reload loops
-            };
-            source.onerror = function(e){
-                // Let EventSource auto-reconnect instead of forcing a reload
-                if (isOpen) console.warn('SSE error after open, retrying...', e);
-            };
-            window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
-        })();
         document.addEventListener('DOMContentLoaded', function() {
@@ -3829,7 +3857,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
     <div class="system-info">
         <div class="system-info-header">Generated on:</div>
         <div class="system-info-content">
-            Linux x86_64 | Linux-5.10.244-240.965.amzn2.x86_64-x86_64-with-glibc2.35
         </div>
     </div>
@@ -3843,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> |
-Cell: nv | 0.23s
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
 <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3860,7 +3888,7 @@ Cell: nv | 0.23s
 </div>
 </div>
 <div id="output-nv" class="cell-output">
-<div class="cell-stdout">Wed Oct 22 08:58:24 2025
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+
@@ -3869,7 +3897,7 @@ Cell: nv | 0.23s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   26C    P8             22W /  350W |       0MiB /  46068MiB |      0%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
@@ -3881,19 +3909,19 @@ Cell: nv | 0.23s
 |  No running processes found                                                             |
 +-----------------------------------------------------------------------------------------+
-</div>
 </div>
 </div>
 <h2>Flash Attention Benchmark</h2>
-<div class="cell cell-failed" id="cell-benchmark">
 <div class="cell-header">
 <span class="collapse-indicators">
 <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> |
-Cell: benchmark | 0.01s | FAILED
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3905,12 +3933,12 @@ Cell: benchmark | 0.01s | FAILED
 <span class="c1"># requires-python = &quot;&gt;=3.10&quot;</span>
 <span class="c1"># dependencies = [</span>
 <span class="c1">#     &quot;numpy&quot;,</span>
-<span class="c1">#     &quot;torch&quot;,</span>
 <span class="c1">#     &quot;kernels-benchmark-tools&quot;,</span>
 <span class="c1"># ]</span>
 <span class="c1">#</span>
 <span class="c1"># [tool.uv.sources]</span>
-<span class="c1"># kernels-benchmark-tools = { path = &quot;/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools&quot;, editable = true }</span>
 <span class="c1"># ///</span>
 <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
 <span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
@@ -3974,9 +4002,209 @@ Cell: benchmark | 0.01s | FAILED
 </div>
 </div>
 <div id="output-benchmark" class="cell-output">
-<div class="cell-stderr">  × Failed to resolve script requirement
-  ╰─▶ Distribution not found at:
-      file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
 </div>
 </div>
 </div>

             white-space: pre-wrap;
             color: var(--text-primary);
         }
+        .cell-stdout {
+            background: var(--bg-tertiary);
+            padding: 0.75rem;
+            border-radius: 1px;
+            font-family: inherit;
+            font-size: 0.9rem;
+            color: var(--text-primary);
+            /* key bits */
+            overflow: auto;          /* show scrollbars when needed */
+            max-width: 100%;         /* respects whatever layout width you give it */
+        }
+        .cell-stdout .stdout-text {
+            margin: 0;               /* reset pre default margin */
+            white-space: pre;        /* keep line breaks, NO wrapping */
+            display: inline-block;   /* shrink-to-content */
+            min-width: max-content;  /* allow very long lines to define intrinsic width */
+            font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace;
+            tab-size: 2;
+        }
         .cell-stderr {
             background: var(--bg-error);
             border-left: 2px solid var(--border-error);
                 if(output){
                     output.classList.remove('output-stale');
                     let html='';
+                    if (data.stdout) {
+                    html += '<div class="cell-stdout"><pre class="stdout-text">'
+                        + escapeHtml(data.stdout)
+                        + '</pre></div>';
+                    }
                     console.log('UV Logs:', data);
                     if(data.stderr) {
                         // Split UV logs from regular stderr
             }
         }
+        // // Live reload functionality (robust SSE handling)
+        // (function(){
+        //     if (!('EventSource' in window)) {
+        //         console.warn('SSE not supported in this browser');
+        //         return;
+        //     }
+        //     let source = new EventSource('/events');
+        //     let isOpen = false;
+        //     source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
+        //     source.onmessage = function(e){
+        //         const msg=(e.data||'').trim(); if(!msg) return;
+        //         console.log('SSE message:', msg);
+        //         if (msg==='reload' || msg==='incremental') { location.reload(); }
+        //         // Ignore 'loading' to avoid premature reload loops
+        //     };
+        //     source.onerror = function(e){
+        //         // Let EventSource auto-reconnect instead of forcing a reload
+        //         if (isOpen) console.warn('SSE error after open, retrying...', e);
+        //     };
+        //     window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
+        // })();
         document.addEventListener('DOMContentLoaded', function() {
     <div class="system-info">
         <div class="system-info-header">Generated on:</div>
         <div class="system-info-content">
+            Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
         </div>
     </div>
 <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> |
+Cell: nv | 0.21s
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
 <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
 </div>
 </div>
 <div id="output-nv" class="cell-output">
+<div class="cell-stdout"><pre class="stdout-text">Thu Oct 23 17:22:15 2025
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
+| N/A   37C    P0             91W /  350W |       0MiB /  46068MiB |     26%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 |  No running processes found                                                             |
 +-----------------------------------------------------------------------------------------+
+</pre></div>
 </div>
 </div>
 <h2>Flash Attention Benchmark</h2>
+<div class="cell" id="cell-benchmark">
 <div class="cell-header">
 <span class="collapse-indicators">
 <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> |
+Cell: benchmark | 3.60s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 <span class="c1"># requires-python = &quot;&gt;=3.10&quot;</span>
 <span class="c1"># dependencies = [</span>
 <span class="c1">#     &quot;numpy&quot;,</span>
+<span class="c1">#     &quot;torch==2.8.0&quot;,</span>
 <span class="c1">#     &quot;kernels-benchmark-tools&quot;,</span>
 <span class="c1"># ]</span>
 <span class="c1">#</span>
 <span class="c1"># [tool.uv.sources]</span>
+<span class="c1"># kernels-benchmark-tools = { path = &quot;../../../../../tools&quot;, editable = true }</span>
 <span class="c1"># ///</span>
 <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
 <span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
 </div>
 </div>
 <div id="output-benchmark" class="cell-output">
+<div class="cell-stdout"><pre class="stdout-text">
+======================================================================
+PROFILE TRACE: torch_flash_ma | flux_L128
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us     799.070us       225.43%     799.070us     799.070us             1
+                                         torch_flash_ma        14.65%     361.148us        99.74%       2.458ms       2.458ms       0.000us         0.00%     362.241us     362.241us             1
+                     aten::scaled_dot_product_attention         1.75%      43.042us         9.34%     230.141us      76.714us       0.000us         0.00%     266.207us      88.736us             3
+              aten::_scaled_dot_product_flash_attention         1.09%      26.961us         7.59%     187.099us      62.366us       0.000us         0.00%     266.207us      88.736us             3
+                         aten::_flash_attention_forward         1.68%      41.361us         5.54%     136.527us      45.509us     266.207us        75.10%     266.207us      88.736us             3
+void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us     266.207us        75.10%     266.207us      88.736us             3
+                                       aten::contiguous         0.64%      15.860us        72.86%       1.796ms     149.661us       0.000us         0.00%      96.034us       8.003us            12
+                                            aten::clone         1.71%      42.134us        72.21%       1.780ms     148.339us       0.000us         0.00%      96.034us       8.003us            12
+                                            aten::copy_         3.86%      95.153us        66.84%       1.648ms     137.298us      88.258us        24.90%      96.034us       8.003us            12
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      88.258us        24.90%      88.258us       7.355us            12
+                                Activity Buffer Request        58.01%       1.430ms        58.01%       1.430ms       1.430ms       7.776us         2.19%       7.776us       7.776us             1
+                                        aten::transpose         2.95%      72.712us         3.85%      94.884us       3.954us       0.000us         0.00%       0.000us       0.000us            24
+                                       aten::as_strided         0.90%      22.172us         0.90%      22.172us       0.924us       0.000us         0.00%       0.000us       0.000us            24
+                                       aten::empty_like         1.13%      27.832us         4.55%     112.245us       7.483us       0.000us         0.00%       0.000us       0.000us            15
+                                            aten::empty         4.09%     100.886us         4.09%     100.886us       4.204us       0.000us         0.00%       0.000us       0.000us            24
+                                       cudaLaunchKernel         5.96%     146.998us         5.96%     146.998us       9.800us       0.000us         0.00%       0.000us       0.000us            15
+                                    aten::empty_strided         0.65%      15.960us         0.65%      15.960us       5.320us       0.000us         0.00%       0.000us       0.000us             3
+                                 cudaDeviceGetAttribute         0.12%       2.850us         0.12%       2.850us       0.475us       0.000us         0.00%       0.000us       0.000us             6
+                                   cudaFuncSetAttribute         0.54%      13.411us         0.54%      13.411us       4.470us       0.000us         0.00%       0.000us       0.000us             3
+                                  cudaDeviceSynchronize         0.26%       6.530us         0.26%       6.530us       6.530us       0.000us         0.00%       0.000us       0.000us             1
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+Self CPU time total: 2.465ms
+Self CUDA time total: 354.465us
+======================================================================
+PROFILE TRACE: torch_flash_ma | flux_L256
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us     680.541us       161.63%     680.541us     680.541us             1
+                                         torch_flash_ma        11.51%     254.710us        99.74%       2.208ms       2.208ms       0.000us         0.00%     430.783us     430.783us             1
+                     aten::scaled_dot_product_attention         1.09%      24.080us         8.33%     184.408us      61.469us       0.000us         0.00%     312.064us     104.021us             3
+              aten::_scaled_dot_product_flash_attention         0.81%      17.821us         7.24%     160.328us      53.443us       0.000us         0.00%     312.064us     104.021us             3
+                         aten::_flash_attention_forward         1.85%      41.011us         5.37%     118.956us      39.652us     312.064us        74.11%     312.064us     104.021us             3
+void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us     312.064us        74.11%     312.064us     104.021us             3
+                                       aten::contiguous         0.42%       9.258us        77.80%       1.722ms     143.509us       0.000us         0.00%     118.719us       9.893us            12
+                                            aten::clone         1.32%      29.284us        77.38%       1.713ms     142.737us       0.000us         0.00%     118.719us       9.893us            12
+                                            aten::copy_         3.64%      80.568us        73.02%       1.616ms     134.703us     108.991us        25.89%     118.719us       9.893us            12
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     108.991us        25.89%     108.991us       9.083us            12
+                                Activity Buffer Request        65.56%       1.451ms        65.56%       1.451ms       1.451ms       9.728us         2.31%       9.728us       9.728us             1
+                                        aten::transpose         2.36%      52.224us         3.17%      70.126us       2.922us       0.000us         0.00%       0.000us       0.000us            24
+                                       aten::as_strided         0.81%      17.902us         0.81%      17.902us       0.746us       0.000us         0.00%       0.000us       0.000us            24
+                                       aten::empty_like         0.96%      21.191us         3.98%      88.123us       5.875us       0.000us         0.00%       0.000us       0.000us            15
+                                            aten::empty         3.58%      79.273us         3.58%      79.273us       3.303us       0.000us         0.00%       0.000us       0.000us            24
+                                       cudaLaunchKernel         4.85%     107.363us         4.85%     107.363us       7.158us       0.000us         0.00%       0.000us       0.000us            15
+                                    aten::empty_strided         0.70%      15.410us         0.70%      15.410us       5.137us       0.000us         0.00%       0.000us       0.000us             3
+                                 cudaDeviceGetAttribute         0.09%       2.071us         0.09%       2.071us       0.345us       0.000us         0.00%       0.000us       0.000us             6
+                                   cudaFuncSetAttribute         0.20%       4.321us         0.20%       4.321us       1.440us       0.000us         0.00%       0.000us       0.000us             3
+                                  cudaDeviceSynchronize         0.26%       5.841us         0.26%       5.841us       5.841us       0.000us         0.00%       0.000us       0.000us             1
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+Self CPU time total: 2.214ms
+Self CUDA time total: 421.055us
+======================================================================
+PROFILE TRACE: torch_flash_ma | flux_L320
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us     690.203us       159.06%     690.203us     690.203us             1
+                                         torch_flash_ma        11.42%     254.276us        99.18%       2.209ms       2.209ms       0.000us         0.00%     443.100us     443.100us             1
+                     aten::scaled_dot_product_attention         1.09%      24.201us         8.13%     181.079us      60.360us       0.000us         0.00%     330.557us     110.186us             3
+              aten::_scaled_dot_product_flash_attention         0.78%      17.350us         7.04%     156.878us      52.293us       0.000us         0.00%     330.557us     110.186us             3
+                         aten::_flash_attention_forward         1.80%      40.093us         5.30%     118.035us      39.345us     330.557us        76.18%     330.557us     110.186us             3
+void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us     330.557us        76.18%     330.557us     110.186us             3
+                                       aten::contiguous         0.42%       9.369us        77.58%       1.728ms     143.991us       0.000us         0.00%     112.543us       9.379us            12
+                                            aten::clone         1.34%      29.740us        77.16%       1.719ms     143.210us       0.000us         0.00%     112.543us       9.379us            12
+                                            aten::copy_         3.81%      84.905us        72.90%       1.624ms     135.305us     103.359us        23.82%     112.543us       9.379us            12
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     103.359us        23.82%     103.359us       8.613us            12
+                                Activity Buffer Request        65.38%       1.456ms        65.38%       1.456ms       1.456ms       9.184us         2.12%       9.184us       9.184us             1
+                                        aten::transpose         2.26%      50.400us         3.02%      67.214us       2.801us       0.000us         0.00%       0.000us       0.000us            24
+                                       aten::as_strided         0.75%      16.814us         0.75%      16.814us       0.701us       0.000us         0.00%       0.000us       0.000us            24
+                                       aten::empty_like         0.96%      21.489us         3.82%      85.044us       5.670us       0.000us         0.00%       0.000us       0.000us            15
+                                            aten::empty         3.43%      76.464us         3.43%      76.464us       3.186us       0.000us         0.00%       0.000us       0.000us            24
+                                       cudaLaunchKernel         4.82%     107.405us         4.82%     107.405us       7.160us       0.000us         0.00%       0.000us       0.000us            15
+                                    aten::empty_strided         0.66%      14.631us         0.66%      14.631us       4.877us       0.000us         0.00%       0.000us       0.000us             3
+                                 cudaDeviceGetAttribute         0.08%       1.710us         0.08%       1.710us       0.285us       0.000us         0.00%       0.000us       0.000us             6
+                                   cudaFuncSetAttribute         0.18%       3.930us         0.18%       3.930us       1.310us       0.000us         0.00%       0.000us       0.000us             3
+                                  cudaDeviceSynchronize         0.82%      18.331us         0.82%      18.331us      18.331us       0.000us         0.00%       0.000us       0.000us             1
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+Self CPU time total: 2.227ms
+Self CUDA time total: 433.916us
+======================================================================
+PROFILE TRACE: torch_flash_ma | flux_L384
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us     691.645us       147.68%     691.645us     691.645us             1
+                                         torch_flash_ma        10.40%     252.243us        99.18%       2.405ms       2.405ms       0.000us         0.00%     481.117us     481.117us             1
+                     aten::scaled_dot_product_attention         1.00%      24.352us         7.27%     176.289us      58.763us       0.000us         0.00%     341.277us     113.759us             3
+              aten::_scaled_dot_product_flash_attention         0.73%      17.811us         6.27%     151.937us      50.646us       0.000us         0.00%     341.277us     113.759us             3
+                         aten::_flash_attention_forward         1.38%      33.540us         4.54%     110.186us      36.729us     341.277us        72.87%     341.277us     113.759us             3
+void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us     341.277us        72.87%     341.277us     113.759us             3
+                                       aten::contiguous         0.39%       9.522us        79.59%       1.930ms     160.818us       0.000us         0.00%     139.840us      11.653us            12
+                                            aten::clone         1.25%      30.240us        79.20%       1.920ms     160.024us       0.000us         0.00%     139.840us      11.653us            12
+                                            aten::copy_         3.35%      81.274us        75.28%       1.825ms     152.111us     127.072us        27.13%     139.840us      11.653us            12
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     127.072us        27.13%     127.072us      10.589us            12
+                                Activity Buffer Request        59.91%       1.453ms        59.91%       1.453ms       1.453ms      12.768us         2.73%      12.768us      12.768us             1
+                                        aten::transpose         2.18%      52.871us         2.90%      70.271us       2.928us       0.000us         0.00%       0.000us       0.000us            24
+                                       aten::as_strided         0.72%      17.400us         0.72%      17.400us       0.725us       0.000us         0.00%       0.000us       0.000us            24
+                                       aten::empty_like         0.83%      20.083us         3.47%      84.148us       5.610us       0.000us         0.00%       0.000us       0.000us            15
+                                            aten::empty         3.18%      77.125us         3.18%      77.125us       3.214us       0.000us         0.00%       0.000us       0.000us            24
+                                       cudaLaunchKernel        13.00%     315.205us        13.00%     315.205us      21.014us       0.000us         0.00%       0.000us       0.000us            15
+                                    aten::empty_strided         0.61%      14.781us         0.61%      14.781us       4.927us       0.000us         0.00%       0.000us       0.000us             3
+                                 cudaDeviceGetAttribute         0.07%       1.670us         0.07%       1.670us       0.278us       0.000us         0.00%       0.000us       0.000us             6
+                                   cudaFuncSetAttribute         0.16%       3.970us         0.16%       3.970us       1.323us       0.000us         0.00%       0.000us       0.000us             3
+                                  cudaDeviceSynchronize         0.82%      19.911us         0.82%      19.911us      19.911us       0.000us         0.00%       0.000us       0.000us             1
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+Self CPU time total: 2.425ms
+Self CUDA time total: 468.349us
+======================================================================
+PROFILE TRACE: torch_flash_ma | flux_L448
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us     799.966us       130.76%     799.966us     799.966us             1
+                                         torch_flash_ma        12.25%     304.685us        97.28%       2.419ms       2.419ms       0.000us         0.00%     624.638us     624.638us             1
+                     aten::scaled_dot_product_attention         0.97%      24.122us         7.38%     183.559us      61.186us       0.000us         0.00%     485.886us     161.962us             3
+              aten::_scaled_dot_product_flash_attention         0.71%      17.700us         6.41%     159.437us      53.146us       0.000us         0.00%     485.886us     161.962us             3
+                         aten::_flash_attention_forward         1.59%      39.459us         4.74%     117.796us      39.265us     485.886us        79.42%     485.886us     161.962us             3
+void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us     485.886us        79.42%     485.886us     161.962us             3
+                                       aten::contiguous         0.39%       9.743us        75.79%       1.885ms     157.075us       0.000us         0.00%     138.752us      11.563us            12
+                                            aten::clone         1.21%      30.098us        75.40%       1.875ms     156.263us       0.000us         0.00%     138.752us      11.563us            12
+                                            aten::copy_         3.39%      84.237us        71.41%       1.776ms     147.998us     125.888us        20.58%     138.752us      11.563us            12
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     125.888us        20.58%     125.888us      10.491us            12
+                                Activity Buffer Request        58.51%       1.455ms        58.51%       1.455ms       1.455ms      12.864us         2.10%      12.864us      12.864us             1
+                                        aten::transpose         2.11%      52.456us         2.81%      69.984us       2.916us       0.000us         0.00%       0.000us       0.000us            24
+                                       aten::as_strided         0.70%      17.528us         0.70%      17.528us       0.730us       0.000us         0.00%       0.000us       0.000us            24
+                                       aten::empty_like         0.83%      20.690us         3.57%      88.794us       5.920us       0.000us         0.00%       0.000us       0.000us            15
+                                            aten::empty         3.29%      81.917us         3.29%      81.917us       3.413us       0.000us         0.00%       0.000us       0.000us            24
+                                       cudaLaunchKernel        10.48%     260.751us        10.48%     260.751us      17.383us       0.000us         0.00%       0.000us       0.000us            15
+                                    aten::empty_strided         0.58%      14.540us         0.58%      14.540us       4.847us       0.000us         0.00%       0.000us       0.000us             3
+                                 cudaDeviceGetAttribute         0.09%       2.170us         0.09%       2.170us       0.362us       0.000us         0.00%       0.000us       0.000us             6
+                                   cudaFuncSetAttribute         0.16%       3.911us         0.16%       3.911us       1.304us       0.000us         0.00%       0.000us       0.000us             3
+                                  cudaDeviceSynchronize         2.72%      67.754us         2.72%      67.754us      67.754us       0.000us         0.00%       0.000us       0.000us             1
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+Self CPU time total: 2.487ms
+Self CUDA time total: 611.774us
+======================================================================
+PROFILE TRACE: torch_flash_ma | flux_L512
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us     754.076us       118.52%     754.076us     754.076us             1
+                                         torch_flash_ma        10.33%     251.863us        96.72%       2.358ms       2.358ms       0.000us         0.00%     647.964us     647.964us             1
+                     aten::scaled_dot_product_attention         1.02%      24.850us         7.50%     182.789us      60.930us       0.000us         0.00%     507.517us     169.172us             3
+              aten::_scaled_dot_product_flash_attention         0.72%      17.614us         6.48%     157.939us      52.646us       0.000us         0.00%     507.517us     169.172us             3
+                         aten::_flash_attention_forward         1.67%      40.594us         4.82%     117.465us      39.155us     507.517us        79.77%     507.517us     169.172us             3
+void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us     507.517us        79.77%     507.517us     169.172us             3
+                                       aten::contiguous         0.38%       9.202us        77.00%       1.877ms     156.434us       0.000us         0.00%     140.447us      11.704us            12
+                                            aten::clone         1.22%      29.851us        76.63%       1.868ms     155.667us       0.000us         0.00%     140.447us      11.704us            12
+                                            aten::copy_         3.45%      84.032us        72.63%       1.771ms     147.547us     128.703us        20.23%     140.447us      11.704us            12
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     128.703us        20.23%     128.703us      10.725us            12
+                                Activity Buffer Request        59.63%       1.454ms        59.63%       1.454ms       1.454ms      11.744us         1.85%      11.744us      11.744us             1
+                                        aten::transpose         2.09%      51.002us         2.82%      68.782us       2.866us       0.000us         0.00%       0.000us       0.000us            24
+                                       aten::as_strided         0.73%      17.780us         0.73%      17.780us       0.741us       0.000us         0.00%       0.000us       0.000us            24
+                                       aten::empty_like         0.85%      20.819us         3.58%      87.161us       5.811us       0.000us         0.00%       0.000us       0.000us            15
+                                            aten::empty         3.27%      79.813us         3.27%      79.813us       3.326us       0.000us         0.00%       0.000us       0.000us            24
+                                       cudaLaunchKernel        10.50%     256.026us        10.50%     256.026us      17.068us       0.000us         0.00%       0.000us       0.000us            15
+                                    aten::empty_strided         0.59%      14.340us         0.59%      14.340us       4.780us       0.000us         0.00%       0.000us       0.000us             3
+                                 cudaDeviceGetAttribute         0.08%       1.949us         0.08%       1.949us       0.325us       0.000us         0.00%       0.000us       0.000us             6
+                                   cudaFuncSetAttribute         0.18%       4.440us         0.18%       4.440us       1.480us       0.000us         0.00%       0.000us       0.000us             3
+                                  cudaDeviceSynchronize         3.28%      80.003us         3.28%      80.003us      80.003us       0.000us         0.00%       0.000us       0.000us             1
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+Self CPU time total: 2.438ms
+Self CUDA time total: 636.220us
+impl                     wl                  p50(ms)  ok
+torch_flash_ma           flux_L128              0.18  True
+torch_flash_ma           flux_L256              0.21  True
+torch_flash_ma           flux_L320              0.22  True
+torch_flash_ma           flux_L384              0.22  True
+torch_flash_ma           flux_L448              0.27  True
+torch_flash_ma           flux_L512              0.28  True
+</pre></div>
+<div class="cell-artifacts">
+<h4>Artifacts:</h4>
+<a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>
 </div>
 </div>
 </div>

flash_attn/impls/hf_kernels_flash_attn.html CHANGED Viewed

@@ -706,6 +706,29 @@
             white-space: pre-wrap;
             color: var(--text-primary);
         }
         .cell-stderr {
             background: var(--bg-error);
             border-left: 2px solid var(--border-error);
@@ -3556,7 +3579,12 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
                 if(output){
                     output.classList.remove('output-stale');
                     let html='';
-                    if(data.stdout) html+='<div class="cell-stdout">'+escapeHtml(data.stdout)+'</div>';
                     console.log('UV Logs:', data);
                     if(data.stderr) {
                         // Split UV logs from regular stderr
@@ -3678,27 +3706,27 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
             }
         }
-        // Live reload functionality (robust SSE handling)
-        (function(){
-            if (!('EventSource' in window)) {
-                console.warn('SSE not supported in this browser');
-                return;
-            }
-            let source = new EventSource('/events');
-            let isOpen = false;
-            source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
-            source.onmessage = function(e){
-                const msg=(e.data||'').trim(); if(!msg) return;
-                console.log('SSE message:', msg);
-                if (msg==='reload' || msg==='incremental') { location.reload(); }
-                // Ignore 'loading' to avoid premature reload loops
-            };
-            source.onerror = function(e){
-                // Let EventSource auto-reconnect instead of forcing a reload
-                if (isOpen) console.warn('SSE error after open, retrying...', e);
-            };
-            window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
-        })();
         document.addEventListener('DOMContentLoaded', function() {
@@ -3829,21 +3857,21 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
     <div class="system-info">
         <div class="system-info-header">Generated on:</div>
         <div class="system-info-content">
-            Linux x86_64 | Linux-5.10.244-240.965.amzn2.x86_64-x86_64-with-glibc2.35
         </div>
     </div>
     <div class="main-content">
         <h1>HF Kernels - Flash Attention</h1>
 <h2>HuggingFace Kernels Flash Attention Benchmark</h2>
-<div class="cell cell-failed" id="cell-benchmark">
 <div class="cell-header">
 <span class="collapse-indicators">
 <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> |
-Cell: benchmark | 0.01s | FAILED
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3862,7 +3890,7 @@ Cell: benchmark | 0.01s | FAILED
 <span class="c1"># ]</span>
 <span class="c1">#</span>
 <span class="c1"># [tool.uv.sources]</span>
-<span class="c1"># kernels-benchmark-tools = { path = &quot;/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools&quot;, editable = true }</span>
 <span class="c1"># ///</span>
 <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
 <span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
@@ -3931,9 +3959,166 @@ Cell: benchmark | 0.01s | FAILED
 </div>
 </div>
 <div id="output-benchmark" class="cell-output">
-<div class="cell-stderr">  × Failed to resolve script requirement
-  ╰─▶ Distribution not found at:
-      file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
 </div>
 </div>
 </div>

             white-space: pre-wrap;
             color: var(--text-primary);
         }
+        .cell-stdout {
+            background: var(--bg-tertiary);
+            padding: 0.75rem;
+            border-radius: 1px;
+            font-family: inherit;
+            font-size: 0.9rem;
+            color: var(--text-primary);
+            /* key bits */
+            overflow: auto;          /* show scrollbars when needed */
+            max-width: 100%;         /* respects whatever layout width you give it */
+        }
+        .cell-stdout .stdout-text {
+            margin: 0;               /* reset pre default margin */
+            white-space: pre;        /* keep line breaks, NO wrapping */
+            display: inline-block;   /* shrink-to-content */
+            min-width: max-content;  /* allow very long lines to define intrinsic width */
+            font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace;
+            tab-size: 2;
+        }
         .cell-stderr {
             background: var(--bg-error);
             border-left: 2px solid var(--border-error);
                 if(output){
                     output.classList.remove('output-stale');
                     let html='';
+                    if (data.stdout) {
+                    html += '<div class="cell-stdout"><pre class="stdout-text">'
+                        + escapeHtml(data.stdout)
+                        + '</pre></div>';
+                    }
                     console.log('UV Logs:', data);
                     if(data.stderr) {
                         // Split UV logs from regular stderr
             }
         }
+        // // Live reload functionality (robust SSE handling)
+        // (function(){
+        //     if (!('EventSource' in window)) {
+        //         console.warn('SSE not supported in this browser');
+        //         return;
+        //     }
+        //     let source = new EventSource('/events');
+        //     let isOpen = false;
+        //     source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
+        //     source.onmessage = function(e){
+        //         const msg=(e.data||'').trim(); if(!msg) return;
+        //         console.log('SSE message:', msg);
+        //         if (msg==='reload' || msg==='incremental') { location.reload(); }
+        //         // Ignore 'loading' to avoid premature reload loops
+        //     };
+        //     source.onerror = function(e){
+        //         // Let EventSource auto-reconnect instead of forcing a reload
+        //         if (isOpen) console.warn('SSE error after open, retrying...', e);
+        //     };
+        //     window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
+        // })();
         document.addEventListener('DOMContentLoaded', function() {
     <div class="system-info">
         <div class="system-info-header">Generated on:</div>
         <div class="system-info-content">
+            Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
         </div>
     </div>
     <div class="main-content">
         <h1>HF Kernels - Flash Attention</h1>
 <h2>HuggingFace Kernels Flash Attention Benchmark</h2>
+<div class="cell" id="cell-benchmark">
 <div class="cell-header">
 <span class="collapse-indicators">
 <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> |
+Cell: benchmark | 5.95s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 <span class="c1"># ]</span>
 <span class="c1">#</span>
 <span class="c1"># [tool.uv.sources]</span>
+<span class="c1"># kernels-benchmark-tools = { path = &quot;../../../../../tools&quot;, editable = true }</span>
 <span class="c1"># ///</span>
 <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
 <span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
 </div>
 </div>
 <div id="output-benchmark" class="cell-output">
+<div class="cell-stdout"><pre class="stdout-text">
+======================================================================
+PROFILE TRACE: hf_kernels_flash_attn | flux_L128
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                  hf_kernels_flash_attn         8.36%     154.078us        96.88%       1.786ms       1.786ms       0.000us         0.00%     362.493us     362.493us             1
+                               _flash_attn_9e27194::fwd         3.99%      73.523us        88.52%       1.632ms     543.906us     271.102us       100.00%     362.493us     120.831us             3
+                                  hf_kernels_flash_attn         0.00%       0.000us         0.00%       0.000us       0.000us     272.638us       100.57%     272.638us     272.638us             1
+void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits...         0.00%       0.000us         0.00%       0.000us       0.000us     271.102us       100.00%     271.102us      90.367us             3
+                                Activity Buffer Request        76.97%       1.419ms        76.97%       1.419ms       1.419ms      91.391us        33.71%      91.391us      91.391us             1
+                                 cudaDeviceGetAttribute         0.25%       4.549us         0.25%       4.549us       0.303us       0.000us         0.00%       0.000us       0.000us            15
+                                       aten::empty_like         0.95%      17.511us         2.83%      52.153us      17.384us       0.000us         0.00%       0.000us       0.000us             3
+                                    aten::empty_strided         1.88%      34.642us         1.88%      34.642us      11.547us       0.000us         0.00%       0.000us       0.000us             3
+                                            aten::empty         1.44%      26.603us         1.44%      26.603us       2.956us       0.000us         0.00%       0.000us       0.000us             9
+                                   cudaFuncSetAttribute         0.78%      14.320us         0.78%      14.320us       4.773us       0.000us         0.00%       0.000us       0.000us             3
+                                       cudaLaunchKernel         2.27%      41.882us         2.27%      41.882us      13.961us       0.000us         0.00%       0.000us       0.000us             3
+                                  cudaDeviceSynchronize         3.12%      57.433us         3.12%      57.433us      57.433us       0.000us         0.00%       0.000us       0.000us             1
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+Self CPU time total: 1.843ms
+Self CUDA time total: 271.102us
+======================================================================
+PROFILE TRACE: hf_kernels_flash_attn | flux_L256
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                  hf_kernels_flash_attn         6.38%     115.656us        91.71%       1.662ms       1.662ms       0.000us         0.00%     396.671us     396.671us             1
+                               _flash_attn_9e27194::fwd         2.82%      51.131us        85.33%       1.547ms     515.555us     298.303us       100.00%     396.671us     132.224us             3
+                                  hf_kernels_flash_attn         0.00%       0.000us         0.00%       0.000us       0.000us     299.743us       100.48%     299.743us     299.743us             1
+void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits...         0.00%       0.000us         0.00%       0.000us       0.000us     298.303us       100.00%     298.303us      99.434us             3
+                                Activity Buffer Request        77.99%       1.414ms        77.99%       1.414ms       1.414ms      98.368us        32.98%      98.368us      98.368us             1
+                                 cudaDeviceGetAttribute         0.22%       3.931us         0.22%       3.931us       0.262us       0.000us         0.00%       0.000us       0.000us            15
+                                       aten::empty_like         0.40%       7.190us         1.33%      24.041us       8.014us       0.000us         0.00%       0.000us       0.000us             3
+                                    aten::empty_strided         0.93%      16.851us         0.93%      16.851us       5.617us       0.000us         0.00%       0.000us       0.000us             3
+                                            aten::empty         1.25%      22.681us         1.25%      22.681us       2.520us       0.000us         0.00%       0.000us       0.000us             9
+                                   cudaFuncSetAttribute         0.21%       3.730us         0.21%       3.730us       1.243us       0.000us         0.00%       0.000us       0.000us             3
+                                       cudaLaunchKernel         1.51%      27.451us         1.51%      27.451us       9.150us       0.000us         0.00%       0.000us       0.000us             3
+                                  cudaDeviceSynchronize         8.29%     150.237us         8.29%     150.237us     150.237us       0.000us         0.00%       0.000us       0.000us             1
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+Self CPU time total: 1.813ms
+Self CUDA time total: 298.303us
+======================================================================
+PROFILE TRACE: hf_kernels_flash_attn | flux_L320
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                  hf_kernels_flash_attn         6.16%     112.885us        90.78%       1.663ms       1.663ms       0.000us         0.00%     427.613us     427.613us             1
+                               _flash_attn_9e27194::fwd         2.80%      51.281us        84.62%       1.550ms     516.788us     318.526us       100.00%     427.613us     142.538us             3
+                                  hf_kernels_flash_attn         0.00%       0.000us         0.00%       0.000us       0.000us     319.901us       100.43%     319.901us     319.901us             1
+void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits...         0.00%       0.000us         0.00%       0.000us       0.000us     318.526us       100.00%     318.526us     106.175us             3
+                                Activity Buffer Request        77.28%       1.416ms        77.28%       1.416ms       1.416ms     109.087us        34.25%     109.087us     109.087us             1
+                                 cudaDeviceGetAttribute         0.21%       3.930us         0.21%       3.930us       0.262us       0.000us         0.00%       0.000us       0.000us            15
+                                       aten::empty_like         0.41%       7.431us         1.40%      25.731us       8.577us       0.000us         0.00%       0.000us       0.000us             3
+                                    aten::empty_strided         1.00%      18.300us         1.00%      18.300us       6.100us       0.000us         0.00%       0.000us       0.000us             3
+                                            aten::empty         1.26%      23.051us         1.26%      23.051us       2.561us       0.000us         0.00%       0.000us       0.000us             9
+                                   cudaFuncSetAttribute         0.22%       4.001us         0.22%       4.001us       1.334us       0.000us         0.00%       0.000us       0.000us             3
+                                       cudaLaunchKernel         1.45%      26.532us         1.45%      26.532us       8.844us       0.000us         0.00%       0.000us       0.000us             3
+                                  cudaDeviceSynchronize         9.22%     168.858us         9.22%     168.858us     168.858us       0.000us         0.00%       0.000us       0.000us             1
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+Self CPU time total: 1.832ms
+Self CUDA time total: 318.526us
+======================================================================
+PROFILE TRACE: hf_kernels_flash_attn | flux_L384
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                  hf_kernels_flash_attn         5.43%     111.055us        91.19%       1.866ms       1.866ms       0.000us         0.00%     446.776us     446.776us             1
+                               _flash_attn_9e27194::fwd         2.54%      51.901us        85.76%       1.755ms     584.928us     331.162us       100.00%     446.776us     148.925us             3
+                                  hf_kernels_flash_attn         0.00%       0.000us         0.00%       0.000us       0.000us     332.667us       100.45%     332.667us     332.667us             1
+void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits...         0.00%       0.000us         0.00%       0.000us       0.000us     331.162us       100.00%     331.162us     110.387us             3
+                                Activity Buffer Request        69.78%       1.428ms        69.78%       1.428ms       1.428ms     115.614us        34.91%     115.614us     115.614us             1
+                                 cudaDeviceGetAttribute         0.19%       3.942us         0.19%       3.942us       0.263us       0.000us         0.00%       0.000us       0.000us            15
+                                       aten::empty_like         0.39%       8.070us         1.24%      25.461us       8.487us       0.000us         0.00%       0.000us       0.000us             3
+                                    aten::empty_strided         0.85%      17.391us         0.85%      17.391us       5.797us       0.000us         0.00%       0.000us       0.000us             3
+                                            aten::empty         1.08%      22.080us         1.08%      22.080us       2.453us       0.000us         0.00%       0.000us       0.000us             9
+                                   cudaFuncSetAttribute         0.19%       3.861us         0.19%       3.861us       1.287us       0.000us         0.00%       0.000us       0.000us             3
+                                       cudaLaunchKernel        10.75%     219.880us        10.75%     219.880us      73.293us       0.000us         0.00%       0.000us       0.000us             3
+                                  cudaDeviceSynchronize         8.81%     180.219us         8.81%     180.219us     180.219us       0.000us         0.00%       0.000us       0.000us             1
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+Self CPU time total: 2.046ms
+Self CUDA time total: 331.162us
+======================================================================
+PROFILE TRACE: hf_kernels_flash_attn | flux_L448
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                  hf_kernels_flash_attn         4.92%     108.784us        84.29%       1.864ms       1.864ms       0.000us         0.00%     663.288us     663.288us             1
+                               _flash_attn_9e27194::fwd         2.26%      49.951us        79.37%       1.755ms     585.135us     493.882us       100.00%     663.288us     221.096us             3
+                                  hf_kernels_flash_attn         0.00%       0.000us         0.00%       0.000us       0.000us     495.418us       100.31%     495.418us     495.418us             1
+void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits...         0.00%       0.000us         0.00%       0.000us       0.000us     493.882us       100.00%     493.882us     164.627us             3
+                                Activity Buffer Request        65.22%       1.442ms        65.22%       1.442ms       1.442ms     169.406us        34.30%     169.406us     169.406us             1
+                                 cudaDeviceGetAttribute         0.18%       3.990us         0.18%       3.990us       0.266us       0.000us         0.00%       0.000us       0.000us            15
+                                       aten::empty_like         0.34%       7.522us         1.12%      24.742us       8.247us       0.000us         0.00%       0.000us       0.000us             3
+                                    aten::empty_strided         0.78%      17.220us         0.78%      17.220us       5.740us       0.000us         0.00%       0.000us       0.000us             3
+                                            aten::empty         0.96%      21.140us         0.96%      21.140us       2.349us       0.000us         0.00%       0.000us       0.000us             9
+                                   cudaFuncSetAttribute         0.19%       4.121us         0.19%       4.121us       1.374us       0.000us         0.00%       0.000us       0.000us             3
+                                       cudaLaunchKernel         9.45%     209.092us         9.45%     209.092us      69.697us       0.000us         0.00%       0.000us       0.000us             3
+                                  cudaDeviceSynchronize        15.71%     347.407us        15.71%     347.407us     347.407us       0.000us         0.00%       0.000us       0.000us             1
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+Self CPU time total: 2.212ms
+Self CUDA time total: 493.882us
+======================================================================
+PROFILE TRACE: hf_kernels_flash_attn | flux_L512
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                  hf_kernels_flash_attn         4.96%     110.355us        83.23%       1.852ms       1.852ms       0.000us         0.00%     697.540us     697.540us             1
+                               _flash_attn_9e27194::fwd         2.27%      50.469us        78.28%       1.742ms     580.665us     518.659us       100.00%     697.540us     232.513us             3
+                                  hf_kernels_flash_attn         0.00%       0.000us         0.00%       0.000us       0.000us     520.068us       100.27%     520.068us     520.068us             1
+void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits...         0.00%       0.000us         0.00%       0.000us       0.000us     518.659us       100.00%     518.659us     172.886us             3
+                                Activity Buffer Request        64.27%       1.430ms        64.27%       1.430ms       1.430ms     178.881us        34.49%     178.881us     178.881us             1
+                                 cudaDeviceGetAttribute         0.17%       3.832us         0.17%       3.832us       0.255us       0.000us         0.00%       0.000us       0.000us            15
+                                       aten::empty_like         0.33%       7.341us         1.15%      25.571us       8.524us       0.000us         0.00%       0.000us       0.000us             3
+                                    aten::empty_strided         0.82%      18.230us         0.82%      18.230us       6.077us       0.000us         0.00%       0.000us       0.000us             3
+                                            aten::empty         0.94%      20.812us         0.94%      20.812us       2.312us       0.000us         0.00%       0.000us       0.000us             9
+                                   cudaFuncSetAttribute         0.19%       4.171us         0.19%       4.171us       1.390us       0.000us         0.00%       0.000us       0.000us             3
+                                       cudaLaunchKernel         9.29%     206.809us         9.29%     206.809us      68.936us       0.000us         0.00%       0.000us       0.000us             3
+                                  cudaDeviceSynchronize        16.77%     373.119us        16.77%     373.119us     373.119us       0.000us         0.00%       0.000us       0.000us             1
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+Self CPU time total: 2.225ms
+Self CUDA time total: 518.659us
+impl                     wl                  p50(ms)  ok
+hf_kernels_flash_attn    flux_L128              0.12  True
+hf_kernels_flash_attn    flux_L256              0.14  True
+hf_kernels_flash_attn    flux_L320              0.14  True
+hf_kernels_flash_attn    flux_L384              0.15  True
+hf_kernels_flash_attn    flux_L448              0.20  True
+hf_kernels_flash_attn    flux_L512              0.20  True
+</pre></div>
+<div class="cell-stderr">
+Fetching 20 files:   0%|          | 0/20 [00:00&lt;?, ?it/s]
+Fetching 20 files:  10%|█         | 2/20 [00:01&lt;00:16,  1.08it/s]
+Fetching 20 files: 100%|██████████| 20/20 [00:01&lt;00:00, 10.78it/s]
+</div>
+<div class="cell-artifacts">
+<h4>Artifacts:</h4>
+<a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>
 </div>
 </div>
 </div>

flash_attn/impls/hf_kernels_flash_attn3.html CHANGED Viewed

@@ -706,6 +706,29 @@
             white-space: pre-wrap;
             color: var(--text-primary);
         }
         .cell-stderr {
             background: var(--bg-error);
             border-left: 2px solid var(--border-error);
@@ -3556,7 +3579,12 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
                 if(output){
                     output.classList.remove('output-stale');
                     let html='';
-                    if(data.stdout) html+='<div class="cell-stdout">'+escapeHtml(data.stdout)+'</div>';
                     console.log('UV Logs:', data);
                     if(data.stderr) {
                         // Split UV logs from regular stderr
@@ -3678,27 +3706,27 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
             }
         }
-        // Live reload functionality (robust SSE handling)
-        (function(){
-            if (!('EventSource' in window)) {
-                console.warn('SSE not supported in this browser');
-                return;
-            }
-            let source = new EventSource('/events');
-            let isOpen = false;
-            source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
-            source.onmessage = function(e){
-                const msg=(e.data||'').trim(); if(!msg) return;
-                console.log('SSE message:', msg);
-                if (msg==='reload' || msg==='incremental') { location.reload(); }
-                // Ignore 'loading' to avoid premature reload loops
-            };
-            source.onerror = function(e){
-                // Let EventSource auto-reconnect instead of forcing a reload
-                if (isOpen) console.warn('SSE error after open, retrying...', e);
-            };
-            window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
-        })();
         document.addEventListener('DOMContentLoaded', function() {
@@ -3829,21 +3857,21 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
     <div class="system-info">
         <div class="system-info-header">Generated on:</div>
         <div class="system-info-content">
-            Linux x86_64 | Linux-5.10.244-240.965.amzn2.x86_64-x86_64-with-glibc2.35
         </div>
     </div>
     <div class="main-content">
         <h1>HF Kernels - Flash Attention 3</h1>
 <h2>HuggingFace Kernels Flash Attention 3 Benchmark</h2>
-<div class="cell cell-failed" id="cell-benchmark">
 <div class="cell-header">
 <span class="collapse-indicators">
 <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> |
-Cell: benchmark | 0.05s | FAILED
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3856,13 +3884,13 @@ Cell: benchmark | 0.05s | FAILED
 <span class="c1"># requires-python = &quot;&gt;=3.10&quot;</span>
 <span class="c1"># dependencies = [</span>
 <span class="c1">#     &quot;numpy&quot;,</span>
-<span class="c1">#     &quot;torch&quot;,</span>
 <span class="c1">#     &quot;kernels-benchmark-tools&quot;,</span>
 <span class="c1">#     &quot;kernels&quot;,</span>
 <span class="c1"># ]</span>
 <span class="c1">#</span>
 <span class="c1"># [tool.uv.sources]</span>
-<span class="c1"># kernels-benchmark-tools = { path = &quot;/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools&quot;, editable = true }</span>
 <span class="c1"># ///</span>
 <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
 <span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
@@ -3930,9 +3958,154 @@ Cell: benchmark | 0.05s | FAILED
 </div>
 </div>
 <div id="output-benchmark" class="cell-output">
-<div class="cell-stderr">  × Failed to resolve script requirement
-  ╰─▶ Distribution not found at:
-      file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
 </div>
 </div>
 </div>

             white-space: pre-wrap;
             color: var(--text-primary);
         }
+        .cell-stdout {
+            background: var(--bg-tertiary);
+            padding: 0.75rem;
+            border-radius: 1px;
+            font-family: inherit;
+            font-size: 0.9rem;
+            color: var(--text-primary);
+            /* key bits */
+            overflow: auto;          /* show scrollbars when needed */
+            max-width: 100%;         /* respects whatever layout width you give it */
+        }
+        .cell-stdout .stdout-text {
+            margin: 0;               /* reset pre default margin */
+            white-space: pre;        /* keep line breaks, NO wrapping */
+            display: inline-block;   /* shrink-to-content */
+            min-width: max-content;  /* allow very long lines to define intrinsic width */
+            font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace;
+            tab-size: 2;
+        }
         .cell-stderr {
             background: var(--bg-error);
             border-left: 2px solid var(--border-error);
                 if(output){
                     output.classList.remove('output-stale');
                     let html='';
+                    if (data.stdout) {
+                    html += '<div class="cell-stdout"><pre class="stdout-text">'
+                        + escapeHtml(data.stdout)
+                        + '</pre></div>';
+                    }
                     console.log('UV Logs:', data);
                     if(data.stderr) {
                         // Split UV logs from regular stderr
             }
         }
+        // // Live reload functionality (robust SSE handling)
+        // (function(){
+        //     if (!('EventSource' in window)) {
+        //         console.warn('SSE not supported in this browser');
+        //         return;
+        //     }
+        //     let source = new EventSource('/events');
+        //     let isOpen = false;
+        //     source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
+        //     source.onmessage = function(e){
+        //         const msg=(e.data||'').trim(); if(!msg) return;
+        //         console.log('SSE message:', msg);
+        //         if (msg==='reload' || msg==='incremental') { location.reload(); }
+        //         // Ignore 'loading' to avoid premature reload loops
+        //     };
+        //     source.onerror = function(e){
+        //         // Let EventSource auto-reconnect instead of forcing a reload
+        //         if (isOpen) console.warn('SSE error after open, retrying...', e);
+        //     };
+        //     window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
+        // })();
         document.addEventListener('DOMContentLoaded', function() {
     <div class="system-info">
         <div class="system-info-header">Generated on:</div>
         <div class="system-info-content">
+            Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
         </div>
     </div>
     <div class="main-content">
         <h1>HF Kernels - Flash Attention 3</h1>
 <h2>HuggingFace Kernels Flash Attention 3 Benchmark</h2>
+<div class="cell" id="cell-benchmark">
 <div class="cell-header">
 <span class="collapse-indicators">
 <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> |
+Cell: benchmark | 5.65s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 <span class="c1"># requires-python = &quot;&gt;=3.10&quot;</span>
 <span class="c1"># dependencies = [</span>
 <span class="c1">#     &quot;numpy&quot;,</span>
+<span class="c1">#     &quot;torch==2.8.0&quot;,</span>
 <span class="c1">#     &quot;kernels-benchmark-tools&quot;,</span>
 <span class="c1">#     &quot;kernels&quot;,</span>
 <span class="c1"># ]</span>
 <span class="c1">#</span>
 <span class="c1"># [tool.uv.sources]</span>
+<span class="c1"># kernels-benchmark-tools = { path = &quot;../../../../../tools&quot;, editable = true }</span>
 <span class="c1"># ///</span>
 <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
 <span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
 </div>
 </div>
 <div id="output-benchmark" class="cell-output">
+<div class="cell-stdout"><pre class="stdout-text">
+======================================================================
+PROFILE TRACE: hf_kernels_flash_attn3 | flux_L128
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                 hf_kernels_flash_attn3         9.00%     178.129us        99.63%       1.971ms       1.971ms       0.000us         0.00%     345.823us     345.823us             1
+                                          FlashAttnFunc         6.66%     131.797us        90.63%       1.793ms     597.659us       0.000us         0.00%     345.823us     115.274us             3
+                        _flash_attn3_48fe103_dirty::fwd         4.56%      90.256us        83.97%       1.661ms     553.727us     259.583us       100.00%     345.823us     115.274us             3
+                                 hf_kernels_flash_attn3         0.00%       0.000us         0.00%       0.000us       0.000us     292.158us       112.55%     292.158us     292.158us             1
+void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us     259.583us       100.00%     259.583us      86.528us             3
+                                Activity Buffer Request        73.82%       1.460ms        73.82%       1.460ms       1.460ms      86.240us        33.22%      86.240us      86.240us             1
+                                            aten::empty         2.53%      50.052us         2.53%      50.052us       8.342us       0.000us         0.00%       0.000us       0.000us             6
+                                   cudaFuncSetAttribute         0.86%      16.921us         0.86%      16.921us       5.640us       0.000us         0.00%       0.000us       0.000us             3
+                                       cudaLaunchKernel         2.20%      43.551us         2.20%      43.551us      14.517us       0.000us         0.00%       0.000us       0.000us             3
+                                  cudaDeviceSynchronize         0.37%       7.311us         0.37%       7.311us       7.311us       0.000us         0.00%       0.000us       0.000us             1
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+Self CPU time total: 1.978ms
+Self CUDA time total: 259.583us
+======================================================================
+PROFILE TRACE: hf_kernels_flash_attn3 | flux_L256
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                 hf_kernels_flash_attn3         7.20%     133.787us        96.41%       1.793ms       1.793ms       0.000us         0.00%     393.753us     393.753us             1
+                                          FlashAttnFunc         5.05%      93.854us        89.22%       1.659ms     552.953us       0.000us         0.00%     393.753us     131.251us             3
+                        _flash_attn3_48fe103_dirty::fwd         2.68%      49.913us        84.17%       1.565ms     521.669us     293.595us       100.00%     393.753us     131.251us             3
+                                 hf_kernels_flash_attn3         0.00%       0.000us         0.00%       0.000us       0.000us     295.003us       100.48%     295.003us     295.003us             1
+void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us     293.595us       100.00%     293.595us      97.865us             3
+                                Activity Buffer Request        78.08%       1.452ms        78.08%       1.452ms       1.452ms     100.158us        34.11%     100.158us     100.158us             1
+                                            aten::empty         1.44%      26.770us         1.44%      26.770us       4.462us       0.000us         0.00%       0.000us       0.000us             6
+                                   cudaFuncSetAttribute         0.31%       5.680us         0.31%       5.680us       1.893us       0.000us         0.00%       0.000us       0.000us             3
+                                       cudaLaunchKernel         1.66%      30.852us         1.66%      30.852us      10.284us       0.000us         0.00%       0.000us       0.000us             3
+                                  cudaDeviceSynchronize         3.59%      66.713us         3.59%      66.713us      66.713us       0.000us         0.00%       0.000us       0.000us             1
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+Self CPU time total: 1.859ms
+Self CUDA time total: 293.595us
+======================================================================
+PROFILE TRACE: hf_kernels_flash_attn3 | flux_L320
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                 hf_kernels_flash_attn3         6.76%     125.695us        94.13%       1.750ms       1.750ms       0.000us         0.00%     430.748us     430.748us             1
+                                          FlashAttnFunc         4.90%      91.016us        87.37%       1.624ms     541.277us       0.000us         0.00%     430.748us     143.583us             3
+                        _flash_attn3_48fe103_dirty::fwd         2.79%      51.770us        82.47%       1.533ms     510.938us     324.541us       100.00%     430.748us     143.583us             3
+                                 hf_kernels_flash_attn3         0.00%       0.000us         0.00%       0.000us       0.000us     325.948us       100.43%     325.948us     325.948us             1
+void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us     324.541us       100.00%     324.541us     108.180us             3
+                                Activity Buffer Request        76.46%       1.421ms        76.46%       1.421ms       1.421ms     106.207us        32.73%     106.207us     106.207us             1
+                                            aten::empty         1.41%      26.162us         1.41%      26.162us       4.360us       0.000us         0.00%       0.000us       0.000us             6
+                                   cudaFuncSetAttribute         0.27%       5.061us         0.27%       5.061us       1.687us       0.000us         0.00%       0.000us       0.000us             3
+                                       cudaLaunchKernel         1.55%      28.862us         1.55%      28.862us       9.621us       0.000us         0.00%       0.000us       0.000us             3
+                                  cudaDeviceSynchronize         5.87%     109.015us         5.87%     109.015us     109.015us       0.000us         0.00%       0.000us       0.000us             1
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+Self CPU time total: 1.859ms
+Self CUDA time total: 324.541us
+======================================================================
+PROFILE TRACE: hf_kernels_flash_attn3 | flux_L384
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                 hf_kernels_flash_attn3         6.04%     124.874us        95.07%       1.964ms       1.964ms       0.000us         0.00%     429.567us     429.567us             1
+                                          FlashAttnFunc         4.57%      94.345us        89.03%       1.840ms     613.174us       0.000us         0.00%     429.567us     143.189us             3
+                        _flash_attn3_48fe103_dirty::fwd         2.60%      53.754us        84.46%       1.745ms     581.725us     322.591us       100.00%     429.567us     143.189us             3
+                                 hf_kernels_flash_attn3         0.00%       0.000us         0.00%       0.000us       0.000us     324.063us       100.46%     324.063us     324.063us             1
+void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us     322.591us       100.00%     322.591us     107.530us             3
+                                Activity Buffer Request        69.43%       1.434ms        69.43%       1.434ms       1.434ms     106.976us        33.16%     106.976us     106.976us             1
+                                            aten::empty         1.29%      26.591us         1.29%      26.591us       4.432us       0.000us         0.00%       0.000us       0.000us             6
+                                   cudaFuncSetAttribute         0.25%       5.220us         0.25%       5.220us       1.740us       0.000us         0.00%       0.000us       0.000us             3
+                                       cudaLaunchKernel        10.90%     225.141us        10.90%     225.141us      75.047us       0.000us         0.00%       0.000us       0.000us             3
+                                  cudaDeviceSynchronize         4.93%     101.805us         4.93%     101.805us     101.805us       0.000us         0.00%       0.000us       0.000us             1
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+Self CPU time total: 2.066ms
+Self CUDA time total: 322.591us
+======================================================================
+PROFILE TRACE: hf_kernels_flash_attn3 | flux_L448
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                 hf_kernels_flash_attn3         5.77%     124.745us        87.87%       1.900ms       1.900ms       0.000us         0.00%     654.301us     654.301us             1
+                                          FlashAttnFunc         4.37%      94.576us        82.10%       1.775ms     591.589us       0.000us         0.00%     654.301us     218.100us             3
+                        _flash_attn3_48fe103_dirty::fwd         2.37%      51.203us        77.72%       1.680ms     560.064us     488.670us       100.00%     654.301us     218.100us             3
+                                 hf_kernels_flash_attn3         0.00%       0.000us         0.00%       0.000us       0.000us     490.142us       100.30%     490.142us     490.142us             1
+void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us     488.670us       100.00%     488.670us     162.890us             3
+                                Activity Buffer Request        66.37%       1.435ms        66.37%       1.435ms       1.435ms     165.631us        33.89%     165.631us     165.631us             1
+                                            aten::empty         1.25%      26.990us         1.25%      26.990us       4.498us       0.000us         0.00%       0.000us       0.000us             6
+                                   cudaFuncSetAttribute         0.24%       5.250us         0.24%       5.250us       1.750us       0.000us         0.00%       0.000us       0.000us             3
+                                       cudaLaunchKernel         7.49%     161.858us         7.49%     161.858us      53.953us       0.000us         0.00%       0.000us       0.000us             3
+                                  cudaDeviceSynchronize        12.13%     262.313us        12.13%     262.313us     262.313us       0.000us         0.00%       0.000us       0.000us             1
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+Self CPU time total: 2.162ms
+Self CUDA time total: 488.670us
+======================================================================
+PROFILE TRACE: hf_kernels_flash_attn3 | flux_L512
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                 hf_kernels_flash_attn3         5.69%     119.216us        86.59%       1.815ms       1.815ms       0.000us         0.00%     666.625us     666.625us             1
+                                          FlashAttnFunc         4.40%      92.224us        80.91%       1.696ms     565.401us       0.000us         0.00%     666.625us     222.208us             3
+                        _flash_attn3_48fe103_dirty::fwd         2.44%      51.234us        76.51%       1.604ms     534.659us     497.473us       100.00%     666.625us     222.208us             3
+                                 hf_kernels_flash_attn3         0.00%       0.000us         0.00%       0.000us       0.000us     498.849us       100.28%     498.849us     498.849us             1
+void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us     497.473us       100.00%     497.473us     165.824us             3
+                                Activity Buffer Request        64.99%       1.363ms        64.99%       1.363ms       1.363ms     169.152us        34.00%     169.152us     169.152us             1
+                                            aten::empty         1.25%      26.300us         1.25%      26.300us       4.383us       0.000us         0.00%       0.000us       0.000us             6
+                                   cudaFuncSetAttribute         0.27%       5.600us         0.27%       5.600us       1.867us       0.000us         0.00%       0.000us       0.000us             3
+                                       cudaLaunchKernel         7.55%     158.288us         7.55%     158.288us      52.763us       0.000us         0.00%       0.000us       0.000us             3
+                                  cudaDeviceSynchronize        13.41%     281.113us        13.41%     281.113us     281.113us       0.000us         0.00%       0.000us       0.000us             1
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+Self CPU time total: 2.097ms
+Self CUDA time total: 497.473us
+impl                     wl                  p50(ms)  ok
+hf_kernels_flash_attn3   flux_L128              0.13  True
+hf_kernels_flash_attn3   flux_L256              0.15  True
+hf_kernels_flash_attn3   flux_L320              0.16  True
+hf_kernels_flash_attn3   flux_L384              0.16  True
+hf_kernels_flash_attn3   flux_L448              0.21  True
+hf_kernels_flash_attn3   flux_L512              0.21  True
+</pre></div>
+<div class="cell-stderr">
+Fetching 4 files:   0%|          | 0/4 [00:00&lt;?, ?it/s]
+Fetching 4 files:  50%|█████     | 2/4 [00:01&lt;00:01,  1.23it/s]
+Fetching 4 files: 100%|██████████| 4/4 [00:01&lt;00:00,  2.46it/s]
+</div>
+<div class="cell-artifacts">
+<h4>Artifacts:</h4>
+<a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>
 </div>
 </div>
 </div>

flash_attn/impls/mem_efficient_attention.html CHANGED Viewed

@@ -706,6 +706,29 @@
             white-space: pre-wrap;
             color: var(--text-primary);
         }
         .cell-stderr {
             background: var(--bg-error);
             border-left: 2px solid var(--border-error);
@@ -3556,7 +3579,12 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
                 if(output){
                     output.classList.remove('output-stale');
                     let html='';
-                    if(data.stdout) html+='<div class="cell-stdout">'+escapeHtml(data.stdout)+'</div>';
                     console.log('UV Logs:', data);
                     if(data.stderr) {
                         // Split UV logs from regular stderr
@@ -3678,27 +3706,27 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
             }
         }
-        // Live reload functionality (robust SSE handling)
-        (function(){
-            if (!('EventSource' in window)) {
-                console.warn('SSE not supported in this browser');
-                return;
-            }
-            let source = new EventSource('/events');
-            let isOpen = false;
-            source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
-            source.onmessage = function(e){
-                const msg=(e.data||'').trim(); if(!msg) return;
-                console.log('SSE message:', msg);
-                if (msg==='reload' || msg==='incremental') { location.reload(); }
-                // Ignore 'loading' to avoid premature reload loops
-            };
-            source.onerror = function(e){
-                // Let EventSource auto-reconnect instead of forcing a reload
-                if (isOpen) console.warn('SSE error after open, retrying...', e);
-            };
-            window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
-        })();
         document.addEventListener('DOMContentLoaded', function() {
@@ -3829,21 +3857,21 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
     <div class="system-info">
         <div class="system-info-header">Generated on:</div>
         <div class="system-info-content">
-            Linux x86_64 | Linux-5.10.244-240.965.amzn2.x86_64-x86_64-with-glibc2.35
         </div>
     </div>
     <div class="main-content">
         <h1>Memory Efficient Attention Implementation</h1>
 <h2>Memory Efficient SDPA Benchmark</h2>
-<div class="cell cell-failed" id="cell-benchmark">
 <div class="cell-header">
 <span class="collapse-indicators">
 <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> |
-Cell: benchmark | 0.01s | FAILED
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3855,12 +3883,12 @@ Cell: benchmark | 0.01s | FAILED
 <span class="c1"># requires-python = &quot;&gt;=3.10&quot;</span>
 <span class="c1"># dependencies = [</span>
 <span class="c1">#     &quot;numpy&quot;,</span>
-<span class="c1">#     &quot;torch&quot;,</span>
 <span class="c1">#     &quot;kernels-benchmark-tools&quot;,</span>
 <span class="c1"># ]</span>
 <span class="c1">#</span>
 <span class="c1"># [tool.uv.sources]</span>
-<span class="c1"># kernels-benchmark-tools = { path = &quot;/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools&quot;, editable = true }</span>
 <span class="c1"># ///</span>
 <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
 <span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
@@ -3926,9 +3954,203 @@ Cell: benchmark | 0.01s | FAILED
 </div>
 </div>
 <div id="output-benchmark" class="cell-output">
-<div class="cell-stderr">  × Failed to resolve script requirement
-  ╰─▶ Distribution not found at:
-      file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
 </div>
 </div>
 </div>

             white-space: pre-wrap;
             color: var(--text-primary);
         }
+        .cell-stdout {
+            background: var(--bg-tertiary);
+            padding: 0.75rem;
+            border-radius: 1px;
+            font-family: inherit;
+            font-size: 0.9rem;
+            color: var(--text-primary);
+            /* key bits */
+            overflow: auto;          /* show scrollbars when needed */
+            max-width: 100%;         /* respects whatever layout width you give it */
+        }
+        .cell-stdout .stdout-text {
+            margin: 0;               /* reset pre default margin */
+            white-space: pre;        /* keep line breaks, NO wrapping */
+            display: inline-block;   /* shrink-to-content */
+            min-width: max-content;  /* allow very long lines to define intrinsic width */
+            font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace;
+            tab-size: 2;
+        }
         .cell-stderr {
             background: var(--bg-error);
             border-left: 2px solid var(--border-error);
                 if(output){
                     output.classList.remove('output-stale');
                     let html='';
+                    if (data.stdout) {
+                    html += '<div class="cell-stdout"><pre class="stdout-text">'
+                        + escapeHtml(data.stdout)
+                        + '</pre></div>';
+                    }
                     console.log('UV Logs:', data);
                     if(data.stderr) {
                         // Split UV logs from regular stderr
             }
         }
+        // // Live reload functionality (robust SSE handling)
+        // (function(){
+        //     if (!('EventSource' in window)) {
+        //         console.warn('SSE not supported in this browser');
+        //         return;
+        //     }
+        //     let source = new EventSource('/events');
+        //     let isOpen = false;
+        //     source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
+        //     source.onmessage = function(e){
+        //         const msg=(e.data||'').trim(); if(!msg) return;
+        //         console.log('SSE message:', msg);
+        //         if (msg==='reload' || msg==='incremental') { location.reload(); }
+        //         // Ignore 'loading' to avoid premature reload loops
+        //     };
+        //     source.onerror = function(e){
+        //         // Let EventSource auto-reconnect instead of forcing a reload
+        //         if (isOpen) console.warn('SSE error after open, retrying...', e);
+        //     };
+        //     window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
+        // })();
         document.addEventListener('DOMContentLoaded', function() {
     <div class="system-info">
         <div class="system-info-header">Generated on:</div>
         <div class="system-info-content">
+            Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
         </div>
     </div>
     <div class="main-content">
         <h1>Memory Efficient Attention Implementation</h1>
 <h2>Memory Efficient SDPA Benchmark</h2>
+<div class="cell" id="cell-benchmark">
 <div class="cell-header">
 <span class="collapse-indicators">
 <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> |
+Cell: benchmark | 3.60s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 <span class="c1"># requires-python = &quot;&gt;=3.10&quot;</span>
 <span class="c1"># dependencies = [</span>
 <span class="c1">#     &quot;numpy&quot;,</span>
+<span class="c1">#     &quot;torch==2.8.0&quot;,</span>
 <span class="c1">#     &quot;kernels-benchmark-tools&quot;,</span>
 <span class="c1"># ]</span>
 <span class="c1">#</span>
 <span class="c1"># [tool.uv.sources]</span>
+<span class="c1"># kernels-benchmark-tools = { path = &quot;../../../../../tools&quot;, editable = true }</span>
 <span class="c1"># ///</span>
 <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
 <span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
 </div>
 </div>
 <div id="output-benchmark" class="cell-output">
+<div class="cell-stdout"><pre class="stdout-text">
+======================================================================
+PROFILE TRACE: torch_mem_eff | flux_L128
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                          torch_mem_eff         0.00%       0.000us         0.00%       0.000us       0.000us     743.839us       143.68%     743.839us     743.839us             1
+                                          torch_mem_eff        14.97%     353.534us        98.94%       2.336ms       2.336ms       0.000us         0.00%     525.535us     525.535us             1
+                     aten::scaled_dot_product_attention         1.34%      31.582us         7.53%     177.879us      59.293us       0.000us         0.00%     451.039us     150.346us             3
+          aten::_scaled_dot_product_efficient_attention         0.99%      23.447us         6.20%     146.297us      48.766us       0.000us         0.00%     451.039us     150.346us             3
+                     aten::_efficient_attention_forward         1.49%      35.270us         4.27%     100.806us      33.602us     451.039us        87.12%     451.039us     150.346us             3
+fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem...         0.00%       0.000us         0.00%       0.000us       0.000us     451.039us        87.12%     451.039us     150.346us             3
+                                       aten::contiguous         0.56%      13.241us        73.52%       1.736ms     192.899us       0.000us         0.00%      74.496us       8.277us             9
+                                            aten::clone         1.47%      34.702us        72.96%       1.723ms     191.428us       0.000us         0.00%      74.496us       8.277us             9
+                                            aten::copy_         3.23%      76.247us        68.33%       1.614ms     179.290us      66.656us        12.88%      74.496us       8.277us             9
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      66.656us        12.88%      66.656us       7.406us             9
+                                Activity Buffer Request        61.73%       1.458ms        61.73%       1.458ms       1.458ms       7.840us         1.51%       7.840us       7.840us             1
+                                        aten::transpose         2.92%      68.989us         3.85%      90.910us       3.788us       0.000us         0.00%       0.000us       0.000us            24
+                                       aten::as_strided         0.93%      21.921us         0.93%      21.921us       0.913us       0.000us         0.00%       0.000us       0.000us            24
+                                       aten::empty_like         0.77%      18.239us         3.16%      74.542us       8.282us       0.000us         0.00%       0.000us       0.000us             9
+                                            aten::empty         3.59%      84.706us         3.59%      84.706us       4.034us       0.000us         0.00%       0.000us       0.000us            21
+                                       cudaLaunchKernel         4.35%     102.715us         4.35%     102.715us       8.560us       0.000us         0.00%       0.000us       0.000us            12
+                                  cudaStreamIsCapturing         0.16%       3.710us         0.16%       3.710us       1.237us       0.000us         0.00%       0.000us       0.000us             3
+                                   cudaFuncSetAttribute         0.44%      10.440us         0.44%      10.440us       3.480us       0.000us         0.00%       0.000us       0.000us             3
+                                  cudaDeviceSynchronize         1.06%      24.961us         1.06%      24.961us      24.961us       0.000us         0.00%       0.000us       0.000us             1
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+Self CPU time total: 2.361ms
+Self CUDA time total: 517.695us
+======================================================================
+PROFILE TRACE: torch_mem_eff | flux_L256
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                          torch_mem_eff         0.00%       0.000us         0.00%       0.000us       0.000us     704.155us       121.71%     704.155us     704.155us             1
+                                          torch_mem_eff        11.29%     250.325us        93.54%       2.073ms       2.073ms       0.000us         0.00%     586.972us     586.972us             1
+                     aten::scaled_dot_product_attention         0.83%      18.299us         6.32%     139.996us      46.665us       0.000us         0.00%     507.229us     169.076us             3
+          aten::_scaled_dot_product_efficient_attention         0.91%      20.123us         5.49%     121.697us      40.566us       0.000us         0.00%     507.229us     169.076us             3
+                     aten::_efficient_attention_forward         1.32%      29.201us         3.61%      80.034us      26.678us     507.229us        87.67%     507.229us     169.076us             3
+fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem...         0.00%       0.000us         0.00%       0.000us       0.000us     507.229us        87.67%     507.229us     169.076us             3
+                                       aten::contiguous         0.32%       7.068us        74.05%       1.641ms     182.386us       0.000us         0.00%      79.743us       8.860us             9
+                                            aten::clone         1.01%      22.352us        73.73%       1.634ms     181.601us       0.000us         0.00%      79.743us       8.860us             9
+                                            aten::copy_         2.89%      63.964us        70.44%       1.562ms     173.503us      71.327us        12.33%      79.743us       8.860us             9
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      71.327us        12.33%      71.327us       7.925us             9
+                                Activity Buffer Request        64.67%       1.433ms        64.67%       1.433ms       1.433ms       8.416us         1.45%       8.416us       8.416us             1
+                                        aten::transpose         2.15%      47.759us         2.85%      63.231us       2.635us       0.000us         0.00%       0.000us       0.000us            24
+                                       aten::as_strided         0.70%      15.472us         0.70%      15.472us       0.645us       0.000us         0.00%       0.000us       0.000us            24
+                                       aten::empty_like         0.52%      11.480us         2.28%      50.532us       5.615us       0.000us         0.00%       0.000us       0.000us             9
+                                            aten::empty         2.90%      64.203us         2.90%      64.203us       3.057us       0.000us         0.00%       0.000us       0.000us            21
+                                       cudaLaunchKernel         3.80%      84.195us         3.80%      84.195us       7.016us       0.000us         0.00%       0.000us       0.000us            12
+                                  cudaStreamIsCapturing         0.10%       2.170us         0.10%       2.170us       0.723us       0.000us         0.00%       0.000us       0.000us             3
+                                   cudaFuncSetAttribute         0.15%       3.380us         0.15%       3.380us       1.127us       0.000us         0.00%       0.000us       0.000us             3
+                                  cudaDeviceSynchronize         6.46%     143.197us         6.46%     143.197us     143.197us       0.000us         0.00%       0.000us       0.000us             1
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+Self CPU time total: 2.217ms
+Self CUDA time total: 578.556us
+======================================================================
+PROFILE TRACE: torch_mem_eff | flux_L320
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                          torch_mem_eff         0.00%       0.000us         0.00%       0.000us       0.000us     741.345us       118.22%     741.345us     741.345us             1
+                                          torch_mem_eff        10.83%     244.352us        91.98%       2.075ms       2.075ms       0.000us         0.00%     636.768us     636.768us             1
+                     aten::scaled_dot_product_attention         0.80%      18.001us         6.18%     139.437us      46.479us       0.000us         0.00%     543.969us     181.323us             3
+          aten::_scaled_dot_product_efficient_attention         0.80%      18.160us         5.38%     121.436us      40.479us       0.000us         0.00%     543.969us     181.323us             3
+                     aten::_efficient_attention_forward         1.26%      28.484us         3.53%      79.573us      26.524us     543.969us        86.74%     543.969us     181.323us             3
+fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem...         0.00%       0.000us         0.00%       0.000us       0.000us     543.969us        86.74%     543.969us     181.323us             3
+                                       aten::contiguous         0.34%       7.591us        72.87%       1.644ms     182.689us       0.000us         0.00%      92.799us      10.311us             9
+                                            aten::clone         1.02%      22.973us        72.53%       1.637ms     181.846us       0.000us         0.00%      92.799us      10.311us             9
+                                            aten::copy_         2.84%      64.004us        69.28%       1.563ms     173.686us      83.135us        13.26%      92.799us      10.311us             9
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      83.135us        13.26%      83.135us       9.237us             9
+                                Activity Buffer Request        63.58%       1.435ms        63.58%       1.435ms       1.435ms       9.664us         1.54%       9.664us       9.664us             1
+                                        aten::transpose         2.42%      54.684us         3.15%      71.104us       2.963us       0.000us         0.00%       0.000us       0.000us            24
+                                       aten::as_strided         0.73%      16.420us         0.73%      16.420us       0.684us       0.000us         0.00%       0.000us       0.000us            24
+                                       aten::empty_like         0.53%      12.038us         2.24%      50.461us       5.607us       0.000us         0.00%       0.000us       0.000us             9
+                                            aten::empty         2.78%      62.772us         2.78%      62.772us       2.989us       0.000us         0.00%       0.000us       0.000us            21
+                                       cudaLaunchKernel         3.80%      85.752us         3.80%      85.752us       7.146us       0.000us         0.00%       0.000us       0.000us            12
+                                  cudaStreamIsCapturing         0.10%       2.260us         0.10%       2.260us       0.753us       0.000us         0.00%       0.000us       0.000us             3
+                                   cudaFuncSetAttribute         0.15%       3.330us         0.15%       3.330us       1.110us       0.000us         0.00%       0.000us       0.000us             3
+                                  cudaDeviceSynchronize         8.02%     181.009us         8.02%     181.009us     181.009us       0.000us         0.00%       0.000us       0.000us             1
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+Self CPU time total: 2.256ms
+Self CUDA time total: 627.104us
+======================================================================
+PROFILE TRACE: torch_mem_eff | flux_L384
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                          torch_mem_eff         0.00%       0.000us         0.00%       0.000us       0.000us     762.814us       117.08%     762.814us     762.814us             1
+                                          torch_mem_eff        10.94%     270.925us        93.63%       2.319ms       2.319ms       0.000us         0.00%     663.068us     663.068us             1
+                     aten::scaled_dot_product_attention         0.75%      18.610us         6.03%     149.368us      49.789us       0.000us         0.00%     560.285us     186.762us             3
+          aten::_scaled_dot_product_efficient_attention         0.84%      20.750us         5.28%     130.758us      43.586us       0.000us         0.00%     560.285us     186.762us             3
+                     aten::_efficient_attention_forward         1.24%      30.680us         3.47%      85.933us      28.644us     560.285us        85.99%     560.285us     186.762us             3
+fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem...         0.00%       0.000us         0.00%       0.000us       0.000us     560.285us        85.99%     560.285us     186.762us             3
+                                       aten::contiguous         0.34%       8.310us        74.76%       1.851ms     205.718us       0.000us         0.00%     102.783us      11.420us             9
+                                            aten::clone         0.93%      23.120us        74.43%       1.843ms     204.794us       0.000us         0.00%     102.783us      11.420us             9
+                                            aten::copy_         2.76%      68.243us        71.46%       1.770ms     196.615us      91.263us        14.01%     102.783us      11.420us             9
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      91.263us        14.01%      91.263us      10.140us             9
+                                Activity Buffer Request        57.69%       1.429ms        57.69%       1.429ms       1.429ms      11.520us         1.77%      11.520us      11.520us             1
+                                        aten::transpose         2.18%      53.884us         2.86%      70.837us       2.952us       0.000us         0.00%       0.000us       0.000us            24
+                                       aten::as_strided         0.68%      16.953us         0.68%      16.953us       0.706us       0.000us         0.00%       0.000us       0.000us            24
+                                       aten::empty_like         0.46%      11.381us         2.04%      50.492us       5.610us       0.000us         0.00%       0.000us       0.000us             9
+                                            aten::empty         2.62%      64.842us         2.62%      64.842us       3.088us       0.000us         0.00%       0.000us       0.000us            21
+                                       cudaLaunchKernel        11.97%     296.414us        11.97%     296.414us      24.701us       0.000us         0.00%       0.000us       0.000us            12
+                                  cudaStreamIsCapturing         0.10%       2.540us         0.10%       2.540us       0.847us       0.000us         0.00%       0.000us       0.000us             3
+                                   cudaFuncSetAttribute         0.13%       3.261us         0.13%       3.261us       1.087us       0.000us         0.00%       0.000us       0.000us             3
+                                  cudaDeviceSynchronize         6.37%     157.857us         6.37%     157.857us     157.857us       0.000us         0.00%       0.000us       0.000us             1
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+Self CPU time total: 2.476ms
+Self CUDA time total: 651.548us
+======================================================================
+PROFILE TRACE: torch_mem_eff | flux_L448
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                          torch_mem_eff         0.00%       0.000us         0.00%       0.000us       0.000us     811.582us       115.69%     811.582us     811.582us             1
+                                          torch_mem_eff        10.28%     258.922us        90.15%       2.271ms       2.271ms       0.000us         0.00%     712.095us     712.095us             1
+                     aten::scaled_dot_product_attention         0.74%      18.760us         5.47%     137.886us      45.962us       0.000us         0.00%     611.487us     203.829us             3
+          aten::_scaled_dot_product_efficient_attention         0.72%      18.189us         4.73%     119.126us      39.709us       0.000us         0.00%     611.487us     203.829us             3
+                     aten::_efficient_attention_forward         1.11%      28.033us         3.12%      78.704us      26.235us     611.487us        87.16%     611.487us     203.829us             3
+fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem...         0.00%       0.000us         0.00%       0.000us       0.000us     611.487us        87.16%     611.487us     203.829us             3
+                                       aten::contiguous         0.29%       7.191us        72.68%       1.831ms     203.401us       0.000us         0.00%     100.608us      11.179us             9
+                                            aten::clone         0.89%      22.393us        72.40%       1.823ms     202.602us       0.000us         0.00%     100.608us      11.179us             9
+                                            aten::copy_         2.57%      64.604us        69.47%       1.750ms     194.423us      90.048us        12.84%     100.608us      11.179us             9
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      90.048us        12.84%      90.048us      10.005us             9
+                                Activity Buffer Request        58.13%       1.464ms        58.13%       1.464ms       1.464ms      10.560us         1.51%      10.560us      10.560us             1
+                                        aten::transpose         1.95%      49.033us         2.60%      65.375us       2.724us       0.000us         0.00%       0.000us       0.000us            24
+                                       aten::as_strided         0.65%      16.342us         0.65%      16.342us       0.681us       0.000us         0.00%       0.000us       0.000us            24
+                                       aten::empty_like         0.51%      12.912us         2.03%      51.223us       5.691us       0.000us         0.00%       0.000us       0.000us             9
+                                            aten::empty         2.50%      62.890us         2.50%      62.890us       2.995us       0.000us         0.00%       0.000us       0.000us            21
+                                       cudaLaunchKernel         9.59%     241.441us         9.59%     241.441us      20.120us       0.000us         0.00%       0.000us       0.000us            12
+                                  cudaStreamIsCapturing         0.09%       2.220us         0.09%       2.220us       0.740us       0.000us         0.00%       0.000us       0.000us             3
+                                   cudaFuncSetAttribute         0.14%       3.650us         0.14%       3.650us       1.217us       0.000us         0.00%       0.000us       0.000us             3
+                                  cudaDeviceSynchronize         9.85%     248.062us         9.85%     248.062us     248.062us       0.000us         0.00%       0.000us       0.000us             1
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+Self CPU time total: 2.519ms
+Self CUDA time total: 701.535us
+======================================================================
+PROFILE TRACE: torch_mem_eff | flux_L512
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                          torch_mem_eff         0.00%       0.000us         0.00%       0.000us       0.000us     955.976us       112.33%     955.976us     955.976us             1
+                                          torch_mem_eff         9.37%     248.255us        85.34%       2.262ms       2.262ms       0.000us         0.00%     865.703us     865.703us             1
+                     aten::scaled_dot_product_attention         0.68%      17.990us         5.29%     140.316us      46.772us       0.000us         0.00%     738.854us     246.285us             3
+          aten::_scaled_dot_product_efficient_attention         0.72%      19.111us         4.61%     122.326us      40.775us       0.000us         0.00%     738.854us     246.285us             3
+                     aten::_efficient_attention_forward         1.10%      29.141us         2.98%      78.926us      26.309us     738.854us        86.81%     738.854us     246.285us             3
+fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem...         0.00%       0.000us         0.00%       0.000us       0.000us     738.854us        86.81%     738.854us     246.285us             3
+                                       aten::contiguous         0.28%       7.521us        68.87%       1.825ms     202.832us       0.000us         0.00%     126.849us      14.094us             9
+                                            aten::clone         0.86%      22.848us        68.58%       1.818ms     201.996us       0.000us         0.00%     126.849us      14.094us             9
+                                            aten::copy_         2.53%      66.983us        65.79%       1.744ms     193.757us     112.225us        13.19%     126.849us      14.094us             9
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     112.225us        13.19%     112.225us      12.469us             9
+                                Activity Buffer Request        55.19%       1.463ms        55.19%       1.463ms       1.463ms      14.624us         1.72%      14.624us      14.624us             1
+                                        aten::transpose         2.08%      55.231us         2.73%      72.342us       3.014us       0.000us         0.00%       0.000us       0.000us            24
+                                       aten::as_strided         0.65%      17.111us         0.65%      17.111us       0.713us       0.000us         0.00%       0.000us       0.000us            24
+                                       aten::empty_like         0.44%      11.730us         1.94%      51.302us       5.700us       0.000us         0.00%       0.000us       0.000us             9
+                                            aten::empty         2.40%      63.653us         2.40%      63.653us       3.031us       0.000us         0.00%       0.000us       0.000us            21
+                                       cudaLaunchKernel         8.85%     234.503us         8.85%     234.503us      19.542us       0.000us         0.00%       0.000us       0.000us            12
+                                  cudaStreamIsCapturing         0.08%       2.150us         0.08%       2.150us       0.717us       0.000us         0.00%       0.000us       0.000us             3
+                                   cudaFuncSetAttribute         0.11%       2.981us         0.11%       2.981us       0.994us       0.000us         0.00%       0.000us       0.000us             3
+                                  cudaDeviceSynchronize        14.66%     388.669us        14.66%     388.669us     388.669us       0.000us         0.00%       0.000us       0.000us             1
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+Self CPU time total: 2.651ms
+Self CUDA time total: 851.079us
+impl                     wl                  p50(ms)  ok
+torch_mem_eff            flux_L128              0.23  True
+torch_mem_eff            flux_L256              0.26  True
+torch_mem_eff            flux_L320              0.28  True
+torch_mem_eff            flux_L384              0.28  True
+torch_mem_eff            flux_L448              0.30  True
+torch_mem_eff            flux_L512              0.34  True
+</pre></div>
+<div class="cell-artifacts">
+<h4>Artifacts:</h4>
+<a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>
 </div>
 </div>
 </div>

flash_attn/impls/sage_attention.html CHANGED Viewed

@@ -706,6 +706,29 @@
             white-space: pre-wrap;
             color: var(--text-primary);
         }
         .cell-stderr {
             background: var(--bg-error);
             border-left: 2px solid var(--border-error);
@@ -3556,7 +3579,12 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
                 if(output){
                     output.classList.remove('output-stale');
                     let html='';
-                    if(data.stdout) html+='<div class="cell-stdout">'+escapeHtml(data.stdout)+'</div>';
                     console.log('UV Logs:', data);
                     if(data.stderr) {
                         // Split UV logs from regular stderr
@@ -3678,27 +3706,27 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
             }
         }
-        // Live reload functionality (robust SSE handling)
-        (function(){
-            if (!('EventSource' in window)) {
-                console.warn('SSE not supported in this browser');
-                return;
-            }
-            let source = new EventSource('/events');
-            let isOpen = false;
-            source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
-            source.onmessage = function(e){
-                const msg=(e.data||'').trim(); if(!msg) return;
-                console.log('SSE message:', msg);
-                if (msg==='reload' || msg==='incremental') { location.reload(); }
-                // Ignore 'loading' to avoid premature reload loops
-            };
-            source.onerror = function(e){
-                // Let EventSource auto-reconnect instead of forcing a reload
-                if (isOpen) console.warn('SSE error after open, retrying...', e);
-            };
-            window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
-        })();
         document.addEventListener('DOMContentLoaded', function() {
@@ -3829,21 +3857,21 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
     <div class="system-info">
         <div class="system-info-header">Generated on:</div>
         <div class="system-info-content">
-            Linux x86_64 | Linux-5.10.244-240.965.amzn2.x86_64-x86_64-with-glibc2.35
         </div>
     </div>
     <div class="main-content">
         <h1>SageAttention Implementation</h1>
 <h2>SageAttention Benchmark (INT8 Quantized)</h2>
-<div class="cell cell-failed" id="cell-benchmark">
 <div class="cell-header">
 <span class="collapse-indicators">
 <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
- <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> |
-Cell: benchmark | 0.05s | FAILED
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3855,14 +3883,14 @@ Cell: benchmark | 0.05s | FAILED
 <span class="c1"># requires-python = &quot;&gt;=3.10&quot;</span>
 <span class="c1"># dependencies = [</span>
 <span class="c1">#     &quot;numpy&quot;,</span>
-<span class="c1">#     &quot;torch&quot;,</span>
 <span class="c1">#     &quot;kernels&quot;,</span>
 <span class="c1">#     &quot;kernels-benchmark-tools&quot;,</span>
 <span class="c1">#     &quot;sageattention&quot;,</span>
 <span class="c1"># ]</span>
 <span class="c1">#</span>
 <span class="c1"># [tool.uv.sources]</span>
-<span class="c1"># kernels-benchmark-tools = { path = &quot;/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools&quot;, editable = true }</span>
 <span class="c1"># ///</span>
 <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
 <span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
@@ -3937,9 +3965,80 @@ Cell: benchmark | 0.05s | FAILED
 </div>
 </div>
 <div id="output-benchmark" class="cell-output">
-<div class="cell-stderr">  × Failed to resolve script requirement
-  ╰─▶ Distribution not found at:
-      file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
 </div>
 </div>
 </div>

             white-space: pre-wrap;
             color: var(--text-primary);
         }
+        .cell-stdout {
+            background: var(--bg-tertiary);
+            padding: 0.75rem;
+            border-radius: 1px;
+            font-family: inherit;
+            font-size: 0.9rem;
+            color: var(--text-primary);
+            /* key bits */
+            overflow: auto;          /* show scrollbars when needed */
+            max-width: 100%;         /* respects whatever layout width you give it */
+        }
+        .cell-stdout .stdout-text {
+            margin: 0;               /* reset pre default margin */
+            white-space: pre;        /* keep line breaks, NO wrapping */
+            display: inline-block;   /* shrink-to-content */
+            min-width: max-content;  /* allow very long lines to define intrinsic width */
+            font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace;
+            tab-size: 2;
+        }
         .cell-stderr {
             background: var(--bg-error);
             border-left: 2px solid var(--border-error);
                 if(output){
                     output.classList.remove('output-stale');
                     let html='';
+                    if (data.stdout) {
+                    html += '<div class="cell-stdout"><pre class="stdout-text">'
+                        + escapeHtml(data.stdout)
+                        + '</pre></div>';
+                    }
                     console.log('UV Logs:', data);
                     if(data.stderr) {
                         // Split UV logs from regular stderr
             }
         }
+        // // Live reload functionality (robust SSE handling)
+        // (function(){
+        //     if (!('EventSource' in window)) {
+        //         console.warn('SSE not supported in this browser');
+        //         return;
+        //     }
+        //     let source = new EventSource('/events');
+        //     let isOpen = false;
+        //     source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
+        //     source.onmessage = function(e){
+        //         const msg=(e.data||'').trim(); if(!msg) return;
+        //         console.log('SSE message:', msg);
+        //         if (msg==='reload' || msg==='incremental') { location.reload(); }
+        //         // Ignore 'loading' to avoid premature reload loops
+        //     };
+        //     source.onerror = function(e){
+        //         // Let EventSource auto-reconnect instead of forcing a reload
+        //         if (isOpen) console.warn('SSE error after open, retrying...', e);
+        //     };
+        //     window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
+        // })();
         document.addEventListener('DOMContentLoaded', function() {
     <div class="system-info">
         <div class="system-info-header">Generated on:</div>
         <div class="system-info-content">
+            Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
         </div>
     </div>
     <div class="main-content">
         <h1>SageAttention Implementation</h1>
 <h2>SageAttention Benchmark (INT8 Quantized)</h2>
+<div class="cell" id="cell-benchmark">
 <div class="cell-header">
 <span class="collapse-indicators">
 <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
+ <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
 </span> |
+Cell: benchmark | 34.80s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 <span class="c1"># requires-python = &quot;&gt;=3.10&quot;</span>
 <span class="c1"># dependencies = [</span>
 <span class="c1">#     &quot;numpy&quot;,</span>
+<span class="c1">#     &quot;torch==2.8.0&quot;,</span>
 <span class="c1">#     &quot;kernels&quot;,</span>
 <span class="c1">#     &quot;kernels-benchmark-tools&quot;,</span>
 <span class="c1">#     &quot;sageattention&quot;,</span>
 <span class="c1"># ]</span>
 <span class="c1">#</span>
 <span class="c1"># [tool.uv.sources]</span>
+<span class="c1"># kernels-benchmark-tools = { path = &quot;../../../../../tools&quot;, editable = true }</span>
 <span class="c1"># ///</span>
 <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
 <span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
 </div>
 </div>
 <div id="output-benchmark" class="cell-output">
+<div class="cell-stdout"><pre class="stdout-text">impl                     wl                  p50(ms)  ok
+sage_int8_fp16           flux_L128             FAIL  False
+  Error: module &#x27;sage_attention_c88aae76123df82b&#x27; has no attribute &#x27;fwd&#x27;
+sage_int8_fp16           flux_L256             FAIL  False
+  Error: module &#x27;sage_attention_c88aae76123df82b&#x27; has no attribute &#x27;fwd&#x27;
+sage_int8_fp16           flux_L320             FAIL  False
+  Error: module &#x27;sage_attention_c88aae76123df82b&#x27; has no attribute &#x27;fwd&#x27;
+sage_int8_fp16           flux_L384             FAIL  False
+  Error: module &#x27;sage_attention_c88aae76123df82b&#x27; has no attribute &#x27;fwd&#x27;
+sage_int8_fp16           flux_L448             FAIL  False
+  Error: module &#x27;sage_attention_c88aae76123df82b&#x27; has no attribute &#x27;fwd&#x27;
+sage_int8_fp16           flux_L512             FAIL  False
+  Error: module &#x27;sage_attention_c88aae76123df82b&#x27; has no attribute &#x27;fwd&#x27;
+</pre></div>
+<div class="uv-install-logs" id="uv-logs-benchmark">
+<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
+<div class="uv-logs-content" style="display: none;">
+   Building kernels-benchmark-tools @ file:///__w/kernels-benchmarks/kernels-benchmarks/tools
+Downloading nvidia-cusolver-cu12 (255.1MiB)
+Downloading nvidia-nvjitlink-cu12 (37.4MiB)
+Downloading sympy (6.0MiB)
+Downloading nvidia-cufile-cu12 (1.1MiB)
+Downloading nvidia-cublas-cu12 (566.8MiB)
+Downloading nvidia-cudnn-cu12 (674.0MiB)
+Downloading matplotlib (8.3MiB)
+Downloading nvidia-cufft-cu12 (184.2MiB)
+Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
+Downloading networkx (1.9MiB)
+Downloading nvidia-nccl-cu12 (307.4MiB)
+Downloading setuptools (1.1MiB)
+Downloading kiwisolver (1.4MiB)
+Downloading nvidia-cusparse-cu12 (274.9MiB)
+Downloading nvidia-curand-cu12 (60.7MiB)
+Downloading numpy (16.2MiB)
+Downloading nvidia-cusparselt-cu12 (273.9MiB)
+Downloading torch (846.9MiB)
+Downloading triton (148.3MiB)
+Downloading fonttools (4.7MiB)
+Downloading pillow (6.7MiB)
+Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
+Downloading hf-xet (3.0MiB)
+ Downloading nvidia-cufile-cu12
+ Downloading kiwisolver
+ Downloading hf-xet
+ Downloading setuptools
+ Downloading networkx
+ Downloading fonttools
+ Downloading pillow
+      Built kernels-benchmark-tools @ file:///__w/kernels-benchmarks/kernels-benchmarks/tools
+ Downloading nvidia-cuda-cupti-cu12
+ Downloading matplotlib
+ Downloading numpy
+ Downloading sympy
+ Downloading nvidia-nvjitlink-cu12
+ Downloading nvidia-curand-cu12
+ Downloading nvidia-cuda-nvrtc-cu12
+ Downloading triton
+ Downloading nvidia-cufft-cu12
+ Downloading nvidia-cusolver-cu12
+ Downloading nvidia-cusparse-cu12
+ Downloading nvidia-cusparselt-cu12
+ Downloading nvidia-nccl-cu12
+ Downloading nvidia-cublas-cu12
+ Downloading nvidia-cudnn-cu12
+ Downloading torch
+Installed 48 packages in 236ms
+</div>
+</div>
+<div class="cell-stderr">Fetching 11 files:   0%|          | 0/11 [00:00&lt;?, ?it/s]
+Fetching 11 files:  73%|███████▎  | 8/11 [00:00&lt;00:00,  9.16it/s]
+Fetching 11 files: 100%|██████████| 11/11 [00:00&lt;00:00, 12.59it/s]</div>
+<div class="cell-artifacts">
+<h4>Artifacts:</h4>
+<a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>
 </div>
 </div>
 </div>

flash_attn/impls/xformers.html CHANGED Viewed

@@ -706,6 +706,29 @@
             white-space: pre-wrap;
             color: var(--text-primary);
         }
         .cell-stderr {
             background: var(--bg-error);
             border-left: 2px solid var(--border-error);
@@ -3556,7 +3579,12 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
                 if(output){
                     output.classList.remove('output-stale');
                     let html='';
-                    if(data.stdout) html+='<div class="cell-stdout">'+escapeHtml(data.stdout)+'</div>';
                     console.log('UV Logs:', data);
                     if(data.stderr) {
                         // Split UV logs from regular stderr
@@ -3678,27 +3706,27 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
             }
         }
-        // Live reload functionality (robust SSE handling)
-        (function(){
-            if (!('EventSource' in window)) {
-                console.warn('SSE not supported in this browser');
-                return;
-            }
-            let source = new EventSource('/events');
-            let isOpen = false;
-            source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
-            source.onmessage = function(e){
-                const msg=(e.data||'').trim(); if(!msg) return;
-                console.log('SSE message:', msg);
-                if (msg==='reload' || msg==='incremental') { location.reload(); }
-                // Ignore 'loading' to avoid premature reload loops
-            };
-            source.onerror = function(e){
-                // Let EventSource auto-reconnect instead of forcing a reload
-                if (isOpen) console.warn('SSE error after open, retrying...', e);
-            };
-            window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
-        })();
         document.addEventListener('DOMContentLoaded', function() {
@@ -3829,21 +3857,21 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
     <div class="system-info">
         <div class="system-info-header">Generated on:</div>
         <div class="system-info-content">
-            Linux x86_64 | Linux-5.10.244-240.965.amzn2.x86_64-x86_64-with-glibc2.35
         </div>
     </div>
     <div class="main-content">
         <h1>xFormers Memory Efficient Attention</h1>
 <h2>xFormers Benchmark</h2>
-<div class="cell cell-failed" id="cell-benchmark">
 <div class="cell-header">
 <span class="collapse-indicators">
 <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
- <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> |
-Cell: benchmark | 0.01s | FAILED
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3855,13 +3883,13 @@ Cell: benchmark | 0.01s | FAILED
 <span class="c1"># requires-python = &quot;&gt;=3.10&quot;</span>
 <span class="c1"># dependencies = [</span>
 <span class="c1">#     &quot;numpy&quot;,</span>
-<span class="c1">#     &quot;torch&quot;,</span>
 <span class="c1">#     &quot;kernels-benchmark-tools&quot;,</span>
 <span class="c1">#     &quot;xformers&quot;,</span>
 <span class="c1"># ]</span>
 <span class="c1">#</span>
 <span class="c1"># [tool.uv.sources]</span>
-<span class="c1"># kernels-benchmark-tools = { path = &quot;/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools&quot;, editable = true }</span>
 <span class="c1"># ///</span>
 <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
 <span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
@@ -3926,9 +3954,169 @@ Cell: benchmark | 0.01s | FAILED
 </div>
 </div>
 <div id="output-benchmark" class="cell-output">
-<div class="cell-stderr">  × Failed to resolve script requirement
-  ╰─▶ Distribution not found at:
-      file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
 </div>
 </div>
 </div>

             white-space: pre-wrap;
             color: var(--text-primary);
         }
+        .cell-stdout {
+            background: var(--bg-tertiary);
+            padding: 0.75rem;
+            border-radius: 1px;
+            font-family: inherit;
+            font-size: 0.9rem;
+            color: var(--text-primary);
+            /* key bits */
+            overflow: auto;          /* show scrollbars when needed */
+            max-width: 100%;         /* respects whatever layout width you give it */
+        }
+        .cell-stdout .stdout-text {
+            margin: 0;               /* reset pre default margin */
+            white-space: pre;        /* keep line breaks, NO wrapping */
+            display: inline-block;   /* shrink-to-content */
+            min-width: max-content;  /* allow very long lines to define intrinsic width */
+            font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace;
+            tab-size: 2;
+        }
         .cell-stderr {
             background: var(--bg-error);
             border-left: 2px solid var(--border-error);
                 if(output){
                     output.classList.remove('output-stale');
                     let html='';
+                    if (data.stdout) {
+                    html += '<div class="cell-stdout"><pre class="stdout-text">'
+                        + escapeHtml(data.stdout)
+                        + '</pre></div>';
+                    }
                     console.log('UV Logs:', data);
                     if(data.stderr) {
                         // Split UV logs from regular stderr
             }
         }
+        // // Live reload functionality (robust SSE handling)
+        // (function(){
+        //     if (!('EventSource' in window)) {
+        //         console.warn('SSE not supported in this browser');
+        //         return;
+        //     }
+        //     let source = new EventSource('/events');
+        //     let isOpen = false;
+        //     source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
+        //     source.onmessage = function(e){
+        //         const msg=(e.data||'').trim(); if(!msg) return;
+        //         console.log('SSE message:', msg);
+        //         if (msg==='reload' || msg==='incremental') { location.reload(); }
+        //         // Ignore 'loading' to avoid premature reload loops
+        //     };
+        //     source.onerror = function(e){
+        //         // Let EventSource auto-reconnect instead of forcing a reload
+        //         if (isOpen) console.warn('SSE error after open, retrying...', e);
+        //     };
+        //     window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
+        // })();
         document.addEventListener('DOMContentLoaded', function() {
     <div class="system-info">
         <div class="system-info-header">Generated on:</div>
         <div class="system-info-content">
+            Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
         </div>
     </div>
     <div class="main-content">
         <h1>xFormers Memory Efficient Attention</h1>
 <h2>xFormers Benchmark</h2>
+<div class="cell" id="cell-benchmark">
 <div class="cell-header">
 <span class="collapse-indicators">
 <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
+ <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
 </span> |
+Cell: benchmark | 4.83s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 <span class="c1"># requires-python = &quot;&gt;=3.10&quot;</span>
 <span class="c1"># dependencies = [</span>
 <span class="c1">#     &quot;numpy&quot;,</span>
+<span class="c1">#     &quot;torch==2.8.0&quot;,</span>
 <span class="c1">#     &quot;kernels-benchmark-tools&quot;,</span>
 <span class="c1">#     &quot;xformers&quot;,</span>
 <span class="c1"># ]</span>
 <span class="c1">#</span>
 <span class="c1"># [tool.uv.sources]</span>
+<span class="c1"># kernels-benchmark-tools = { path = &quot;../../../../../tools&quot;, editable = true }</span>
 <span class="c1"># ///</span>
 <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
 <span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
 </div>
 </div>
 <div id="output-benchmark" class="cell-output">
+<div class="cell-stdout"><pre class="stdout-text">
+======================================================================
+PROFILE TRACE: xformers_meff | flux_L128
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                          xformers_meff         0.00%       0.000us         0.00%       0.000us       0.000us     506.718us       193.09%     506.718us     506.718us             1
+                                          xformers_meff        20.33%     479.463us        99.70%       2.351ms       2.351ms       0.000us         0.00%     351.872us     351.872us             1
+                             xformers_flash3::flash_fwd         8.78%     206.960us        77.92%       1.837ms     612.487us       0.000us         0.00%     351.872us     117.291us             3
+                                      flash_attn_3::fwd         3.33%      78.433us        69.14%       1.631ms     543.500us     262.432us       100.00%     351.872us     117.291us             3
+void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us     262.432us       100.00%     262.432us      87.477us             3
+                                Activity Buffer Request        61.85%       1.459ms        61.85%       1.459ms       1.459ms      89.440us        34.08%      89.440us      89.440us             1
+                                            aten::empty         1.44%      34.032us         1.44%      34.032us       5.672us       0.000us         0.00%       0.000us       0.000us             6
+                                   cudaFuncSetAttribute         0.62%      14.682us         0.62%      14.682us       4.894us       0.000us         0.00%       0.000us       0.000us             3
+                                       cudaLaunchKernel         1.89%      44.672us         1.89%      44.672us      14.891us       0.000us         0.00%       0.000us       0.000us             3
+                                          aten::reshape         0.50%      11.821us         1.45%      34.232us       5.705us       0.000us         0.00%       0.000us       0.000us             6
+                                             aten::view         0.95%      22.411us         0.95%      22.411us       3.735us       0.000us         0.00%       0.000us       0.000us             6
+                                  cudaDeviceSynchronize         0.30%       7.110us         0.30%       7.110us       7.110us       0.000us         0.00%       0.000us       0.000us             1
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+Self CPU time total: 2.358ms
+Self CUDA time total: 262.432us
+======================================================================
+PROFILE TRACE: xformers_meff | flux_L256
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                          xformers_meff         0.00%       0.000us         0.00%       0.000us       0.000us     457.756us       155.59%     457.756us     457.756us             1
+                                          xformers_meff        14.84%     310.507us        99.07%       2.072ms       2.072ms       0.000us         0.00%     391.132us     391.132us             1
+                             xformers_flash3::flash_fwd         7.41%     154.907us        83.06%       1.737ms     579.115us       0.000us         0.00%     391.132us     130.377us             3
+                                      flash_attn_3::fwd         2.73%      57.112us        75.65%       1.582ms     527.479us     294.205us       100.00%     391.132us     130.377us             3
+void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us     294.205us       100.00%     294.205us      98.068us             3
+                                Activity Buffer Request        69.53%       1.454ms        69.53%       1.454ms       1.454ms      96.927us        32.95%      96.927us      96.927us             1
+                                            aten::empty         1.38%      28.932us         1.38%      28.932us       4.822us       0.000us         0.00%       0.000us       0.000us             6
+                                   cudaFuncSetAttribute         0.38%       7.960us         0.38%       7.960us       2.653us       0.000us         0.00%       0.000us       0.000us             3
+                                       cudaLaunchKernel         1.63%      34.022us         1.63%      34.022us      11.341us       0.000us         0.00%       0.000us       0.000us             3
+                                          aten::reshape         0.48%      10.060us         1.17%      24.410us       4.068us       0.000us         0.00%       0.000us       0.000us             6
+                                             aten::view         0.69%      14.350us         0.69%      14.350us       2.392us       0.000us         0.00%       0.000us       0.000us             6
+                                  cudaDeviceSynchronize         0.93%      19.421us         0.93%      19.421us      19.421us       0.000us         0.00%       0.000us       0.000us             1
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+Self CPU time total: 2.092ms
+Self CUDA time total: 294.205us
+======================================================================
+PROFILE TRACE: xformers_meff | flux_L320
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                          xformers_meff         0.00%       0.000us         0.00%       0.000us       0.000us     455.327us       140.30%     455.327us     455.327us             1
+                                          xformers_meff        14.70%     303.895us        98.43%       2.034ms       2.034ms       0.000us         0.00%     429.791us     429.791us             1
+                             xformers_flash3::flash_fwd         7.05%     145.707us        82.60%       1.707ms     568.998us       0.000us         0.00%     429.791us     143.264us             3
+                                      flash_attn_3::fwd         2.62%      54.152us        75.55%       1.561ms     520.429us     324.543us       100.00%     429.791us     143.264us             3
+void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us     324.543us       100.00%     324.543us     108.181us             3
+                                Activity Buffer Request        69.54%       1.437ms        69.54%       1.437ms       1.437ms     105.248us        32.43%     105.248us     105.248us             1
+                                            aten::empty         1.47%      30.342us         1.47%      30.342us       5.057us       0.000us         0.00%       0.000us       0.000us             6
+                                   cudaFuncSetAttribute         0.27%       5.580us         0.27%       5.580us       1.860us       0.000us         0.00%       0.000us       0.000us             3
+                                       cudaLaunchKernel         1.65%      34.132us         1.65%      34.132us      11.377us       0.000us         0.00%       0.000us       0.000us             3
+                                          aten::reshape         0.42%       8.741us         1.13%      23.401us       3.900us       0.000us         0.00%       0.000us       0.000us             6
+                                             aten::view         0.71%      14.660us         0.71%      14.660us       2.443us       0.000us         0.00%       0.000us       0.000us             6
+                                  cudaDeviceSynchronize         1.57%      32.391us         1.57%      32.391us      32.391us       0.000us         0.00%       0.000us       0.000us             1
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+Self CPU time total: 2.067ms
+Self CUDA time total: 324.543us
+======================================================================
+PROFILE TRACE: xformers_meff | flux_L384
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                          xformers_meff         0.00%       0.000us         0.00%       0.000us       0.000us     460.189us       141.34%     460.189us     460.189us             1
+                                          xformers_meff        13.29%     304.067us        98.75%       2.259ms       2.259ms       0.000us         0.00%     433.468us     433.468us             1
+                             xformers_flash3::flash_fwd         6.63%     151.806us        84.43%       1.932ms     643.925us       0.000us         0.00%     433.468us     144.489us             3
+                                      flash_attn_3::fwd         2.38%      54.492us        77.79%       1.780ms     593.323us     325.597us       100.00%     433.468us     144.489us             3
+void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us     325.597us       100.00%     325.597us     108.532us             3
+                                Activity Buffer Request        63.32%       1.449ms        63.32%       1.449ms       1.449ms     107.871us        33.13%     107.871us     107.871us             1
+                                            aten::empty         1.26%      28.813us         1.26%      28.813us       4.802us       0.000us         0.00%       0.000us       0.000us             6
+                                   cudaFuncSetAttribute         0.27%       6.140us         0.27%       6.140us       2.047us       0.000us         0.00%       0.000us       0.000us             3
+                                       cudaLaunchKernel        10.56%     241.573us        10.56%     241.573us      80.524us       0.000us         0.00%       0.000us       0.000us             3
+                                          aten::reshape         0.41%       9.348us         1.03%      23.589us       3.931us       0.000us         0.00%       0.000us       0.000us             6
+                                             aten::view         0.62%      14.241us         0.62%      14.241us       2.374us       0.000us         0.00%       0.000us       0.000us             6
+                                  cudaDeviceSynchronize         1.25%      28.691us         1.25%      28.691us      28.691us       0.000us         0.00%       0.000us       0.000us             1
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+Self CPU time total: 2.288ms
+Self CUDA time total: 325.597us
+======================================================================
+PROFILE TRACE: xformers_meff | flux_L448
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                          xformers_meff        14.32%     335.208us        96.41%       2.256ms       2.256ms       0.000us         0.00%     650.207us     650.207us             1
+                             xformers_flash3::flash_fwd         6.57%     153.746us        81.05%       1.897ms     632.294us       0.000us         0.00%     650.207us     216.736us             3
+                                      flash_attn_3::fwd         2.39%      56.024us        74.48%       1.743ms     581.045us     487.359us       100.00%     650.207us     216.736us             3
+                                          xformers_meff         0.00%       0.000us         0.00%       0.000us       0.000us     545.022us       111.83%     545.022us     545.022us             1
+void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us     487.359us       100.00%     487.359us     162.453us             3
+                                Activity Buffer Request        62.65%       1.466ms        62.65%       1.466ms       1.466ms     162.848us        33.41%     162.848us     162.848us             1
+                                            aten::empty         1.29%      30.110us         1.29%      30.110us       5.018us       0.000us         0.00%       0.000us       0.000us             6
+                                   cudaFuncSetAttribute         0.25%       5.800us         0.25%       5.800us       1.933us       0.000us         0.00%       0.000us       0.000us             3
+                                       cudaLaunchKernel         7.91%     185.030us         7.91%     185.030us      61.677us       0.000us         0.00%       0.000us       0.000us             3
+                                          aten::reshape         0.42%       9.770us         1.04%      24.390us       4.065us       0.000us         0.00%       0.000us       0.000us             6
+                                             aten::view         0.62%      14.620us         0.62%      14.620us       2.437us       0.000us         0.00%       0.000us       0.000us             6
+                                  cudaDeviceSynchronize         3.59%      83.934us         3.59%      83.934us      83.934us       0.000us         0.00%       0.000us       0.000us             1
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+Self CPU time total: 2.340ms
+Self CUDA time total: 487.359us
+======================================================================
+PROFILE TRACE: xformers_meff | flux_L512
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+                                          xformers_meff        13.07%     298.846us        95.47%       2.183ms       2.183ms       0.000us         0.00%     676.610us     676.610us             1
+                             xformers_flash3::flash_fwd         6.50%     148.626us        81.42%       1.862ms     620.693us       0.000us         0.00%     676.610us     225.537us             3
+                                      flash_attn_3::fwd         2.33%      53.191us        74.93%       1.713ms     571.151us     505.889us       100.00%     676.610us     225.537us             3
+                                          xformers_meff         0.00%       0.000us         0.00%       0.000us       0.000us     520.769us       102.94%     520.769us     520.769us             1
+void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us     505.889us       100.00%     505.889us     168.630us             3
+                                Activity Buffer Request        63.62%       1.455ms        63.62%       1.455ms       1.455ms     170.721us        33.75%     170.721us     170.721us             1
+                                            aten::empty         1.23%      28.092us         1.23%      28.092us       4.682us       0.000us         0.00%       0.000us       0.000us             6
+                                   cudaFuncSetAttribute         0.25%       5.790us         0.25%       5.790us       1.930us       0.000us         0.00%       0.000us       0.000us             3
+                                       cudaLaunchKernel         7.50%     171.540us         7.50%     171.540us      57.180us       0.000us         0.00%       0.000us       0.000us             3
+                                          aten::reshape         0.38%       8.590us         0.98%      22.470us       3.745us       0.000us         0.00%       0.000us       0.000us             6
+                                             aten::view         0.61%      13.880us         0.61%      13.880us       2.313us       0.000us         0.00%       0.000us       0.000us             6
+                                  cudaDeviceSynchronize         4.53%     103.496us         4.53%     103.496us     103.496us       0.000us         0.00%       0.000us       0.000us             1
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
+Self CPU time total: 2.287ms
+Self CUDA time total: 505.889us
+impl                     wl                  p50(ms)  ok
+xformers_meff            flux_L128              0.20  True
+xformers_meff            flux_L256              0.21  True
+xformers_meff            flux_L320              0.22  True
+xformers_meff            flux_L384              0.22  True
+xformers_meff            flux_L448              0.28  True
+xformers_meff            flux_L512              0.27  True
+</pre></div>
+<div class="uv-install-logs" id="uv-logs-benchmark">
+<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
+<div class="uv-logs-content" style="display: none;">
+Downloading xformers (111.8MiB)
+ Downloading xformers
+Installed 1 package in 14ms
+</div>
+</div>
+<div class="cell-artifacts">
+<h4>Artifacts:</h4>
+<a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>
 </div>
 </div>
 </div>

flash_attn/results/artifacts/combine/latency.svg CHANGED Viewed

Git LFS Details

SHA256: af1280c87fa60ce034a98afb4f52eca9686cbe35c7ed7a9fc31248f6d6c05ea2
Pointer size: 130 Bytes
Size of remote file: 29.8 kB

Git LFS Details

SHA256: dcf3186873eba7261121e895010b1119e477b4cfef20b846ea699f8779951f5d
Pointer size: 130 Bytes
Size of remote file: 27 kB

flash_attn/results/cells/combine.py CHANGED Viewed

@@ -1,319 +1,69 @@
 # /// script
 # requires-python = ">=3.10"
-# dependencies = [
-#     "numpy",
-#     "torch",
-#     "kernels-benchmark-tools",
-#     "matplotlib",
-# ]
-#
 # [tool.uv.sources]
-# kernels-benchmark-tools = { git = "https://github.com/drbh/kernels-benchmark-tools.git", branch = "main" }
 # ///
-import os
-import sys
-from pathlib import Path
-import json
-import torch  # noqa: F401  # imported because upstream may expect torch to be importable
-import kernels_benchmark_tools as kbt
-# --- Matplotlib setup and helpers ------------------------------------------------
-import matplotlib as mpl
-import matplotlib.pyplot as plt
-import csv
-# Keep text as text (not paths) so CSS can style fonts, size, etc.
-mpl.rcParams["svg.fonttype"] = "none"
-# Make ids deterministic across builds
-mpl.rcParams["svg.hashsalt"] = "latency-benchmark-combined"
-# Avoid auto-closed figures interfering with our tagging
-mpl.rcParams["figure.autolayout"] = True
-# Make background transparent
-mpl.rcParams["figure.facecolor"] = "none"
-mpl.rcParams["axes.facecolor"] = "none"
-mpl.rcParams["savefig.facecolor"] = "none"
-mpl.rcParams["savefig.edgecolor"] = "none"
-def _slugify(s: str) -> str:
-    s = (s or "").strip().lower()
-    keep = []
-    for ch in s:
-        if ch.isalnum():
-            keep.append(ch)
-        elif ch in (" ", "-", "_", "/", ".", ":"):
-            keep.append("-")
-        else:
-            keep.append("")
-    out = "".join(keep)
-    while "--" in out:
-        out = out.replace("--", "-")
-    return out.strip("-") or "unnamed"
-def _tag_current_figure(default_series_prefix="series"):
-    """Attach SVG ids (gid) to key artists so they can be targeted from CSS."""
-    fig = plt.gcf()
-    if fig is None:
-        return
-    # Tag the figure itself
-    fig.set_gid("figure--latency")
-    for ax_idx, ax in enumerate(fig.get_axes(), start=1):
-        ax.set_gid(f"axes--{ax_idx}")
-        # Axis labels & title
-        if ax.get_title():
-            for t in ax.texts:
-                if t.get_text() == ax.get_title():
-                    t.set_gid("title--main")
-        if ax.xaxis and ax.xaxis.get_label():
-            ax.xaxis.label.set_gid("label--x")
-        if ax.yaxis and ax.yaxis.get_label():
-            ax.yaxis.label.set_gid("label--y")
-        # Gridlines
-        for i, gl in enumerate(ax.get_xgridlines(), start=1):
-            gl.set_gid(f"grid-x--{i}")
-        for i, gl in enumerate(ax.get_ygridlines(), start=1):
-            gl.set_gid(f"grid-y--{i}")
-        # Legend block & entries
-        leg = ax.get_legend()
-        if leg is not None:
-            leg.set_gid("legend")
-            for i, txt in enumerate(leg.get_texts(), start=1):
-                label_slug = _slugify(txt.get_text())
-                txt.set_gid(f"legend-label--{label_slug or i}")
-        # Series (lines, patches)
-        # Lines
-        line_seen = {}
-        for ln in getattr(ax, "lines", []):
-            raw_label = ln.get_label() or ""
-            # Matplotlib uses labels beginning with "_" for non-legendable items
-            label = raw_label if not raw_label.startswith("_") else f"{default_series_prefix}"
-            slug = _slugify(label)
-            line_seen[slug] = line_seen.get(slug, 0) + 1
-            suffix = "" if line_seen[slug] == 1 else f"-{line_seen[slug]}"
-            ln.set_gid(f"series--{slug}{suffix}")
-        # Patches (bars, areas)
-        patch_seen = {}
-        for pt in getattr(ax, "patches", []):
-            label = getattr(pt, "get_label", lambda: "")() or f"{default_series_prefix}"
-            if isinstance(label, str) and label.startswith("_"):
-                label = default_series_prefix
-            slug = _slugify(label)
-            patch_seen[slug] = patch_seen.get(slug, 0) + 1
-            suffix = "" if patch_seen[slug] == 1 else f"-{patch_seen[slug]}"
-            pt.set_gid(f"series--{slug}{suffix}")
-def _postprocess_svg_add_classes(svg_path: Path):
-    """Add convenient CSS classes alongside ids (e.g., class='series grid grid-x')."""
-    try:
-        import xml.etree.ElementTree as ET
-        ET.register_namespace("", "http://www.w3.org/2000/svg")
-        tree = ET.parse(svg_path)
-        root = tree.getroot()
-        for el in root.iter():
-            el_id = el.attrib.get("id", "")
-            if not el_id:
-                continue
-            cls = []
-            if el_id.startswith("figure--"):
-                cls.append("figure")
-            elif el_id.startswith("axes--"):
-                cls.append("axes")
-            elif el_id.startswith("grid-x--"):
-                cls += ["grid", "grid-x"]
-            elif el_id.startswith("grid-y--"):
-                cls += ["grid", "grid-y"]
-            elif el_id.startswith("legend"):
-                cls.append("legend")
-            elif el_id.startswith("label--x"):
-                cls.append("xlabel")
-            elif el_id.startswith("label--y"):
-                cls.append("ylabel")
-            elif el_id.startswith("title--"):
-                cls.append("title")
-            elif el_id.startswith("series--"):
-                cls.append("series")
-            if cls:
-                # Preserve any existing class (unlikely from Matplotlib)
-                existing = el.attrib.get("class", "")
-                el.set("class", (existing + " " + " ".join(cls)).strip())
-        tree.write(svg_path, encoding="utf-8", xml_declaration=True)
-    except Exception as e:
-        print(f"✗ SVG postprocess (classes) skipped: {e}")
-# Monkey-patch savefig to force SVG & ensure tagging occurs even if kbt.viz saves internally.
-_orig_savefig = plt.savefig
-def _savefig_svg(fname, *args, **kwargs):
-    # Always save as SVG at a stable path for the artifact system
-    out = Path("latency.svg")
-    kwargs["format"] = "svg"
-    # Ensure everything we care about has ids before export
-    _tag_current_figure()
-    res = _orig_savefig(out, *args, **kwargs)
-    # Add helpful CSS classes on top of ids
-    _postprocess_svg_add_classes(out)
-    print(f"✓ Combined visualization saved as {out}")
-    return res
-plt.savefig = _savefig_svg  # apply patch
-# Capture close calls in case kbt.viz() closes figures before we re-save
-_orig_close = plt.close
-_last_closed = {"fig": None}
-def _capture_close(arg=None):
-    try:
-        if hasattr(arg, "savefig"):  # looks like a Figure
-            _last_closed["fig"] = arg
-        else:
-            _last_closed["fig"] = plt.gcf()
-    finally:
-        return _orig_close(arg)
-plt.close = _capture_close
-# --- Locate benchmark artifacts --------------------------------------------------
-cache_dirs = {
-    "Flash (PyTorch SDPA)": os.environ.get('UVNOTE_FILE_FLASH_ATTENTION_BENCHMARK'),
-    "MemEff (PyTorch SDPA)": os.environ.get('UVNOTE_FILE_MEM_EFFICIENT_ATTENTION_BENCHMARK'),
-    "Flash Attn 2": os.environ.get('UVNOTE_FILE_FLASH_ATTN2_BENCHMARK'),
-    "xFormers": os.environ.get('UVNOTE_FILE_XFORMERS_BENCHMARK'),
-    "SageAttention": os.environ.get('UVNOTE_FILE_SAGE_ATTENTION_BENCHMARK'),
-    "Compiled (default)": os.environ.get('UVNOTE_FILE_COMPILED_VARIANTS_BENCHMARK_DEFAULT'),
-    "Compiled (max-autotune)": os.environ.get('UVNOTE_FILE_COMPILED_VARIANTS_BENCHMARK_MAX_AUTOTUNE'),
-    "HF Kernels Flash Attn": os.environ.get('UVNOTE_FILE_HF_KERNELS_FLASH_ATTN_BENCHMARK'),
-    "HF Kernels Flash Attn3": os.environ.get('UVNOTE_FILE_HF_KERNELS_FLASH_ATTN3_BENCHMARK'),
 }
-print("LOADING BENCHMARK DATA")
-for name, cache_dir in cache_dirs.items():
-    print(f"{name:30s}: {cache_dir}")
-print()
 file_mapping = {
-    "Flash (PyTorch SDPA)": "attn.jsonl",
-    "MemEff (PyTorch SDPA)": "attn.jsonl",
-    "Flash Attn 2": "attn.jsonl",
-    "xFormers": "attn.jsonl",
-    "SageAttention": "attn.jsonl",
     "Compiled (default)": "attn_default.jsonl",
     "Compiled (max-autotune)": "attn_max_autotune.jsonl",
-    "HF Kernels Flash Attn": "attn.jsonl",
-    "HF Kernels Flash Attn3": "attn.jsonl",
 }
 all_paths = []
-for name, cache_dir in cache_dirs.items():
     if cache_dir:
-        path = Path(cache_dir) / file_mapping[name]
         if path.exists() and path.stat().st_size > 0:
             all_paths.append(str(path))
             print(f"✓ Found {name}: {path}")
         else:
-            print(f"⊘ Empty/Missing {name}: {path}")
     else:
-        print(f"✗ No cache dir for {name}")
-print()
 if not all_paths:
     print("ERROR: No benchmark data files found!")
-    # restore patched functions before exiting
-    plt.savefig = _orig_savefig
-    plt.close = _orig_close
     sys.exit(1)
-# --- Summary + Visualization -----------------------------------------------------
-print("COMBINED BENCHMARK SUMMARY\n")
-kbt.summarize(all_paths)
-print("\nGENERATING COMBINED VISUALIZATION\n")
 try:
-    # If kbt.viz saves internally, our patched savefig ensures SVG gets written,
-    # and it will carry ids/classes for CSS styling.
-    kbt.viz(all_paths)
-    # Safety net: if kbt.viz didn't save, save now.
-    # if not Path("latency.svg").exists():
-    #     _tag_current_figure()
-    # plt.savefig("latency.svg")
-    plt.savefig("latency.svg")  # ensure saved with tagging
-    print("✓ SVG visualization ready: latency.svg!")
-except ImportError as e:
-    print(f"✗ Visualization requires matplotlib: {e}")
-except Exception as e:
-    print(f"✗ Visualization failed: {e}")
 finally:
-    # Clean up patches to avoid side effects in later cells
     plt.savefig = _orig_savefig
-    plt.close = _orig_close
-print()
-print("ANALYSIS COMPLETE")
-print(f"Total implementations analyzed: {len(all_paths)}")
-print(f"\nImplementations included:")
-for name, cache_dir in cache_dirs.items():
-    if cache_dir:
-        path = Path(cache_dir) / file_mapping[name]
-        if path.exists() and path.stat().st_size > 0:
-            print(f"  ✓ {name}")
-# Collect all benchmark data and export to CSV
-all_data = {}
-for name, cache_dir in cache_dirs.items():
-    if cache_dir:
-        path = Path(cache_dir) / file_mapping[name]
-        if path.exists() and path.stat().st_size > 0:
-            with open(path, 'r') as f:
-                records = [json.loads(line) for line in f]
-                all_data[name] = records
-# Export to CSV
-csv_path = Path("latency.csv")
-with open(csv_path, 'w', newline='') as csvfile:
-    writer = csv.writer(csvfile)
-    # Write header
-    header = ["Implementation", "Impl ID", "Workload", "Batch", "Seq Length", "Heads", "Head Dim", "Dtype",
-              "Mean (ms)", "P10 (ms)", "P50 (ms)", "P90 (ms)", "Reps",
-            #   "Compile (ms)",
-              "Peak Mem (MB)", "Backend", "Family"]
-    writer.writerow(header)
-    # Write data rows
-    for impl_name, records in all_data.items():
-        for record in records:
-            wl = record.get('wl', {})
-            lat = record.get('lat_ms', {})
-            tags = record.get('tags', {})
-            row = [
-                impl_name,
-                record.get('impl', ''),
-                wl.get('name', ''),
-                wl.get('batch', ''),
-                wl.get('seq_len', ''),
-                wl.get('heads', ''),
-                wl.get('head_dim', ''),
-                wl.get('dtype', ''),
-                lat.get('mean', ''),
-                lat.get('p10', ''),
-                lat.get('p50', ''),
-                lat.get('p90', ''),
-                lat.get('reps', ''),
-                # record.get('compile_ms', ''),
-                round(record.get('peak_bytes', 0) / 1024 / 1024, 2) if record.get('peak_bytes') else '',
-                tags.get('backend', ''),
-                tags.get('family', ''),
-            ]
-            writer.writerow(row)
-print(f"✓ CSV export complete: {csv_path}")
-print(f"Total implementations: {len(all_data)}")
-print(f"Total records: {sum(len(records) for records in all_data.values())}")

 # /// script
 # requires-python = ">=3.10"
+# dependencies = ["torch", "kernels-benchmark-tools", "matplotlib"]
 # [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
 # ///
+from kernels_benchmark_tools.core.visuals import generate_combined_results
+# Note: Flash attention has multiple implementations with different output files
+# Some use attn.jsonl, compiled variants use attn_default.jsonl and attn_max_autotune.jsonl
+cache_env_map = {
+    "Flash (PyTorch SDPA)": "UVNOTE_FILE_FLASH_ATTENTION_BENCHMARK",
+    "MemEff (PyTorch SDPA)": "UVNOTE_FILE_MEM_EFFICIENT_ATTENTION_BENCHMARK",
+    "xFormers": "UVNOTE_FILE_XFORMERS_BENCHMARK",
+    "Compiled (default)": "UVNOTE_FILE_COMPILED_VARIANTS_BENCHMARK_DEFAULT",
+    "Compiled (max-autotune)": "UVNOTE_FILE_COMPILED_VARIANTS_BENCHMARK_MAX_AUTOTUNE",
+    "HF Kernels Flash Attn": "UVNOTE_FILE_HF_KERNELS_FLASH_ATTN_BENCHMARK",
+    "HF Kernels Flash Attn3": "UVNOTE_FILE_HF_KERNELS_FLASH_ATTN3_BENCHMARK",
 }
+# For flash attention, we need custom file mapping
+import os
+from pathlib import Path
 file_mapping = {
     "Compiled (default)": "attn_default.jsonl",
     "Compiled (max-autotune)": "attn_max_autotune.jsonl",
 }
+# Collect paths with custom file names for compiled variants
 all_paths = []
+for name, env_var in cache_env_map.items():
+    cache_dir = os.environ.get(env_var)
     if cache_dir:
+        filename = file_mapping.get(name, "attn.jsonl")
+        path = Path(cache_dir) / filename
         if path.exists() and path.stat().st_size > 0:
             all_paths.append(str(path))
             print(f"✓ Found {name}: {path}")
         else:
+            print(f"⊘ Skipped {name}: {path}")
     else:
+        print(f"✗ Missing {name}")
 if not all_paths:
     print("ERROR: No benchmark data files found!")
+    import sys
     sys.exit(1)
+# Use the simplified visualization
+from kernels_benchmark_tools.core import tools
+from kernels_benchmark_tools.core.visuals import setup_svg_matplotlib, create_svg_with_tagging
+setup_svg_matplotlib()
+_orig_savefig, _orig_close = create_svg_with_tagging("latency.svg", "flash-attention")
 try:
+    print("\nCOMBINED BENCHMARK SUMMARY\n")
+    tools.summarize(all_paths)
+    print("\nGENERATING COMBINED VISUALIZATION\n")
+    tools.viz(all_paths)
+    import matplotlib.pyplot as plt
+    plt.savefig("latency.svg")
+    print("✓ SVG visualization ready!")
 finally:
     plt.savefig = _orig_savefig
+    plt.close = _orig_close

flash_attn/results/combined_results.html CHANGED Viewed

The diff for this file is too large to render. See raw diff

layer_norm/impls/artifacts/benchmark/ln.jsonl ADDED Viewed

	@@ -0,0 +1,8 @@

+{"ts": "2025-10-23T17:21:54Z", "run": "f94366cee8c44bb99e36bc03ca51ad49", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "llama_S512_D4096", "batch": 1, "seq_len": 512, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.039122000089264475, "p50": 0.04020200003651553, "p90": 0.04062199991494708, "mean": 0.040302199977304554, "iqr": 0.00047999992602854036, "raw_times": [0.04020200003651553, 0.04142299985687714, 0.04062199991494708, 0.04014199998891854, 0.039122000089264475], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.049882999974215636, "peak_bytes": 37765120, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015716552734375, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null}
+{"ts": "2025-10-23T17:21:54Z", "run": "f94366cee8c44bb99e36bc03ca51ad49", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "llama_S512_D8192", "batch": 1, "seq_len": 512, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.03869200008921325, "p50": 0.039361000062854146, "p90": 0.03952199995183037, "mean": 0.039353600004687905, "iqr": 0.0002899998889915878, "raw_times": [0.03923200006283878, 0.03996099985670298, 0.03952199995183037, 0.039361000062854146, 0.03869200008921325], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04323200005273975, "peak_bytes": 75530240, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00157928466796875, "mse": 1.150369644165039e-05, "ref": "layer_norm_fp32"}, "err": null}
+{"ts": "2025-10-23T17:21:54Z", "run": "f94366cee8c44bb99e36bc03ca51ad49", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "llama_S1024_D4096", "batch": 1, "seq_len": 1024, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.038322000136759016, "p50": 0.039080999840734876, "p90": 0.03983200008406129, "mean": 0.03918759998668975, "iqr": 0.0012000000424450263, "raw_times": [0.038322000136759016, 0.038632000041616266, 0.03983200008406129, 0.039080999840734876, 0.04007099983027729], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04126199996790092, "peak_bytes": 75513856, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0016021728515625, "mse": 1.1682510375976562e-05, "ref": "layer_norm_fp32"}, "err": null}
+{"ts": "2025-10-23T17:21:54Z", "run": "f94366cee8c44bb99e36bc03ca51ad49", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "llama_S1024_D8192", "batch": 1, "seq_len": 1024, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.038531999962287955, "p50": 0.03957200010518136, "p90": 0.040011999999478576, "mean": 0.040755799955149996, "iqr": 0.0013210001270635985, "raw_times": [0.04697199983638711, 0.03957200010518136, 0.040011999999478576, 0.03869099987241498, 0.038531999962287955], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04860299986830796, "peak_bytes": 151027712, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00156402587890625, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null}
+{"ts": "2025-10-23T17:21:54Z", "run": "f94366cee8c44bb99e36bc03ca51ad49", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "llama_S2048_D4096", "batch": 1, "seq_len": 2048, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.03818100003627478, "p50": 0.039942000057635596, "p90": 0.04086199987796135, "mean": 0.044605999983104994, "iqr": 0.0025399997412023367, "raw_times": [0.06572299980689422, 0.04086199987796135, 0.03818100003627478, 0.039942000057635596, 0.038322000136759016], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.046752999878663104, "peak_bytes": 151011328, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00156402587890625, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null}
+{"ts": "2025-10-23T17:21:54Z", "run": "f94366cee8c44bb99e36bc03ca51ad49", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "llama_S2048_D8192", "batch": 1, "seq_len": 2048, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.046271999963209964, "p50": 0.046712000084880856, "p90": 0.0469120000161638, "mean": 0.04688640001404565, "iqr": 0.00020900006347801536, "raw_times": [0.046712000084880856, 0.04783300005328783, 0.046271999963209964, 0.046702999952685786, 0.0469120000161638], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.049573000069358386, "peak_bytes": 302022656, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00152587890625, "mse": 1.1146068572998047e-05, "ref": "layer_norm_fp32"}, "err": null}
+{"ts": "2025-10-23T17:21:54Z", "run": "f94366cee8c44bb99e36bc03ca51ad49", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "llama_S4096_D4096", "batch": 1, "seq_len": 4096, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.044022000110999215, "p50": 0.04556199996841315, "p90": 0.045742000111204106, "mean": 0.04578840002977813, "iqr": 0.000619000047663576, "raw_times": [0.04512300006354053, 0.04556199996841315, 0.045742000111204106, 0.044022000110999215, 0.04849299989473366], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04771299995809386, "peak_bytes": 302006272, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null}
+{"ts": "2025-10-23T17:21:54Z", "run": "f94366cee8c44bb99e36bc03ca51ad49", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "llama_S4096_D8192", "batch": 1, "seq_len": 4096, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.20399000004545087, "p50": 0.20584999992934172, "p90": 0.20648999998229556, "mean": 0.20627999997486768, "iqr": 0.0007099999947968172, "raw_times": [0.20399000004545087, 0.2092899999297515, 0.20577999998749874, 0.20648999998229556, 0.20584999992934172], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.20653999990827288, "peak_bytes": 604012544, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00152587890625, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null}

layer_norm/impls/cells/benchmark.py CHANGED Viewed

@@ -2,13 +2,13 @@
 # requires-python = ">=3.10"
 # dependencies = [
 #     "numpy",
-#     "torch",
 #     "kernels",
 #     "kernels-benchmark-tools",
 # ]
 #
 # [tool.uv.sources]
-# kernels-benchmark-tools = { path = "/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools", editable = true }
 # ///
 import torch
 from kernels import get_kernel

 # requires-python = ">=3.10"
 # dependencies = [
 #     "numpy",
+#     "torch==2.8.0",
 #     "kernels",
 #     "kernels-benchmark-tools",
 # ]
 #
 # [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
 # ///
 import torch
 from kernels import get_kernel

layer_norm/impls/hf_kernels_layer_norm.html CHANGED Viewed

@@ -706,6 +706,29 @@
             white-space: pre-wrap;
             color: var(--text-primary);
         }
         .cell-stderr {
             background: var(--bg-error);
             border-left: 2px solid var(--border-error);
@@ -3556,7 +3579,12 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
                 if(output){
                     output.classList.remove('output-stale');
                     let html='';
-                    if(data.stdout) html+='<div class="cell-stdout">'+escapeHtml(data.stdout)+'</div>';
                     console.log('UV Logs:', data);
                     if(data.stderr) {
                         // Split UV logs from regular stderr
@@ -3678,27 +3706,27 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
             }
         }
-        // Live reload functionality (robust SSE handling)
-        (function(){
-            if (!('EventSource' in window)) {
-                console.warn('SSE not supported in this browser');
-                return;
-            }
-            let source = new EventSource('/events');
-            let isOpen = false;
-            source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
-            source.onmessage = function(e){
-                const msg=(e.data||'').trim(); if(!msg) return;
-                console.log('SSE message:', msg);
-                if (msg==='reload' || msg==='incremental') { location.reload(); }
-                // Ignore 'loading' to avoid premature reload loops
-            };
-            source.onerror = function(e){
-                // Let EventSource auto-reconnect instead of forcing a reload
-                if (isOpen) console.warn('SSE error after open, retrying...', e);
-            };
-            window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
-        })();
         document.addEventListener('DOMContentLoaded', function() {
@@ -3829,7 +3857,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
     <div class="system-info">
         <div class="system-info-header">Generated on:</div>
         <div class="system-info-content">
-            Linux x86_64 | Linux-5.10.244-240.965.amzn2.x86_64-x86_64-with-glibc2.35
         </div>
     </div>
@@ -3838,14 +3866,14 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <h1>HF Kernels LayerNorm Implementation</h1>
 <p>Based on kernels-community <code>layer-norm</code> kernel.</p>
 <h2>LayerNorm Benchmark (HF Kernels)</h2>
-<div class="cell cell-failed" id="cell-benchmark">
 <div class="cell-header">
 <span class="collapse-indicators">
 <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
- <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> |
-Cell: benchmark | 0.05s | FAILED
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3856,13 +3884,13 @@ Cell: benchmark | 0.05s | FAILED
 <span class="c1"># requires-python = &quot;&gt;=3.10&quot;</span>
 <span class="c1"># dependencies = [</span>
 <span class="c1">#     &quot;numpy&quot;,</span>
-<span class="c1">#     &quot;torch&quot;,</span>
 <span class="c1">#     &quot;kernels&quot;,</span>
 <span class="c1">#     &quot;kernels-benchmark-tools&quot;,</span>
 <span class="c1"># ]</span>
 <span class="c1">#</span>
 <span class="c1"># [tool.uv.sources]</span>
-<span class="c1"># kernels-benchmark-tools = { path = &quot;/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools&quot;, editable = true }</span>
 <span class="c1"># ///</span>
 <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
 <span class="kn">from</span><span class="w"> </span><span class="nn">kernels</span><span class="w"> </span><span class="kn">import</span> <span class="n">get_kernel</span>
@@ -3920,9 +3948,28 @@ Cell: benchmark | 0.05s | FAILED
 </div>
 </div>
 <div id="output-benchmark" class="cell-output">
-<div class="cell-stderr">  × Failed to resolve script requirement
-  ╰─▶ Distribution not found at:
-      file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
 </div>
 </div>
 </div>

             white-space: pre-wrap;
             color: var(--text-primary);
         }
+        .cell-stdout {
+            background: var(--bg-tertiary);
+            padding: 0.75rem;
+            border-radius: 1px;
+            font-family: inherit;
+            font-size: 0.9rem;
+            color: var(--text-primary);
+            /* key bits */
+            overflow: auto;          /* show scrollbars when needed */
+            max-width: 100%;         /* respects whatever layout width you give it */
+        }
+        .cell-stdout .stdout-text {
+            margin: 0;               /* reset pre default margin */
+            white-space: pre;        /* keep line breaks, NO wrapping */
+            display: inline-block;   /* shrink-to-content */
+            min-width: max-content;  /* allow very long lines to define intrinsic width */
+            font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace;
+            tab-size: 2;
+        }
         .cell-stderr {
             background: var(--bg-error);
             border-left: 2px solid var(--border-error);
                 if(output){
                     output.classList.remove('output-stale');
                     let html='';
+                    if (data.stdout) {
+                    html += '<div class="cell-stdout"><pre class="stdout-text">'
+                        + escapeHtml(data.stdout)
+                        + '</pre></div>';
+                    }
                     console.log('UV Logs:', data);
                     if(data.stderr) {
                         // Split UV logs from regular stderr
             }
         }
+        // // Live reload functionality (robust SSE handling)
+        // (function(){
+        //     if (!('EventSource' in window)) {
+        //         console.warn('SSE not supported in this browser');
+        //         return;
+        //     }
+        //     let source = new EventSource('/events');
+        //     let isOpen = false;
+        //     source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
+        //     source.onmessage = function(e){
+        //         const msg=(e.data||'').trim(); if(!msg) return;
+        //         console.log('SSE message:', msg);
+        //         if (msg==='reload' || msg==='incremental') { location.reload(); }
+        //         // Ignore 'loading' to avoid premature reload loops
+        //     };
+        //     source.onerror = function(e){
+        //         // Let EventSource auto-reconnect instead of forcing a reload
+        //         if (isOpen) console.warn('SSE error after open, retrying...', e);
+        //     };
+        //     window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
+        // })();
         document.addEventListener('DOMContentLoaded', function() {
     <div class="system-info">
         <div class="system-info-header">Generated on:</div>
         <div class="system-info-content">
+            Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
         </div>
     </div>
 <h1>HF Kernels LayerNorm Implementation</h1>
 <p>Based on kernels-community <code>layer-norm</code> kernel.</p>
 <h2>LayerNorm Benchmark (HF Kernels)</h2>
+<div class="cell" id="cell-benchmark">
 <div class="cell-header">
 <span class="collapse-indicators">
 <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
+ <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
 </span> |
+Cell: benchmark | 5.52s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 <span class="c1"># requires-python = &quot;&gt;=3.10&quot;</span>
 <span class="c1"># dependencies = [</span>
 <span class="c1">#     &quot;numpy&quot;,</span>
+<span class="c1">#     &quot;torch==2.8.0&quot;,</span>
 <span class="c1">#     &quot;kernels&quot;,</span>
 <span class="c1">#     &quot;kernels-benchmark-tools&quot;,</span>
 <span class="c1"># ]</span>
 <span class="c1">#</span>
 <span class="c1"># [tool.uv.sources]</span>
+<span class="c1"># kernels-benchmark-tools = { path = &quot;../../../../../tools&quot;, editable = true }</span>
 <span class="c1"># ///</span>
 <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
 <span class="kn">from</span><span class="w"> </span><span class="nn">kernels</span><span class="w"> </span><span class="kn">import</span> <span class="n">get_kernel</span>
 </div>
 </div>
 <div id="output-benchmark" class="cell-output">
+<div class="cell-stdout"><pre class="stdout-text">impl                     wl                  p50(ms)  ok
+hf_kernels_layer_norm    llama_S1024_D4096      0.04  False
+hf_kernels_layer_norm    llama_S1024_D8192      0.04  False
+hf_kernels_layer_norm    llama_S2048_D4096      0.04  False
+hf_kernels_layer_norm    llama_S2048_D8192      0.05  False
+hf_kernels_layer_norm    llama_S4096_D4096      0.05  False
+hf_kernels_layer_norm    llama_S4096_D8192      0.21  False
+hf_kernels_layer_norm    llama_S512_D4096       0.04  False
+hf_kernels_layer_norm    llama_S512_D8192       0.04  False
+</pre></div>
+<div class="uv-install-logs" id="uv-logs-benchmark">
+<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
+<div class="uv-logs-content" style="display: none;">
+Installed 10 packages in 16ms
+</div>
+</div>
+<div class="cell-stderr">Fetching 4 files:   0%|          | 0/4 [00:00&lt;?, ?it/s]
+Fetching 4 files:  50%|█████     | 2/4 [00:01&lt;00:01,  1.02it/s]
+Fetching 4 files: 100%|██████████| 4/4 [00:01&lt;00:00,  2.04it/s]</div>
+<div class="cell-artifacts">
+<h4>Artifacts:</h4>
+<a href="artifacts/benchmark/ln.jsonl" class="artifact" target="_blank">ln.jsonl</a>
 </div>
 </div>
 </div>

layer_norm/impls/torch_layer_norm.html CHANGED Viewed

@@ -706,6 +706,29 @@
             white-space: pre-wrap;
             color: var(--text-primary);
         }
         .cell-stderr {
             background: var(--bg-error);
             border-left: 2px solid var(--border-error);
@@ -3556,7 +3579,12 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
                 if(output){
                     output.classList.remove('output-stale');
                     let html='';
-                    if(data.stdout) html+='<div class="cell-stdout">'+escapeHtml(data.stdout)+'</div>';
                     console.log('UV Logs:', data);
                     if(data.stderr) {
                         // Split UV logs from regular stderr
@@ -3678,27 +3706,27 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
             }
         }
-        // Live reload functionality (robust SSE handling)
-        (function(){
-            if (!('EventSource' in window)) {
-                console.warn('SSE not supported in this browser');
-                return;
-            }
-            let source = new EventSource('/events');
-            let isOpen = false;
-            source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
-            source.onmessage = function(e){
-                const msg=(e.data||'').trim(); if(!msg) return;
-                console.log('SSE message:', msg);
-                if (msg==='reload' || msg==='incremental') { location.reload(); }
-                // Ignore 'loading' to avoid premature reload loops
-            };
-            source.onerror = function(e){
-                // Let EventSource auto-reconnect instead of forcing a reload
-                if (isOpen) console.warn('SSE error after open, retrying...', e);
-            };
-            window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
-        })();
         document.addEventListener('DOMContentLoaded', function() {
@@ -3829,7 +3857,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
     <div class="system-info">
         <div class="system-info-header">Generated on:</div>
         <div class="system-info-content">
-            Linux x86_64 | Linux-5.10.244-240.965.amzn2.x86_64-x86_64-with-glibc2.35
         </div>
     </div>
@@ -3844,7 +3872,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> |
-Cell: nv | 0.22s
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
 <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3859,7 +3887,7 @@ Cell: nv | 0.22s
 </div>
 </div>
 <div id="output-nv" class="cell-output">
-<div class="cell-stdout">Wed Oct 22 08:58:23 2025
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+
@@ -3868,7 +3896,7 @@ Cell: nv | 0.22s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   26C    P8             22W /  350W |       0MiB /  46068MiB |      0%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
@@ -3880,19 +3908,19 @@ Cell: nv | 0.22s
 |  No running processes found                                                             |
 +-----------------------------------------------------------------------------------------+
-</div>
 </div>
 </div>
 <h2>LayerNorm Benchmark (PyTorch)</h2>
-<div class="cell cell-failed" id="cell-benchmark">
 <div class="cell-header">
 <span class="collapse-indicators">
 <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
- <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> |
-Cell: benchmark | 0.01s | FAILED
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3903,12 +3931,12 @@ Cell: benchmark | 0.01s | FAILED
 <span class="c1"># requires-python = &quot;&gt;=3.10&quot;</span>
 <span class="c1"># dependencies = [</span>
 <span class="c1">#     &quot;numpy&quot;,</span>
-<span class="c1">#     &quot;torch&quot;,</span>
 <span class="c1">#     &quot;kernels-benchmark-tools&quot;,</span>
 <span class="c1"># ]</span>
 <span class="c1">#</span>
 <span class="c1"># [tool.uv.sources]</span>
-<span class="c1"># kernels-benchmark-tools = { path = &quot;/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools&quot;, editable = true }</span>
 <span class="c1"># ///</span>
 <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
 <span class="kn">import</span><span class="w"> </span><span class="nn">kernels_benchmark_tools</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="nn">kbt</span>
@@ -3946,9 +3974,25 @@ Cell: benchmark | 0.01s | FAILED
 </div>
 </div>
 <div id="output-benchmark" class="cell-output">
-<div class="cell-stderr">  × Failed to resolve script requirement
-  ╰─▶ Distribution not found at:
-      file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
 </div>
 </div>
 </div>

             white-space: pre-wrap;
             color: var(--text-primary);
         }
+        .cell-stdout {
+            background: var(--bg-tertiary);
+            padding: 0.75rem;
+            border-radius: 1px;
+            font-family: inherit;
+            font-size: 0.9rem;
+            color: var(--text-primary);
+            /* key bits */
+            overflow: auto;          /* show scrollbars when needed */
+            max-width: 100%;         /* respects whatever layout width you give it */
+        }
+        .cell-stdout .stdout-text {
+            margin: 0;               /* reset pre default margin */
+            white-space: pre;        /* keep line breaks, NO wrapping */
+            display: inline-block;   /* shrink-to-content */
+            min-width: max-content;  /* allow very long lines to define intrinsic width */
+            font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace;
+            tab-size: 2;
+        }
         .cell-stderr {
             background: var(--bg-error);
             border-left: 2px solid var(--border-error);
                 if(output){
                     output.classList.remove('output-stale');
                     let html='';
+                    if (data.stdout) {
+                    html += '<div class="cell-stdout"><pre class="stdout-text">'
+                        + escapeHtml(data.stdout)
+                        + '</pre></div>';
+                    }
                     console.log('UV Logs:', data);
                     if(data.stderr) {
                         // Split UV logs from regular stderr
             }
         }
+        // // Live reload functionality (robust SSE handling)
+        // (function(){
+        //     if (!('EventSource' in window)) {
+        //         console.warn('SSE not supported in this browser');
+        //         return;
+        //     }
+        //     let source = new EventSource('/events');
+        //     let isOpen = false;
+        //     source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
+        //     source.onmessage = function(e){
+        //         const msg=(e.data||'').trim(); if(!msg) return;
+        //         console.log('SSE message:', msg);
+        //         if (msg==='reload' || msg==='incremental') { location.reload(); }
+        //         // Ignore 'loading' to avoid premature reload loops
+        //     };
+        //     source.onerror = function(e){
+        //         // Let EventSource auto-reconnect instead of forcing a reload
+        //         if (isOpen) console.warn('SSE error after open, retrying...', e);
+        //     };
+        //     window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
+        // })();
         document.addEventListener('DOMContentLoaded', function() {
     <div class="system-info">
         <div class="system-info-header">Generated on:</div>
         <div class="system-info-content">
+            Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
         </div>
     </div>
 <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> |
+Cell: nv | 0.21s
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
 <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
 </div>
 </div>
 <div id="output-nv" class="cell-output">
+<div class="cell-stdout"><pre class="stdout-text">Thu Oct 23 17:20:58 2025
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
+| N/A   35C    P0             70W /  350W |       0MiB /  46068MiB |     26%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 |  No running processes found                                                             |
 +-----------------------------------------------------------------------------------------+
+</pre></div>
 </div>
 </div>
 <h2>LayerNorm Benchmark (PyTorch)</h2>
+<div class="cell" id="cell-benchmark">
 <div class="cell-header">
 <span class="collapse-indicators">
 <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
+ <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
 </span> |
+Cell: benchmark | 4.50s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 <span class="c1"># requires-python = &quot;&gt;=3.10&quot;</span>
 <span class="c1"># dependencies = [</span>
 <span class="c1">#     &quot;numpy&quot;,</span>
+<span class="c1">#     &quot;torch==2.8.0&quot;,</span>
 <span class="c1">#     &quot;kernels-benchmark-tools&quot;,</span>
 <span class="c1"># ]</span>
 <span class="c1">#</span>
 <span class="c1"># [tool.uv.sources]</span>
+<span class="c1"># kernels-benchmark-tools = { path = &quot;../../../../../tools&quot;, editable = true }</span>
 <span class="c1"># ///</span>
 <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
 <span class="kn">import</span><span class="w"> </span><span class="nn">kernels_benchmark_tools</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="nn">kbt</span>
 </div>
 </div>
 <div id="output-benchmark" class="cell-output">
+<div class="cell-stdout"><pre class="stdout-text">impl                     wl                  p50(ms)  ok
+torch_layer_norm         llama_S1024_D4096      0.03  False
+torch_layer_norm         llama_S1024_D8192      0.03  False
+torch_layer_norm         llama_S2048_D4096      0.03  False
+torch_layer_norm         llama_S2048_D8192      0.05  False
+torch_layer_norm         llama_S4096_D4096      0.04  False
+torch_layer_norm         llama_S4096_D8192      0.20  False
+torch_layer_norm         llama_S512_D4096       0.03  False
+torch_layer_norm         llama_S512_D8192       0.03  False
+</pre></div>
+<div class="uv-install-logs" id="uv-logs-benchmark">
+<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
+<div class="uv-logs-content" style="display: none;">
+Installed 37 packages in 245ms
+</div>
+</div>
+<div class="cell-artifacts">
+<h4>Artifacts:</h4>
+<a href="artifacts/benchmark/ln.jsonl" class="artifact" target="_blank">ln.jsonl</a>
 </div>
 </div>
 </div>

layer_norm/results/artifacts/combine/latency.svg ADDED Viewed

Git LFS Details

SHA256: dcfdf13c3e578bdaaa538562f5d5eb70e73719a5e2194d8058bdda8e34157d54
Pointer size: 128 Bytes
Size of remote file: 949 Bytes

layer_norm/results/cells/combine.py ADDED Viewed

	@@ -0,0 +1,19 @@

+# /// script
+# requires-python = ">=3.10"
+# dependencies = ["torch", "kernels-benchmark-tools", "matplotlib"]
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+# Combine cell for the LayerNorm benchmarks: it only builds a label -> key
+# mapping and delegates all work to generate_combined_results from
+# kernels-benchmark-tools.
+from kernels_benchmark_tools.core.visuals import generate_combined_results
+# Display label for each implementation -> string key used to locate that
+# implementation's benchmark output.
+# NOTE(review): the values look like environment-variable names set by the
+# notebook runner -- confirm against generate_combined_results.
+cache_env_map = {
+    "Torch LayerNorm": "UVNOTE_FILE_TORCH_LAYER_NORM_BENCHMARK",
+    "HF Kernels LayerNorm": "UVNOTE_FILE_HF_KERNELS_LAYER_NORM_BENCHMARK",
+}
+# NOTE(review): "ln.jsonl" is the same file name the benchmark cells list as
+# their artifact; presumably it is the per-implementation results file that
+# gets read here, and "latency.svg" the figure that gets written -- verify
+# in the helper before changing either name.
+generate_combined_results(
+    cache_env_map=cache_env_map,
+    output_filename="ln.jsonl",
+    svg_filename="latency.svg",
+    figure_id="layernorm"
+)

layer_norm/results/combined_results.html ADDED Viewed

The diff for this file is too large to render. See raw diff