drbh HF Staff commited on
Commit
81fff32
·
verified ·
1 Parent(s): aa3ac98

Upload folder using huggingface_hub

Browse files
Files changed (31) hide show
  1. activation/impls/artifacts/benchmark/activation.jsonl +3 -0
  2. activation/impls/cells/benchmark.py +2 -2
  3. activation/impls/compiled_swiglu.html +172 -35
  4. activation/impls/hf_kernels_swiglu.html +139 -35
  5. activation/impls/torch_swiglu.html +138 -34
  6. activation/results/artifacts/combine/latency.svg +3 -0
  7. activation/results/cells/combine.py +27 -0
  8. activation/results/combined_results.html +0 -0
  9. flash_attn/impls/artifacts/benchmark/attn.jsonl +6 -6
  10. flash_attn/impls/artifacts/benchmark_default/attn_default.jsonl +6 -6
  11. flash_attn/impls/artifacts/benchmark_max_autotune/attn_max_autotune.jsonl +6 -6
  12. flash_attn/impls/cells/benchmark.py +11 -13
  13. flash_attn/impls/cells/benchmark_default.py +2 -2
  14. flash_attn/impls/cells/benchmark_max_autotune.py +2 -2
  15. flash_attn/impls/compiled_variants.html +336 -31
  16. flash_attn/impls/flash_attention.html +262 -34
  17. flash_attn/impls/hf_kernels_flash_attn.html +214 -29
  18. flash_attn/impls/hf_kernels_flash_attn3.html +203 -30
  19. flash_attn/impls/mem_efficient_attention.html +252 -30
  20. flash_attn/impls/sage_attention.html +130 -31
  21. flash_attn/impls/xformers.html +219 -31
  22. flash_attn/results/artifacts/combine/latency.svg +2 -2
  23. flash_attn/results/cells/combine.py +39 -289
  24. flash_attn/results/combined_results.html +0 -0
  25. layer_norm/impls/artifacts/benchmark/ln.jsonl +8 -0
  26. layer_norm/impls/cells/benchmark.py +2 -2
  27. layer_norm/impls/hf_kernels_layer_norm.html +78 -31
  28. layer_norm/impls/torch_layer_norm.html +79 -35
  29. layer_norm/results/artifacts/combine/latency.svg +3 -0
  30. layer_norm/results/cells/combine.py +19 -0
  31. layer_norm/results/combined_results.html +0 -0
activation/impls/artifacts/benchmark/activation.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {"ts": "2025-10-23T17:22:01Z", "run": "2868ab5dc1ce4d49ac015295dd5ab8d5", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "triton", "compile": "none"}, "wl": {"name": "llama_T512_D4096", "num_tokens": 512, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.023811000119167147, "p50": 0.024261000135084032, "p90": 0.024421000034635654, "mean": 0.024255200014522416, "iqr": 0.00023000006876827683, "raw_times": [0.024261000135084032, 0.023811000119167147, 0.024190999965867377, 0.024591999817857868, 0.024421000034635654], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03041099989786744, "peak_bytes": 46139392, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.01, "atol": 0.1, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "relmax": 0.0, "ref": "swiglu_bfloat16"}, "err": null}
2
+ {"ts": "2025-10-23T17:22:01Z", "run": "2868ab5dc1ce4d49ac015295dd5ab8d5", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "triton", "compile": "none"}, "wl": {"name": "llama_T512_D8192", "num_tokens": 512, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.030561000130546745, "p50": 0.031221000199366244, "p90": 0.031622000051356736, "mean": 0.031125600116865826, "iqr": 0.001030000021273736, "raw_times": [0.030561000130546745, 0.031221000199366244, 0.030592000030083, 0.031622000051356736, 0.031632000172976404], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03244200001972786, "peak_bytes": 92276736, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.01, "atol": 0.1, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "relmax": 0.0, "ref": "swiglu_bfloat16"}, "err": null}
3
+ {"ts": "2025-10-23T17:22:01Z", "run": "2868ab5dc1ce4d49ac015295dd5ab8d5", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "triton", "compile": "none"}, "wl": {"name": "llama_T512_D11008", "num_tokens": 512, "hidden_dim": 11008, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0339219998295448, "p50": 0.03464199994596129, "p90": 0.0347420000252896, "mean": 0.03469179991952842, "iqr": 0.00024100017981254496, "raw_times": [0.0339219998295448, 0.0347420000252896, 0.03464199994596129, 0.03565199995136936, 0.034500999845477054], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03648099982456188, "peak_bytes": 124520448, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.01, "atol": 0.1, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "relmax": 0.0, "ref": "swiglu_bfloat16"}, "err": null}
activation/impls/cells/benchmark.py CHANGED
@@ -2,13 +2,13 @@
2
  # requires-python = ">=3.10"
3
  # dependencies = [
4
  # "numpy",
5
- # "torch",
6
  # "kernels-benchmark-tools",
7
  # "kernels",
8
  # ]
9
  #
10
  # [tool.uv.sources]
11
- # kernels-benchmark-tools = { path = "/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools", editable = true }
12
  # ///
13
  import torch
14
  import sys
 
2
  # requires-python = ">=3.10"
3
  # dependencies = [
4
  # "numpy",
5
+ # "torch==2.8.0",
6
  # "kernels-benchmark-tools",
7
  # "kernels",
8
  # ]
9
  #
10
  # [tool.uv.sources]
11
+ # kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
12
  # ///
13
  import torch
14
  import sys
activation/impls/compiled_swiglu.html CHANGED
@@ -706,6 +706,29 @@
706
  white-space: pre-wrap;
707
  color: var(--text-primary);
708
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
709
  .cell-stderr {
710
  background: var(--bg-error);
711
  border-left: 2px solid var(--border-error);
@@ -3556,7 +3579,12 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3556
  if(output){
3557
  output.classList.remove('output-stale');
3558
  let html='';
3559
- if(data.stdout) html+='<div class="cell-stdout">'+escapeHtml(data.stdout)+'</div>';
 
 
 
 
 
3560
  console.log('UV Logs:', data);
3561
  if(data.stderr) {
3562
  // Split UV logs from regular stderr
@@ -3678,27 +3706,27 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3678
  }
3679
  }
3680
 
3681
- // Live reload functionality (robust SSE handling)
3682
- (function(){
3683
- if (!('EventSource' in window)) {
3684
- console.warn('SSE not supported in this browser');
3685
- return;
3686
- }
3687
- let source = new EventSource('/events');
3688
- let isOpen = false;
3689
- source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
3690
- source.onmessage = function(e){
3691
- const msg=(e.data||'').trim(); if(!msg) return;
3692
- console.log('SSE message:', msg);
3693
- if (msg==='reload' || msg==='incremental') { location.reload(); }
3694
- // Ignore 'loading' to avoid premature reload loops
3695
- };
3696
- source.onerror = function(e){
3697
- // Let EventSource auto-reconnect instead of forcing a reload
3698
- if (isOpen) console.warn('SSE error after open, retrying...', e);
3699
- };
3700
- window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
3701
- })();
3702
 
3703
 
3704
  document.addEventListener('DOMContentLoaded', function() {
@@ -3829,7 +3857,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3829
  <div class="system-info">
3830
  <div class="system-info-header">Generated on:</div>
3831
  <div class="system-info-content">
3832
- Linux x86_64 | Linux-5.10.244-240.965.amzn2.x86_64-x86_64-with-glibc2.35
3833
  </div>
3834
  </div>
3835
 
@@ -3843,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3843
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3844
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3845
  </span> |
3846
- Cell: nv | 0.25s
3847
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3848
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3849
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3859,7 +3887,7 @@ Cell: nv | 0.25s
3859
  </div>
3860
  </div>
3861
  <div id="output-nv" class="cell-output">
3862
- <div class="cell-stdout">Wed Oct 22 08:58:23 2025
3863
  +-----------------------------------------------------------------------------------------+
3864
  | NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
3865
  |-----------------------------------------+------------------------+----------------------+
@@ -3868,7 +3896,7 @@ Cell: nv | 0.25s
3868
  | | | MIG M. |
3869
  |=========================================+========================+======================|
3870
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3871
- | N/A 26C P8 22W / 350W | 0MiB / 46068MiB | 0% Default |
3872
  | | | N/A |
3873
  +-----------------------------------------+------------------------+----------------------+
3874
 
@@ -3880,19 +3908,19 @@ Cell: nv | 0.25s
3880
  | No running processes found |
3881
  +-----------------------------------------------------------------------------------------+
3882
 
3883
- </div>
3884
  </div>
3885
  </div>
3886
 
3887
  <h2>SwiGLU Benchmark (torch.compile)</h2>
3888
- <div class="cell cell-failed" id="cell-benchmark">
3889
  <div class="cell-header">
3890
  <span class="collapse-indicators">
3891
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3892
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3893
- <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3894
  </span> |
3895
- Cell: benchmark | 0.05s | FAILED
3896
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3897
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3898
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3904,12 +3932,12 @@ Cell: benchmark | 0.05s | FAILED
3904
  <span class="c1"># requires-python = &quot;&gt;=3.10&quot;</span>
3905
  <span class="c1"># dependencies = [</span>
3906
  <span class="c1"># &quot;numpy&quot;,</span>
3907
- <span class="c1"># &quot;torch&quot;,</span>
3908
  <span class="c1"># &quot;kernels-benchmark-tools&quot;,</span>
3909
  <span class="c1"># ]</span>
3910
  <span class="c1">#</span>
3911
  <span class="c1"># [tool.uv.sources]</span>
3912
- <span class="c1"># kernels-benchmark-tools = { path = &quot;/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools&quot;, editable = true }</span>
3913
  <span class="c1"># ///</span>
3914
  <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
3915
  <span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
@@ -3967,9 +3995,118 @@ Cell: benchmark | 0.05s | FAILED
3967
  </div>
3968
  </div>
3969
  <div id="output-benchmark" class="cell-output">
3970
- <div class="cell-stderr"> × Failed to resolve script requirement
3971
- ╰─▶ Distribution not found at:
3972
- file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3973
  </div>
3974
  </div>
3975
  </div>
 
706
  white-space: pre-wrap;
707
  color: var(--text-primary);
708
  }
709
+
710
+ .cell-stdout {
711
+ background: var(--bg-tertiary);
712
+ padding: 0.75rem;
713
+ border-radius: 1px;
714
+ font-family: inherit;
715
+ font-size: 0.9rem;
716
+ color: var(--text-primary);
717
+
718
+ /* key bits */
719
+ overflow: auto; /* show scrollbars when needed */
720
+ max-width: 100%; /* respects whatever layout width you give it */
721
+ }
722
+
723
+ .cell-stdout .stdout-text {
724
+ margin: 0; /* reset pre default margin */
725
+ white-space: pre; /* keep line breaks, NO wrapping */
726
+ display: inline-block; /* shrink-to-content */
727
+ min-width: max-content; /* allow very long lines to define intrinsic width */
728
+ font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace;
729
+ tab-size: 2;
730
+ }
731
+
732
  .cell-stderr {
733
  background: var(--bg-error);
734
  border-left: 2px solid var(--border-error);
 
3579
  if(output){
3580
  output.classList.remove('output-stale');
3581
  let html='';
3582
+ if (data.stdout) {
3583
+ html += '<div class="cell-stdout"><pre class="stdout-text">'
3584
+ + escapeHtml(data.stdout)
3585
+ + '</pre></div>';
3586
+ }
3587
+
3588
  console.log('UV Logs:', data);
3589
  if(data.stderr) {
3590
  // Split UV logs from regular stderr
 
3706
  }
3707
  }
3708
 
3709
+ // // Live reload functionality (robust SSE handling)
3710
+ // (function(){
3711
+ // if (!('EventSource' in window)) {
3712
+ // console.warn('SSE not supported in this browser');
3713
+ // return;
3714
+ // }
3715
+ // let source = new EventSource('/events');
3716
+ // let isOpen = false;
3717
+ // source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
3718
+ // source.onmessage = function(e){
3719
+ // const msg=(e.data||'').trim(); if(!msg) return;
3720
+ // console.log('SSE message:', msg);
3721
+ // if (msg==='reload' || msg==='incremental') { location.reload(); }
3722
+ // // Ignore 'loading' to avoid premature reload loops
3723
+ // };
3724
+ // source.onerror = function(e){
3725
+ // // Let EventSource auto-reconnect instead of forcing a reload
3726
+ // if (isOpen) console.warn('SSE error after open, retrying...', e);
3727
+ // };
3728
+ // window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
3729
+ // })();
3730
 
3731
 
3732
  document.addEventListener('DOMContentLoaded', function() {
 
3857
  <div class="system-info">
3858
  <div class="system-info-header">Generated on:</div>
3859
  <div class="system-info-content">
3860
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
3861
  </div>
3862
  </div>
3863
 
 
3871
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3872
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3873
  </span> |
3874
+ Cell: nv | 0.23s
3875
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3877
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
 
3887
  </div>
3888
  </div>
3889
  <div id="output-nv" class="cell-output">
3890
+ <div class="cell-stdout"><pre class="stdout-text">Thu Oct 23 17:21:49 2025
3891
  +-----------------------------------------------------------------------------------------+
3892
  | NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
3893
  |-----------------------------------------+------------------------+----------------------+
 
3896
  | | | MIG M. |
3897
  |=========================================+========================+======================|
3898
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3899
+ | N/A 37C P0 80W / 350W | 0MiB / 46068MiB | 13% Default |
3900
  | | | N/A |
3901
  +-----------------------------------------+------------------------+----------------------+
3902
 
 
3908
  | No running processes found |
3909
  +-----------------------------------------------------------------------------------------+
3910
 
3911
+ </pre></div>
3912
  </div>
3913
  </div>
3914
 
3915
  <h2>SwiGLU Benchmark (torch.compile)</h2>
3916
+ <div class="cell" id="cell-benchmark">
3917
  <div class="cell-header">
3918
  <span class="collapse-indicators">
3919
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3920
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3921
+ <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3922
  </span> |
3923
+ Cell: benchmark | 14.79s
3924
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3925
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3926
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3932
  <span class="c1"># requires-python = &quot;&gt;=3.10&quot;</span>
3933
  <span class="c1"># dependencies = [</span>
3934
  <span class="c1"># &quot;numpy&quot;,</span>
3935
+ <span class="c1"># &quot;torch==2.8.0&quot;,</span>
3936
  <span class="c1"># &quot;kernels-benchmark-tools&quot;,</span>
3937
  <span class="c1"># ]</span>
3938
  <span class="c1">#</span>
3939
  <span class="c1"># [tool.uv.sources]</span>
3940
+ <span class="c1"># kernels-benchmark-tools = { path = &quot;../../../../../tools&quot;, editable = true }</span>
3941
  <span class="c1"># ///</span>
3942
  <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
3943
  <span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
 
3995
  </div>
3996
  </div>
3997
  <div id="output-benchmark" class="cell-output">
3998
+ <div class="cell-stdout"><pre class="stdout-text">Running SwiGLU benchmarks on cuda with bfloat16
3999
+ Testing 3 workloads
4000
+
4001
+ ======================================================================
4002
+ PROFILE TRACE: compiled_swiglu_max_autotune | llama_T512_D4096
4003
+ ======================================================================
4004
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4005
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4006
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4007
+ compiled_swiglu_max_autotune 0.00% 0.000us 0.00% 0.000us 0.000us 1.851ms 5297.74% 1.851ms 925.622us 2
4008
+ compiled_swiglu_max_autotune 0.10% 159.779us 99.99% 166.375ms 166.375ms 0.000us 0.00% 38.816us 38.816us 1
4009
+ Torch-Compiled Region: 0/1 1.45% 2.415ms 99.86% 166.157ms 55.386ms 11.007us 31.50% 38.816us 12.939us 3
4010
+ aten::_foreach_copy_ 0.02% 39.542us 0.05% 87.165us 29.055us 21.600us 61.81% 21.600us 7.200us 3
4011
+ void at::native::(anonymous namespace)::multi_tensor... 0.00% 0.000us 0.00% 0.000us 0.000us 21.600us 61.81% 21.600us 7.200us 3
4012
+ CUDAGraphNode.record (dynamo_timed) 0.00% 0.000us 0.00% 0.000us 0.000us 20.673us 59.16% 20.673us 20.673us 1
4013
+ triton_poi_fused_mul_silu_0 0.00% 0.000us 0.00% 0.000us 0.000us 11.007us 31.50% 11.007us 3.669us 3
4014
+ Activity Buffer Request 0.86% 1.424ms 0.86% 1.424ms 1.424ms 3.872us 11.08% 3.872us 3.872us 1
4015
+ CUDAGraphNode.record (dynamo_timed) 96.87% 161.185ms 97.39% 162.045ms 162.045ms 0.000us 0.00% 2.337us 2.337us 1
4016
+ aten::fill_ 0.02% 34.251us 0.05% 74.934us 37.467us 2.337us 6.69% 2.337us 1.168us 2
4017
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.337us 6.69% 2.337us 1.168us 2
4018
+ TorchDynamo Cache Lookup 0.03% 57.633us 0.03% 57.633us 19.211us 0.000us 0.00% 0.000us 0.000us 3
4019
+ Pregraph bytecode 0.01% 12.280us 0.01% 12.280us 4.093us 0.000us 0.00% 0.000us 0.000us 3
4020
+ AOTDispatcher Runtime Wrapper Prologue 0.01% 21.352us 0.01% 21.352us 7.117us 0.000us 0.00% 0.000us 0.000us 3
4021
+ cudaDeviceSynchronize 0.07% 111.205us 0.07% 111.205us 18.534us 0.000us 0.00% 0.000us 0.000us 6
4022
+ cudaStreamIsCapturing 0.01% 10.600us 0.01% 10.600us 0.815us 0.000us 0.00% 0.000us 0.000us 13
4023
+ cudaEventRecordWithFlags 0.00% 4.751us 0.00% 4.751us 1.584us 0.000us 0.00% 0.000us 0.000us 3
4024
+ cudaStreamWaitEvent 0.00% 4.550us 0.00% 4.550us 1.517us 0.000us 0.00% 0.000us 0.000us 3
4025
+ aten::empty_strided 0.01% 14.680us 0.01% 14.680us 4.893us 0.000us 0.00% 0.000us 0.000us 3
4026
+ cudaLaunchKernel 0.05% 88.306us 0.05% 88.306us 17.661us 0.000us 0.00% 0.000us 0.000us 5
4027
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4028
+ Self CPU time total: 166.389ms
4029
+ Self CUDA time total: 34.944us
4030
+
4031
+
4032
+
4033
+ ======================================================================
4034
+ PROFILE TRACE: compiled_swiglu_max_autotune | llama_T512_D8192
4035
+ ======================================================================
4036
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4037
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4038
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4039
+ compiled_swiglu_max_autotune 0.00% 0.000us 0.00% 0.000us 0.000us 1.882ms 2857.54% 1.882ms 940.918us 2
4040
+ compiled_swiglu_max_autotune 0.08% 131.855us 99.99% 174.569ms 174.569ms 0.000us 0.00% 72.799us 72.799us 1
4041
+ Torch-Compiled Region: 0/3 1.26% 2.204ms 99.89% 174.392ms 58.131ms 18.240us 27.70% 72.799us 24.266us 3
4042
+ aten::_foreach_copy_ 0.02% 39.114us 0.05% 88.345us 29.448us 45.247us 68.71% 45.247us 15.082us 3
4043
+ void at::native::(anonymous namespace)::multi_tensor... 0.00% 0.000us 0.00% 0.000us 0.000us 45.247us 68.71% 45.247us 15.082us 3
4044
+ CUDAGraphNode.record (dynamo_timed) 0.00% 0.000us 0.00% 0.000us 0.000us 19.904us 30.22% 19.904us 19.904us 1
4045
+ triton_poi_fused_mul_silu_0 0.00% 0.000us 0.00% 0.000us 0.000us 18.240us 27.70% 18.240us 6.080us 3
4046
+ Activity Buffer Request 0.83% 1.441ms 0.83% 1.441ms 1.441ms 6.944us 10.54% 6.944us 6.944us 1
4047
+ CUDAGraphNode.record (dynamo_timed) 96.65% 168.746ms 97.67% 170.521ms 170.521ms 0.000us 0.00% 2.368us 2.368us 1
4048
+ aten::fill_ 0.02% 36.482us 0.04% 78.354us 39.177us 2.368us 3.60% 2.368us 1.184us 2
4049
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.368us 3.60% 2.368us 1.184us 2
4050
+ TorchDynamo Cache Lookup 0.03% 45.013us 0.03% 45.013us 15.004us 0.000us 0.00% 0.000us 0.000us 3
4051
+ Pregraph bytecode 0.01% 9.190us 0.01% 9.190us 3.063us 0.000us 0.00% 0.000us 0.000us 3
4052
+ AOTDispatcher Runtime Wrapper Prologue 0.01% 17.071us 0.01% 17.071us 5.690us 0.000us 0.00% 0.000us 0.000us 3
4053
+ cudaDeviceSynchronize 0.04% 76.533us 0.04% 76.533us 12.755us 0.000us 0.00% 0.000us 0.000us 6
4054
+ cudaStreamIsCapturing 0.01% 9.681us 0.01% 9.681us 0.745us 0.000us 0.00% 0.000us 0.000us 13
4055
+ cudaEventRecordWithFlags 0.00% 3.672us 0.00% 3.672us 1.224us 0.000us 0.00% 0.000us 0.000us 3
4056
+ cudaStreamWaitEvent 0.00% 3.040us 0.00% 3.040us 1.013us 0.000us 0.00% 0.000us 0.000us 3
4057
+ aten::empty_strided 0.01% 12.061us 0.01% 12.061us 4.020us 0.000us 0.00% 0.000us 0.000us 3
4058
+ cudaLaunchKernel 0.05% 91.103us 0.05% 91.103us 18.221us 0.000us 0.00% 0.000us 0.000us 5
4059
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4060
+ Self CPU time total: 174.590ms
4061
+ Self CUDA time total: 65.855us
4062
+
4063
+
4064
+
4065
+ ======================================================================
4066
+ PROFILE TRACE: compiled_swiglu_max_autotune | llama_T512_D11008
4067
+ ======================================================================
4068
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4069
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4070
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4071
+ compiled_swiglu_max_autotune 0.00% 0.000us 0.00% 0.000us 0.000us 1.863ms 1771.89% 1.863ms 931.590us 2
4072
+ compiled_swiglu_max_autotune 0.07% 121.234us 99.99% 174.986ms 174.986ms 0.000us 0.00% 113.760us 113.760us 1
4073
+ Torch-Compiled Region: 0/5 1.21% 2.117ms 99.90% 174.826ms 58.275ms 24.864us 23.65% 113.760us 37.920us 3
4074
+ aten::_foreach_copy_ 0.02% 36.152us 0.05% 83.124us 27.708us 78.144us 74.32% 78.144us 26.048us 3
4075
+ void at::native::(anonymous namespace)::multi_tensor... 0.00% 0.000us 0.00% 0.000us 0.000us 78.144us 74.32% 78.144us 26.048us 3
4076
+ triton_poi_fused_mul_silu_0 0.00% 0.000us 0.00% 0.000us 0.000us 24.864us 23.65% 24.864us 8.288us 3
4077
+ CUDAGraphNode.record (dynamo_timed) 0.00% 0.000us 0.00% 0.000us 0.000us 19.776us 18.81% 19.776us 19.776us 1
4078
+ Activity Buffer Request 0.77% 1.349ms 0.77% 1.349ms 1.349ms 8.608us 8.19% 8.608us 8.608us 1
4079
+ CUDAGraphNode.record (dynamo_timed) 96.23% 168.408ms 97.80% 171.145ms 171.145ms 0.000us 0.00% 2.144us 2.144us 1
4080
+ aten::fill_ 0.02% 32.121us 0.04% 72.933us 36.467us 2.144us 2.04% 2.144us 1.072us 2
4081
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.144us 2.04% 2.144us 1.072us 2
4082
+ TorchDynamo Cache Lookup 0.02% 38.274us 0.02% 38.274us 12.758us 0.000us 0.00% 0.000us 0.000us 3
4083
+ Pregraph bytecode 0.01% 9.421us 0.01% 9.421us 3.140us 0.000us 0.00% 0.000us 0.000us 3
4084
+ AOTDispatcher Runtime Wrapper Prologue 0.01% 14.201us 0.01% 14.201us 4.734us 0.000us 0.00% 0.000us 0.000us 3
4085
+ cudaDeviceSynchronize 0.04% 73.664us 0.04% 73.664us 12.277us 0.000us 0.00% 0.000us 0.000us 6
4086
+ cudaStreamIsCapturing 0.01% 9.722us 0.01% 9.722us 0.748us 0.000us 0.00% 0.000us 0.000us 13
4087
+ cudaEventRecordWithFlags 0.00% 3.409us 0.00% 3.409us 1.136us 0.000us 0.00% 0.000us 0.000us 3
4088
+ cudaStreamWaitEvent 0.00% 2.910us 0.00% 2.910us 0.970us 0.000us 0.00% 0.000us 0.000us 3
4089
+ aten::empty_strided 0.01% 11.600us 0.01% 11.600us 3.867us 0.000us 0.00% 0.000us 0.000us 3
4090
+ cudaLaunchKernel 0.05% 87.784us 0.05% 87.784us 17.557us 0.000us 0.00% 0.000us 0.000us 5
4091
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4092
+ Self CPU time total: 175.003ms
4093
+ Self CUDA time total: 105.152us
4094
+
4095
+
4096
+ impl wl p50(ms) ok
4097
+ compiled_swiglu_max_autotune llama_T512_D11008 0.11 True
4098
+ compiled_swiglu_max_autotune llama_T512_D4096 0.10 True
4099
+ compiled_swiglu_max_autotune llama_T512_D8192 0.11 True
4100
+ </pre></div>
4101
+ <div class="uv-install-logs" id="uv-logs-benchmark">
4102
+ <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4103
+ <div class="uv-logs-content" style="display: none;">
4104
+ Installed 37 packages in 247ms
4105
+ </div>
4106
+ </div>
4107
+ <div class="cell-artifacts">
4108
+ <h4>Artifacts:</h4>
4109
+ <a href="artifacts/benchmark/activation.jsonl" class="artifact" target="_blank">activation.jsonl</a>
4110
  </div>
4111
  </div>
4112
  </div>
activation/impls/hf_kernels_swiglu.html CHANGED
@@ -706,6 +706,29 @@
706
  white-space: pre-wrap;
707
  color: var(--text-primary);
708
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
709
  .cell-stderr {
710
  background: var(--bg-error);
711
  border-left: 2px solid var(--border-error);
@@ -3556,7 +3579,12 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3556
  if(output){
3557
  output.classList.remove('output-stale');
3558
  let html='';
3559
- if(data.stdout) html+='<div class="cell-stdout">'+escapeHtml(data.stdout)+'</div>';
 
 
 
 
 
3560
  console.log('UV Logs:', data);
3561
  if(data.stderr) {
3562
  // Split UV logs from regular stderr
@@ -3678,27 +3706,27 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3678
  }
3679
  }
3680
 
3681
- // Live reload functionality (robust SSE handling)
3682
- (function(){
3683
- if (!('EventSource' in window)) {
3684
- console.warn('SSE not supported in this browser');
3685
- return;
3686
- }
3687
- let source = new EventSource('/events');
3688
- let isOpen = false;
3689
- source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
3690
- source.onmessage = function(e){
3691
- const msg=(e.data||'').trim(); if(!msg) return;
3692
- console.log('SSE message:', msg);
3693
- if (msg==='reload' || msg==='incremental') { location.reload(); }
3694
- // Ignore 'loading' to avoid premature reload loops
3695
- };
3696
- source.onerror = function(e){
3697
- // Let EventSource auto-reconnect instead of forcing a reload
3698
- if (isOpen) console.warn('SSE error after open, retrying...', e);
3699
- };
3700
- window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
3701
- })();
3702
 
3703
 
3704
  document.addEventListener('DOMContentLoaded', function() {
@@ -3829,7 +3857,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3829
  <div class="system-info">
3830
  <div class="system-info-header">Generated on:</div>
3831
  <div class="system-info-content">
3832
- Linux x86_64 | Linux-5.10.244-240.965.amzn2.x86_64-x86_64-with-glibc2.35
3833
  </div>
3834
  </div>
3835
 
@@ -3843,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3843
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3844
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3845
  </span> |
3846
- Cell: nv | 0.25s
3847
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3848
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3849
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3859,7 +3887,7 @@ Cell: nv | 0.25s
3859
  </div>
3860
  </div>
3861
  <div id="output-nv" class="cell-output">
3862
- <div class="cell-stdout">Wed Oct 22 08:58:23 2025
3863
  +-----------------------------------------------------------------------------------------+
3864
  | NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
3865
  |-----------------------------------------+------------------------+----------------------+
@@ -3868,7 +3896,7 @@ Cell: nv | 0.25s
3868
  | | | MIG M. |
3869
  |=========================================+========================+======================|
3870
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3871
- | N/A 26C P8 22W / 350W | 0MiB / 46068MiB | 0% Default |
3872
  | | | N/A |
3873
  +-----------------------------------------+------------------------+----------------------+
3874
 
@@ -3880,19 +3908,19 @@ Cell: nv | 0.25s
3880
  | No running processes found |
3881
  +-----------------------------------------------------------------------------------------+
3882
 
3883
- </div>
3884
  </div>
3885
  </div>
3886
 
3887
  <h2>SwiGLU Benchmark</h2>
3888
- <div class="cell cell-failed" id="cell-benchmark">
3889
  <div class="cell-header">
3890
  <span class="collapse-indicators">
3891
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3892
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3893
- <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3894
  </span> |
3895
- Cell: benchmark | 0.01s | FAILED
3896
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3897
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3898
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3904,13 +3932,13 @@ Cell: benchmark | 0.01s | FAILED
3904
  <span class="c1"># requires-python = &quot;&gt;=3.10&quot;</span>
3905
  <span class="c1"># dependencies = [</span>
3906
  <span class="c1"># &quot;numpy&quot;,</span>
3907
- <span class="c1"># &quot;torch&quot;,</span>
3908
  <span class="c1"># &quot;kernels-benchmark-tools&quot;,</span>
3909
  <span class="c1"># &quot;kernels&quot;,</span>
3910
  <span class="c1"># ]</span>
3911
  <span class="c1">#</span>
3912
  <span class="c1"># [tool.uv.sources]</span>
3913
- <span class="c1"># kernels-benchmark-tools = { path = &quot;/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools&quot;, editable = true }</span>
3914
  <span class="c1"># ///</span>
3915
  <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
3916
  <span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
@@ -3970,9 +3998,85 @@ Cell: benchmark | 0.01s | FAILED
3970
  </div>
3971
  </div>
3972
  <div id="output-benchmark" class="cell-output">
3973
- <div class="cell-stderr"> × Failed to resolve script requirement
3974
- ╰─▶ Distribution not found at:
3975
- file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3976
  </div>
3977
  </div>
3978
  </div>
 
706
  white-space: pre-wrap;
707
  color: var(--text-primary);
708
  }
709
+
710
+ .cell-stdout {
711
+ background: var(--bg-tertiary);
712
+ padding: 0.75rem;
713
+ border-radius: 1px;
714
+ font-family: inherit;
715
+ font-size: 0.9rem;
716
+ color: var(--text-primary);
717
+
718
+ /* key bits */
719
+ overflow: auto; /* show scrollbars when needed */
720
+ max-width: 100%; /* respects whatever layout width you give it */
721
+ }
722
+
723
+ .cell-stdout .stdout-text {
724
+ margin: 0; /* reset pre default margin */
725
+ white-space: pre; /* keep line breaks, NO wrapping */
726
+ display: inline-block; /* shrink-to-content */
727
+ min-width: max-content; /* allow very long lines to define intrinsic width */
728
+ font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace;
729
+ tab-size: 2;
730
+ }
731
+
732
  .cell-stderr {
733
  background: var(--bg-error);
734
  border-left: 2px solid var(--border-error);
 
3579
  if(output){
3580
  output.classList.remove('output-stale');
3581
  let html='';
3582
+ if (data.stdout) {
3583
+ html += '<div class="cell-stdout"><pre class="stdout-text">'
3584
+ + escapeHtml(data.stdout)
3585
+ + '</pre></div>';
3586
+ }
3587
+
3588
  console.log('UV Logs:', data);
3589
  if(data.stderr) {
3590
  // Split UV logs from regular stderr
 
3706
  }
3707
  }
3708
 
3709
+ // // Live reload functionality (robust SSE handling)
3710
+ // (function(){
3711
+ // if (!('EventSource' in window)) {
3712
+ // console.warn('SSE not supported in this browser');
3713
+ // return;
3714
+ // }
3715
+ // let source = new EventSource('/events');
3716
+ // let isOpen = false;
3717
+ // source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
3718
+ // source.onmessage = function(e){
3719
+ // const msg=(e.data||'').trim(); if(!msg) return;
3720
+ // console.log('SSE message:', msg);
3721
+ // if (msg==='reload' || msg==='incremental') { location.reload(); }
3722
+ // // Ignore 'loading' to avoid premature reload loops
3723
+ // };
3724
+ // source.onerror = function(e){
3725
+ // // Let EventSource auto-reconnect instead of forcing a reload
3726
+ // if (isOpen) console.warn('SSE error after open, retrying...', e);
3727
+ // };
3728
+ // window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
3729
+ // })();
3730
 
3731
 
3732
  document.addEventListener('DOMContentLoaded', function() {
 
3857
  <div class="system-info">
3858
  <div class="system-info-header">Generated on:</div>
3859
  <div class="system-info-content">
3860
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
3861
  </div>
3862
  </div>
3863
 
 
3871
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3872
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3873
  </span> |
3874
+ Cell: nv | 0.23s
3875
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3877
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
 
3887
  </div>
3888
  </div>
3889
  <div id="output-nv" class="cell-output">
3890
+ <div class="cell-stdout"><pre class="stdout-text">Thu Oct 23 17:21:49 2025
3891
  +-----------------------------------------------------------------------------------------+
3892
  | NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
3893
  |-----------------------------------------+------------------------+----------------------+
 
3896
  | | | MIG M. |
3897
  |=========================================+========================+======================|
3898
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3899
+ | N/A 37C P0 80W / 350W | 0MiB / 46068MiB | 13% Default |
3900
  | | | N/A |
3901
  +-----------------------------------------+------------------------+----------------------+
3902
 
 
3908
  | No running processes found |
3909
  +-----------------------------------------------------------------------------------------+
3910
 
3911
+ </pre></div>
3912
  </div>
3913
  </div>
3914
 
3915
  <h2>SwiGLU Benchmark</h2>
3916
+ <div class="cell" id="cell-benchmark">
3917
  <div class="cell-header">
3918
  <span class="collapse-indicators">
3919
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3920
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3921
+ <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3922
  </span> |
3923
+ Cell: benchmark | 4.10s
3924
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3925
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3926
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3932
  <span class="c1"># requires-python = &quot;&gt;=3.10&quot;</span>
3933
  <span class="c1"># dependencies = [</span>
3934
  <span class="c1"># &quot;numpy&quot;,</span>
3935
+ <span class="c1"># &quot;torch==2.8.0&quot;,</span>
3936
  <span class="c1"># &quot;kernels-benchmark-tools&quot;,</span>
3937
  <span class="c1"># &quot;kernels&quot;,</span>
3938
  <span class="c1"># ]</span>
3939
  <span class="c1">#</span>
3940
  <span class="c1"># [tool.uv.sources]</span>
3941
+ <span class="c1"># kernels-benchmark-tools = { path = &quot;../../../../../tools&quot;, editable = true }</span>
3942
  <span class="c1"># ///</span>
3943
  <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
3944
  <span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
 
3998
  </div>
3999
  </div>
4000
  <div id="output-benchmark" class="cell-output">
4001
+ <div class="cell-stdout"><pre class="stdout-text">Running SwiGLU benchmarks on cuda with bfloat16
4002
+ Testing 3 workloads
4003
+
4004
+ ======================================================================
4005
+ PROFILE TRACE: hf_kernels_swiglu | llama_T512_D4096
4006
+ ======================================================================
4007
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4008
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4009
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4010
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 77.600us 379.50% 77.600us 77.600us 1
4011
+ hf_kernels_swiglu 9.39% 165.439us 99.61% 1.754ms 1.754ms 0.000us 0.00% 27.360us 27.360us 1
4012
+ _activation_beeaae6::silu_and_mul 1.24% 21.822us 87.35% 1.539ms 512.861us 20.448us 100.00% 27.360us 9.120us 3
4013
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 20.448us 100.00% 20.448us 6.816us 3
4014
+ Activity Buffer Request 83.94% 1.478ms 83.94% 1.478ms 1.478ms 6.912us 33.80% 6.912us 6.912us 1
4015
+ aten::empty 2.87% 50.462us 2.87% 50.462us 16.821us 0.000us 0.00% 0.000us 0.000us 3
4016
+ cudaLaunchKernel 2.17% 38.291us 2.17% 38.291us 12.764us 0.000us 0.00% 0.000us 0.000us 3
4017
+ cudaDeviceSynchronize 0.39% 6.830us 0.39% 6.830us 6.830us 0.000us 0.00% 0.000us 0.000us 1
4018
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4019
+ Self CPU time total: 1.761ms
4020
+ Self CUDA time total: 20.448us
4021
+
4022
+
4023
+
4024
+ ======================================================================
4025
+ PROFILE TRACE: hf_kernels_swiglu | llama_T512_D8192
4026
+ ======================================================================
4027
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4028
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4029
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4030
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 70.720us 154.22% 70.720us 70.720us 1
4031
+ hf_kernels_swiglu 5.60% 88.845us 99.68% 1.581ms 1.581ms 0.000us 0.00% 69.152us 69.152us 1
4032
+ _activation_beeaae6::silu_and_mul 1.32% 20.881us 92.90% 1.474ms 491.244us 45.856us 100.00% 69.152us 23.051us 3
4033
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 45.856us 100.00% 45.856us 15.285us 3
4034
+ Activity Buffer Request 89.94% 1.427ms 89.94% 1.427ms 1.427ms 23.296us 50.80% 23.296us 23.296us 1
4035
+ aten::empty 1.18% 18.690us 1.18% 18.690us 6.230us 0.000us 0.00% 0.000us 0.000us 3
4036
+ cudaLaunchKernel 1.64% 25.971us 1.64% 25.971us 8.657us 0.000us 0.00% 0.000us 0.000us 3
4037
+ cudaDeviceSynchronize 0.32% 5.141us 0.32% 5.141us 5.141us 0.000us 0.00% 0.000us 0.000us 1
4038
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4039
+ Self CPU time total: 1.586ms
4040
+ Self CUDA time total: 45.856us
4041
+
4042
+
4043
+
4044
+ ======================================================================
4045
+ PROFILE TRACE: hf_kernels_swiglu | llama_T512_D11008
4046
+ ======================================================================
4047
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4048
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4049
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4050
+ hf_kernels_swiglu 5.54% 88.883us 99.68% 1.600ms 1.600ms 0.000us 0.00% 123.326us 123.326us 1
4051
+ _activation_beeaae6::silu_and_mul 1.34% 21.482us 92.90% 1.491ms 497.111us 75.967us 100.00% 123.326us 41.109us 3
4052
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 81.632us 107.46% 81.632us 81.632us 1
4053
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 75.967us 100.00% 75.967us 25.322us 3
4054
+ Activity Buffer Request 89.90% 1.443ms 89.90% 1.443ms 1.443ms 47.359us 62.34% 47.359us 47.359us 1
4055
+ aten::empty 1.25% 19.991us 1.25% 19.991us 6.664us 0.000us 0.00% 0.000us 0.000us 3
4056
+ cudaLaunchKernel 1.65% 26.561us 1.65% 26.561us 8.854us 0.000us 0.00% 0.000us 0.000us 3
4057
+ cudaDeviceSynchronize 0.32% 5.170us 0.32% 5.170us 5.170us 0.000us 0.00% 0.000us 0.000us 1
4058
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4059
+ Self CPU time total: 1.605ms
4060
+ Self CUDA time total: 75.967us
4061
+
4062
+
4063
+ impl wl p50(ms) ok
4064
+ hf_kernels_swiglu llama_T512_D11008 0.03 True
4065
+ hf_kernels_swiglu llama_T512_D4096 0.02 True
4066
+ hf_kernels_swiglu llama_T512_D8192 0.03 True
4067
+ </pre></div>
4068
+ <div class="uv-install-logs" id="uv-logs-benchmark">
4069
+ <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4070
+ <div class="uv-logs-content" style="display: none;">
4071
+ Installed 10 packages in 14ms
4072
+ </div>
4073
+ </div>
4074
+ <div class="cell-stderr">Fetching 7 files: 0%| | 0/7 [00:00&lt;?, ?it/s]
4075
+ Fetching 7 files: 71%|███████▏ | 5/7 [00:00&lt;00:00, 14.56it/s]
4076
+ Fetching 7 files: 100%|██████████| 7/7 [00:00&lt;00:00, 20.37it/s]</div>
4077
+ <div class="cell-artifacts">
4078
+ <h4>Artifacts:</h4>
4079
+ <a href="artifacts/benchmark/activation.jsonl" class="artifact" target="_blank">activation.jsonl</a>
4080
  </div>
4081
  </div>
4082
  </div>
activation/impls/torch_swiglu.html CHANGED
@@ -706,6 +706,29 @@
706
  white-space: pre-wrap;
707
  color: var(--text-primary);
708
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
709
  .cell-stderr {
710
  background: var(--bg-error);
711
  border-left: 2px solid var(--border-error);
@@ -3556,7 +3579,12 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3556
  if(output){
3557
  output.classList.remove('output-stale');
3558
  let html='';
3559
- if(data.stdout) html+='<div class="cell-stdout">'+escapeHtml(data.stdout)+'</div>';
 
 
 
 
 
3560
  console.log('UV Logs:', data);
3561
  if(data.stderr) {
3562
  // Split UV logs from regular stderr
@@ -3678,27 +3706,27 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3678
  }
3679
  }
3680
 
3681
- // Live reload functionality (robust SSE handling)
3682
- (function(){
3683
- if (!('EventSource' in window)) {
3684
- console.warn('SSE not supported in this browser');
3685
- return;
3686
- }
3687
- let source = new EventSource('/events');
3688
- let isOpen = false;
3689
- source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
3690
- source.onmessage = function(e){
3691
- const msg=(e.data||'').trim(); if(!msg) return;
3692
- console.log('SSE message:', msg);
3693
- if (msg==='reload' || msg==='incremental') { location.reload(); }
3694
- // Ignore 'loading' to avoid premature reload loops
3695
- };
3696
- source.onerror = function(e){
3697
- // Let EventSource auto-reconnect instead of forcing a reload
3698
- if (isOpen) console.warn('SSE error after open, retrying...', e);
3699
- };
3700
- window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
3701
- })();
3702
 
3703
 
3704
  document.addEventListener('DOMContentLoaded', function() {
@@ -3829,7 +3857,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3829
  <div class="system-info">
3830
  <div class="system-info-header">Generated on:</div>
3831
  <div class="system-info-content">
3832
- Linux x86_64 | Linux-5.10.244-240.965.amzn2.x86_64-x86_64-with-glibc2.35
3833
  </div>
3834
  </div>
3835
 
@@ -3843,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3843
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3844
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3845
  </span> |
3846
- Cell: nv | 0.25s
3847
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3848
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3849
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3859,7 +3887,7 @@ Cell: nv | 0.25s
3859
  </div>
3860
  </div>
3861
  <div id="output-nv" class="cell-output">
3862
- <div class="cell-stdout">Wed Oct 22 08:58:23 2025
3863
  +-----------------------------------------------------------------------------------------+
3864
  | NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
3865
  |-----------------------------------------+------------------------+----------------------+
@@ -3868,7 +3896,7 @@ Cell: nv | 0.25s
3868
  | | | MIG M. |
3869
  |=========================================+========================+======================|
3870
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3871
- | N/A 26C P8 22W / 350W | 0MiB / 46068MiB | 0% Default |
3872
  | | | N/A |
3873
  +-----------------------------------------+------------------------+----------------------+
3874
 
@@ -3880,19 +3908,19 @@ Cell: nv | 0.25s
3880
  | No running processes found |
3881
  +-----------------------------------------------------------------------------------------+
3882
 
3883
- </div>
3884
  </div>
3885
  </div>
3886
 
3887
  <h2>SwiGLU Benchmark (PyTorch Native)</h2>
3888
- <div class="cell cell-failed" id="cell-benchmark">
3889
  <div class="cell-header">
3890
  <span class="collapse-indicators">
3891
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3892
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3893
  <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3894
  </span> |
3895
- Cell: benchmark | 0.02s | FAILED
3896
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3897
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3898
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3904,12 +3932,12 @@ Cell: benchmark | 0.02s | FAILED
3904
  <span class="c1"># requires-python = &quot;&gt;=3.10&quot;</span>
3905
  <span class="c1"># dependencies = [</span>
3906
  <span class="c1"># &quot;numpy&quot;,</span>
3907
- <span class="c1"># &quot;torch&quot;,</span>
3908
  <span class="c1"># &quot;kernels-benchmark-tools&quot;,</span>
3909
  <span class="c1"># ]</span>
3910
  <span class="c1">#</span>
3911
  <span class="c1"># [tool.uv.sources]</span>
3912
- <span class="c1"># kernels-benchmark-tools = { path = &quot;/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools&quot;, editable = true }</span>
3913
  <span class="c1"># ///</span>
3914
  <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
3915
  <span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
@@ -3966,9 +3994,85 @@ Cell: benchmark | 0.02s | FAILED
3966
  </div>
3967
  </div>
3968
  <div id="output-benchmark" class="cell-output">
3969
- <div class="cell-stderr"> × Failed to resolve script requirement
3970
- ╰─▶ Distribution not found at:
3971
- file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3972
  </div>
3973
  </div>
3974
  </div>
 
706
  white-space: pre-wrap;
707
  color: var(--text-primary);
708
  }
709
+
710
+ .cell-stdout {
711
+ background: var(--bg-tertiary);
712
+ padding: 0.75rem;
713
+ border-radius: 1px;
714
+ font-family: inherit;
715
+ font-size: 0.9rem;
716
+ color: var(--text-primary);
717
+
718
+ /* key bits */
719
+ overflow: auto; /* show scrollbars when needed */
720
+ max-width: 100%; /* respects whatever layout width you give it */
721
+ }
722
+
723
+ .cell-stdout .stdout-text {
724
+ margin: 0; /* reset pre default margin */
725
+ white-space: pre; /* keep line breaks, NO wrapping */
726
+ display: inline-block; /* shrink-to-content */
727
+ min-width: max-content; /* allow very long lines to define intrinsic width */
728
+ font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace;
729
+ tab-size: 2;
730
+ }
731
+
732
  .cell-stderr {
733
  background: var(--bg-error);
734
  border-left: 2px solid var(--border-error);
 
3579
  if(output){
3580
  output.classList.remove('output-stale');
3581
  let html='';
3582
+ if (data.stdout) {
3583
+ html += '<div class="cell-stdout"><pre class="stdout-text">'
3584
+ + escapeHtml(data.stdout)
3585
+ + '</pre></div>';
3586
+ }
3587
+
3588
  console.log('UV Logs:', data);
3589
  if(data.stderr) {
3590
  // Split UV logs from regular stderr
 
3706
  }
3707
  }
3708
 
3709
+ // // Live reload functionality (robust SSE handling)
3710
+ // (function(){
3711
+ // if (!('EventSource' in window)) {
3712
+ // console.warn('SSE not supported in this browser');
3713
+ // return;
3714
+ // }
3715
+ // let source = new EventSource('/events');
3716
+ // let isOpen = false;
3717
+ // source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
3718
+ // source.onmessage = function(e){
3719
+ // const msg=(e.data||'').trim(); if(!msg) return;
3720
+ // console.log('SSE message:', msg);
3721
+ // if (msg==='reload' || msg==='incremental') { location.reload(); }
3722
+ // // Ignore 'loading' to avoid premature reload loops
3723
+ // };
3724
+ // source.onerror = function(e){
3725
+ // // Let EventSource auto-reconnect instead of forcing a reload
3726
+ // if (isOpen) console.warn('SSE error after open, retrying...', e);
3727
+ // };
3728
+ // window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
3729
+ // })();
3730
 
3731
 
3732
  document.addEventListener('DOMContentLoaded', function() {
 
3857
  <div class="system-info">
3858
  <div class="system-info-header">Generated on:</div>
3859
  <div class="system-info-content">
3860
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
3861
  </div>
3862
  </div>
3863
 
 
3871
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3872
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3873
  </span> |
3874
+ Cell: nv | 0.23s
3875
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3877
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
 
3887
  </div>
3888
  </div>
3889
  <div id="output-nv" class="cell-output">
3890
+ <div class="cell-stdout"><pre class="stdout-text">Thu Oct 23 17:21:49 2025
3891
  +-----------------------------------------------------------------------------------------+
3892
  | NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
3893
  |-----------------------------------------+------------------------+----------------------+
 
3896
  | | | MIG M. |
3897
  |=========================================+========================+======================|
3898
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3899
+ | N/A 37C P0 80W / 350W | 0MiB / 46068MiB | 13% Default |
3900
  | | | N/A |
3901
  +-----------------------------------------+------------------------+----------------------+
3902
 
 
3908
  | No running processes found |
3909
  +-----------------------------------------------------------------------------------------+
3910
 
3911
+ </pre></div>
3912
  </div>
3913
  </div>
3914
 
3915
  <h2>SwiGLU Benchmark (PyTorch Native)</h2>
3916
+ <div class="cell" id="cell-benchmark">
3917
  <div class="cell-header">
3918
  <span class="collapse-indicators">
3919
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3920
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3921
  <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3922
  </span> |
3923
+ Cell: benchmark | 3.41s
3924
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3925
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3926
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3932
  <span class="c1"># requires-python = &quot;&gt;=3.10&quot;</span>
3933
  <span class="c1"># dependencies = [</span>
3934
  <span class="c1"># &quot;numpy&quot;,</span>
3935
+ <span class="c1"># &quot;torch==2.8.0&quot;,</span>
3936
  <span class="c1"># &quot;kernels-benchmark-tools&quot;,</span>
3937
  <span class="c1"># ]</span>
3938
  <span class="c1">#</span>
3939
  <span class="c1"># [tool.uv.sources]</span>
3940
+ <span class="c1"># kernels-benchmark-tools = { path = &quot;../../../../../tools&quot;, editable = true }</span>
3941
  <span class="c1"># ///</span>
3942
  <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
3943
  <span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
 
3994
  </div>
3995
  </div>
3996
  <div id="output-benchmark" class="cell-output">
3997
+ <div class="cell-stdout"><pre class="stdout-text">Running SwiGLU benchmarks on cuda with bfloat16
3998
+ Testing 3 workloads
3999
+
4000
+ ======================================================================
4001
+ PROFILE TRACE: torch_swiglu | llama_T512_D4096
4002
+ ======================================================================
4003
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4004
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4005
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4006
+ torch_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 170.400us 513.50% 170.400us 170.400us 1
4007
+ torch_swiglu 10.35% 190.189us 99.61% 1.830ms 1.830ms 0.000us 0.00% 39.104us 39.104us 1
4008
+ aten::silu 3.11% 57.064us 83.35% 1.532ms 510.522us 17.280us 52.07% 23.200us 7.733us 3
4009
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 17.280us 52.07% 17.280us 5.760us 3
4010
+ aten::mul 2.20% 40.433us 3.25% 59.723us 19.908us 15.904us 47.93% 15.904us 5.301us 3
4011
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 15.904us 47.93% 15.904us 5.301us 3
4012
+ Activity Buffer Request 77.87% 1.431ms 77.87% 1.431ms 1.431ms 5.920us 17.84% 5.920us 5.920us 1
4013
+ aten::slice 2.14% 39.352us 2.66% 48.892us 8.149us 0.000us 0.00% 0.000us 0.000us 6
4014
+ aten::as_strided 0.52% 9.540us 0.52% 9.540us 1.590us 0.000us 0.00% 0.000us 0.000us 6
4015
+ cudaLaunchKernel 3.42% 62.871us 3.42% 62.871us 10.479us 0.000us 0.00% 0.000us 0.000us 6
4016
+ cudaDeviceSynchronize 0.39% 7.170us 0.39% 7.170us 7.170us 0.000us 0.00% 0.000us 0.000us 1
4017
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4018
+ Self CPU time total: 1.838ms
4019
+ Self CUDA time total: 33.184us
4020
+
4021
+
4022
+
4023
+ ======================================================================
4024
+ PROFILE TRACE: torch_swiglu | llama_T512_D8192
4025
+ ======================================================================
4026
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4027
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4028
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4029
+ torch_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 144.478us 207.68% 144.478us 144.478us 1
4030
+ torch_swiglu 6.51% 109.976us 99.67% 1.683ms 1.683ms 0.000us 0.00% 87.038us 87.038us 1
4031
+ aten::silu 2.61% 44.013us 89.15% 1.506ms 501.918us 36.351us 52.25% 53.823us 17.941us 3
4032
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 36.351us 52.25% 36.351us 12.117us 3
4033
+ aten::mul 1.57% 26.450us 2.46% 41.521us 13.840us 33.215us 47.75% 33.215us 11.072us 3
4034
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 33.215us 47.75% 33.215us 11.072us 3
4035
+ Activity Buffer Request 84.91% 1.434ms 84.91% 1.434ms 1.434ms 17.472us 25.12% 17.472us 17.472us 1
4036
+ aten::slice 1.23% 20.821us 1.55% 26.141us 4.357us 0.000us 0.00% 0.000us 0.000us 6
4037
+ aten::as_strided 0.31% 5.320us 0.31% 5.320us 0.887us 0.000us 0.00% 0.000us 0.000us 6
4038
+ cudaLaunchKernel 2.52% 42.602us 2.52% 42.602us 7.100us 0.000us 0.00% 0.000us 0.000us 6
4039
+ cudaDeviceSynchronize 0.33% 5.630us 0.33% 5.630us 5.630us 0.000us 0.00% 0.000us 0.000us 1
4040
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4041
+ Self CPU time total: 1.689ms
4042
+ Self CUDA time total: 69.566us
4043
+
4044
+
4045
+
4046
+ ======================================================================
4047
+ PROFILE TRACE: torch_swiglu | llama_T512_D11008
4048
+ ======================================================================
4049
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4050
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4051
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4052
+ torch_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 147.999us 151.09% 147.999us 147.999us 1
4053
+ torch_swiglu 7.64% 131.036us 99.70% 1.710ms 1.710ms 0.000us 0.00% 124.063us 124.063us 1
4054
+ aten::silu 2.56% 43.903us 88.06% 1.510ms 503.475us 50.015us 51.06% 76.127us 25.376us 3
4055
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 50.015us 51.06% 50.015us 16.672us 3
4056
+ aten::mul 1.50% 25.771us 2.43% 41.641us 13.880us 47.936us 48.94% 47.936us 15.979us 3
4057
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 47.936us 48.94% 47.936us 15.979us 3
4058
+ Activity Buffer Request 83.94% 1.440ms 83.94% 1.440ms 1.440ms 26.112us 26.66% 26.112us 26.112us 1
4059
+ aten::slice 1.28% 22.003us 1.58% 27.082us 4.514us 0.000us 0.00% 0.000us 0.000us 6
4060
+ aten::as_strided 0.30% 5.079us 0.30% 5.079us 0.846us 0.000us 0.00% 0.000us 0.000us 6
4061
+ cudaLaunchKernel 2.48% 42.561us 2.48% 42.561us 7.093us 0.000us 0.00% 0.000us 0.000us 6
4062
+ cudaDeviceSynchronize 0.30% 5.120us 0.30% 5.120us 5.120us 0.000us 0.00% 0.000us 0.000us 1
4063
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4064
+ Self CPU time total: 1.715ms
4065
+ Self CUDA time total: 97.951us
4066
+
4067
+
4068
+ impl wl p50(ms) ok
4069
+ torch_swiglu llama_T512_D11008 0.05 True
4070
+ torch_swiglu llama_T512_D4096 0.04 True
4071
+ torch_swiglu llama_T512_D8192 0.05 True
4072
+ </pre></div>
4073
+ <div class="cell-artifacts">
4074
+ <h4>Artifacts:</h4>
4075
+ <a href="artifacts/benchmark/activation.jsonl" class="artifact" target="_blank">activation.jsonl</a>
4076
  </div>
4077
  </div>
4078
  </div>
activation/results/artifacts/combine/latency.svg ADDED

Git LFS Details

  • SHA256: 1d1e9eae17f133adc5891c297d9d75eafd2f519d8bef5ddcb971d9333606511e
  • Pointer size: 130 Bytes
  • Size of remote file: 15.6 kB
activation/results/cells/combine.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # /// script
2
+ # requires-python = ">=3.10"
3
+ # dependencies = [
4
+ # "numpy",
5
+ # "torch==2.8.0",
6
+ # "kernels-benchmark-tools",
7
+ # "matplotlib",
8
+ # ]
9
+ #
10
+ # [tool.uv.sources]
11
+ # kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
12
+ # ///
13
+ from kernels_benchmark_tools.core.visuals import generate_combined_results
14
+
15
+ # Map display names to uvnote environment variables
16
+ cache_env_map = {
17
+ "HF Kernels SwiGLU": "UVNOTE_FILE_HF_KERNELS_SWIGLU_BENCHMARK",
18
+ "PyTorch SwiGLU": "UVNOTE_FILE_TORCH_SWIGLU_BENCHMARK",
19
+ "Compiled SwiGLU": "UVNOTE_FILE_COMPILED_SWIGLU_BENCHMARK",
20
+ }
21
+
22
+ # Generate combined results with visualization
23
+ generate_combined_results(
24
+ cache_env_map=cache_env_map,
25
+ output_filename="activation.jsonl",
26
+ svg_filename="latency.svg"
27
+ )
activation/results/combined_results.html ADDED
The diff for this file is too large to render. See raw diff
 
flash_attn/impls/artifacts/benchmark/attn.jsonl CHANGED
@@ -1,6 +1,6 @@
1
- {"ts": "2025-10-02T16:08:21Z", "run": "4862bb56aac04f66908d2a97924104e2", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.3389439880847931, "p50": 0.3461120128631592, "p90": 0.3461120128631592, "mean": 0.3452928066253662, "reps": 5, "warmup": 2}, "compile_ms": 0.9463679790496826, "peak_bytes": 87425024, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000362396240234375, "mse": 2.9206275939941406e-06, "ref": "sdpa_math_fp32"}, "err": null}
2
- {"ts": "2025-10-02T16:08:21Z", "run": "4862bb56aac04f66908d2a97924104e2", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.40959998965263367, "p50": 0.41280001401901245, "p90": 0.41286399960517883, "mean": 0.41234560012817384, "reps": 5, "warmup": 2}, "compile_ms": 0.34329599142074585, "peak_bytes": 95027200, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.8908252716064453e-06, "ref": "sdpa_math_fp32"}, "err": null}
3
- {"ts": "2025-10-02T16:08:21Z", "run": "4862bb56aac04f66908d2a97924104e2", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.4310399889945984, "p50": 0.4331519901752472, "p90": 0.4362240135669708, "mean": 0.4366208016872406, "reps": 5, "warmup": 2}, "compile_ms": 0.35942399501800537, "peak_bytes": 99680256, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003566741943359375, "mse": 2.8759241104125977e-06, "ref": "sdpa_math_fp32"}, "err": null}
4
- {"ts": "2025-10-02T16:08:21Z", "run": "4862bb56aac04f66908d2a97924104e2", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.4359680116176605, "p50": 0.44361600279808044, "p90": 0.447488009929657, "mean": 0.4450624048709869, "reps": 5, "warmup": 2}, "compile_ms": 0.3678080141544342, "peak_bytes": 104726528, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003604888916015625, "mse": 2.8759241104125977e-06, "ref": "sdpa_math_fp32"}, "err": null}
5
- {"ts": "2025-10-02T16:08:21Z", "run": "4862bb56aac04f66908d2a97924104e2", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.4711039960384369, "p50": 0.47513601183891296, "p90": 0.4763199985027313, "mean": 0.4750400006771088, "reps": 5, "warmup": 2}, "compile_ms": 0.40857601165771484, "peak_bytes": 108855296, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.86102294921875e-06, "ref": "sdpa_math_fp32"}, "err": null}
6
- {"ts": "2025-10-02T16:08:21Z", "run": "4862bb56aac04f66908d2a97924104e2", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.49663999676704407, "p50": 0.4997119903564453, "p90": 0.5038080215454102, "mean": 0.5009407997131348, "reps": 5, "warmup": 2}, "compile_ms": 0.43724799156188965, "peak_bytes": 114425856, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003566741943359375, "mse": 2.8908252716064453e-06, "ref": "sdpa_math_fp32"}, "err": null}
 
1
+ {"ts": "2025-10-23T17:22:14Z", "run": "1b435099e2fc4712a8a21c79100926bd", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.17766900009519304, "p50": 0.1805790000162233, "p90": 0.1809689999845432, "mean": 0.18065700000988727, "iqr": 0.0005199999577598646, "raw_times": [0.17766900009519304, 0.18044900002678332, 0.1836189999266935, 0.1805790000162233, 0.1809689999845432], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.18813999986377894, "peak_bytes": 87425024, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.8014183044433594e-06, "ref": "sdpa_math_fp32"}, "err": null}
2
+ {"ts": "2025-10-23T17:22:14Z", "run": "1b435099e2fc4712a8a21c79100926bd", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2040300000771822, "p50": 0.208629999860932, "p90": 0.20883999991383462, "mean": 0.2071937999517104, "iqr": 0.004771000021719374, "raw_times": [0.208629999860932, 0.2104000000144879, 0.20883999991383462, 0.20406899989211524, 0.2040300000771822], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2113399998506793, "peak_bytes": 95027200, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000347137451171875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
3
+ {"ts": "2025-10-23T17:22:15Z", "run": "1b435099e2fc4712a8a21c79100926bd", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21214000003055844, "p50": 0.22414099998968595, "p90": 0.22725099984199915, "mean": 0.22296499996627972, "iqr": 0.014549999832524918, "raw_times": [0.22414099998968595, 0.21270100000947423, 0.23859199995968083, 0.21214000003055844, 0.22725099984199915], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.215200000184268, "peak_bytes": 99680256, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000347137451171875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
4
+ {"ts": "2025-10-23T17:22:15Z", "run": "1b435099e2fc4712a8a21c79100926bd", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2172510000946204, "p50": 0.21762999995189602, "p90": 0.229150999984995, "mean": 0.22471280003628635, "iqr": 0.011839999842777615, "raw_times": [0.2172510000946204, 0.229150999984995, 0.21762999995189602, 0.24222100000770297, 0.21731100014221738], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22035099982531392, "peak_bytes": 104726528, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
5
+ {"ts": "2025-10-23T17:22:15Z", "run": "1b435099e2fc4712a8a21c79100926bd", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2690430001166533, "p50": 0.2719639999213541, "p90": 0.2809840000281838, "mean": 0.27520160006133665, "iqr": 0.011710999842762249, "raw_times": [0.2719639999213541, 0.26927300018542155, 0.2690430001166533, 0.2809840000281838, 0.2847440000550705], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.26890300000559364, "peak_bytes": 108855296, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
6
+ {"ts": "2025-10-23T17:22:15Z", "run": "1b435099e2fc4712a8a21c79100926bd", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.27566299991121923, "p50": 0.2808829999594309, "p90": 0.29306400006134936, "mean": 0.2846773999863217, "iqr": 0.01699100016594457, "raw_times": [0.29306400006134936, 0.2760729998954048, 0.2808829999594309, 0.29770400010420417, 0.27566299991121923], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2820939998855465, "peak_bytes": 114425856, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
flash_attn/impls/artifacts/benchmark_default/attn_default.jsonl CHANGED
@@ -1,6 +1,6 @@
1
- {"ts": "2025-10-02T16:11:55Z", "run": "e9857dd2d39d4b40a6c91c0fdad82b00", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.3563520014286041, "p50": 0.35942399501800537, "p90": 0.3624959886074066, "mean": 0.3856383919715881, "reps": 5, "warmup": 2}, "compile_ms": 2383.33544921875, "peak_bytes": 87425024, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.771615982055664e-06, "ref": "sdpa_math_fp32"}, "err": null}
2
- {"ts": "2025-10-02T16:11:55Z", "run": "e9857dd2d39d4b40a6c91c0fdad82b00", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.4926080107688904, "p50": 0.49663999676704407, "p90": 0.5017600059509277, "mean": 0.4982912003993988, "reps": 5, "warmup": 2}, "compile_ms": 76.60860443115234, "peak_bytes": 95027200, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003414154052734375, "mse": 2.726912498474121e-06, "ref": "sdpa_math_fp32"}, "err": null}
3
- {"ts": "2025-10-02T16:11:55Z", "run": "e9857dd2d39d4b40a6c91c0fdad82b00", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.5335040092468262, "p50": 0.5366079807281494, "p90": 0.5386239886283875, "mean": 0.5369919896125793, "reps": 5, "warmup": 2}, "compile_ms": 74.49088287353516, "peak_bytes": 99876864, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003414154052734375, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
4
- {"ts": "2025-10-02T16:11:55Z", "run": "e9857dd2d39d4b40a6c91c0fdad82b00", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.5775359869003296, "p50": 0.5868800282478333, "p90": 0.5877760052680969, "mean": 0.5841408014297486, "reps": 5, "warmup": 2}, "compile_ms": 72.97433471679688, "peak_bytes": 104726528, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
5
- {"ts": "2025-10-02T16:11:56Z", "run": "e9857dd2d39d4b40a6c91c0fdad82b00", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.6072319746017456, "p50": 0.6113280057907104, "p90": 0.6144000291824341, "mean": 0.6184704065322876, "reps": 5, "warmup": 2}, "compile_ms": 215.12498474121094, "peak_bytes": 108855296, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
6
- {"ts": "2025-10-02T16:11:56Z", "run": "e9857dd2d39d4b40a6c91c0fdad82b00", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.6399999856948853, "p50": 0.6430720090866089, "p90": 0.6430720090866089, "mean": 0.6428672075271606, "reps": 5, "warmup": 2}, "compile_ms": 71.8028793334961, "peak_bytes": 114425856, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.771615982055664e-06, "ref": "sdpa_math_fp32"}, "err": null}
 
1
+ {"ts": "2025-10-23T17:21:26Z", "run": "ab04a13d0bc147c5aeb6a1c30c52fb98", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.19892000000254484, "p50": 0.20128900018789864, "p90": 0.20218000008753734, "mean": 0.20126180006627692, "iqr": 0.0013400001535046613, "raw_times": [0.19892000000254484, 0.20083999993403268, 0.20128900018789864, 0.2030800001193711, 0.20218000008753734], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2210709999417304, "peak_bytes": 152174592, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.8014183044433594e-06, "ref": "sdpa_math_fp32"}, "err": null}
2
+ {"ts": "2025-10-23T17:21:26Z", "run": "ab04a13d0bc147c5aeb6a1c30c52fb98", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.229150999984995, "p50": 0.22967100017012854, "p90": 0.23078200001691584, "mean": 0.23312540001825255, "iqr": 0.0012210000477352878, "raw_times": [0.23078200001691584, 0.22956099996918056, 0.22967100017012854, 0.2464619999500428, 0.229150999984995], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2712529999371327, "peak_bytes": 163971072, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000347137451171875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
3
+ {"ts": "2025-10-23T17:21:27Z", "run": "ab04a13d0bc147c5aeb6a1c30c52fb98", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2344019999327429, "p50": 0.23504099999627215, "p90": 0.23719199998595286, "mean": 0.23960979997355025, "iqr": 0.0026000000161729986, "raw_times": [0.2568219999830035, 0.23719199998595286, 0.2344019999327429, 0.23459199996977986, 0.23504099999627215], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.24443200004498067, "peak_bytes": 167116800, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000347137451171875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
4
+ {"ts": "2025-10-23T17:21:27Z", "run": "ab04a13d0bc147c5aeb6a1c30c52fb98", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.23659099997530575, "p50": 0.23880100002315885, "p90": 0.23884200004431477, "mean": 0.23843920002946106, "iqr": 0.0014209999790182337, "raw_times": [0.23880100002315885, 0.2405410000392294, 0.23742100006529654, 0.23884200004431477, 0.23659099997530575], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.25097200000345765, "peak_bytes": 169345024, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
5
+ {"ts": "2025-10-23T17:21:27Z", "run": "ab04a13d0bc147c5aeb6a1c30c52fb98", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": null, "compile_ms": null, "peak_bytes": null, "ok": false, "absmax": null, "corr": {}, "err": {"type": "FailOnRecompileLimitHit", "msg": "recompile_limit reached with one_graph=True. Excessive recompilations can degrade performance due to the compilation overhead of each recompilation. To monitor recompilations, enable TORCH_LOGS=recompiles. If recompilations are expected, consider increasing torch._dynamo.config.cache_size_limit to an appropriate value."}}
6
+ {"ts": "2025-10-23T17:21:27Z", "run": "ab04a13d0bc147c5aeb6a1c30c52fb98", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": null, "compile_ms": null, "peak_bytes": null, "ok": false, "absmax": null, "corr": {}, "err": {"type": "FailOnRecompileLimitHit", "msg": "recompile_limit reached with one_graph=True. Excessive recompilations can degrade performance due to the compilation overhead of each recompilation. To monitor recompilations, enable TORCH_LOGS=recompiles. If recompilations are expected, consider increasing torch._dynamo.config.cache_size_limit to an appropriate value."}}
flash_attn/impls/artifacts/benchmark_max_autotune/attn_max_autotune.jsonl CHANGED
@@ -1,6 +1,6 @@
1
- {"ts": "2025-10-02T16:11:08Z", "run": "06f2face3c924e1b89a35a0fb568d4b1", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.3665919899940491, "p50": 0.3768320083618164, "p90": 0.41171199083328247, "mean": 0.40020479559898375, "reps": 5, "warmup": 2}, "compile_ms": 2910.97705078125, "peak_bytes": 85722112, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.771615982055664e-06, "ref": "sdpa_math_fp32"}, "err": null}
2
- {"ts": "2025-10-02T16:11:08Z", "run": "06f2face3c924e1b89a35a0fb568d4b1", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.5160959959030151, "p50": 0.5489599704742432, "p90": 0.5631359815597534, "mean": 0.5535807967185974, "reps": 5, "warmup": 2}, "compile_ms": 85.84806060791016, "peak_bytes": 97387520, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003414154052734375, "mse": 2.726912498474121e-06, "ref": "sdpa_math_fp32"}, "err": null}
3
- {"ts": "2025-10-02T16:11:09Z", "run": "06f2face3c924e1b89a35a0fb568d4b1", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.562175989151001, "p50": 0.6144000291824341, "p90": 0.6318079829216003, "mean": 0.6143999934196472, "reps": 5, "warmup": 2}, "compile_ms": 82.77401733398438, "peak_bytes": 99746816, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003414154052734375, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
4
- {"ts": "2025-10-02T16:11:09Z", "run": "06f2face3c924e1b89a35a0fb568d4b1", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.6512640118598938, "p50": 0.6584320068359375, "p90": 0.6799359917640686, "mean": 0.6754495978355408, "reps": 5, "warmup": 2}, "compile_ms": 81.94969940185547, "peak_bytes": 101843968, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
5
- {"ts": "2025-10-02T16:11:09Z", "run": "06f2face3c924e1b89a35a0fb568d4b1", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.6973119974136353, "p50": 0.7014080286026001, "p90": 0.7229440212249756, "mean": 0.7210752129554748, "reps": 5, "warmup": 2}, "compile_ms": 81.1141128540039, "peak_bytes": 103810048, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
6
- {"ts": "2025-10-02T16:11:10Z", "run": "06f2face3c924e1b89a35a0fb568d4b1", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.7485439777374268, "p50": 0.7557439804077148, "p90": 0.7710719704627991, "mean": 0.7735359907150269, "reps": 5, "warmup": 2}, "compile_ms": 767.1397094726562, "peak_bytes": 106562560, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.771615982055664e-06, "ref": "sdpa_math_fp32"}, "err": null}
 
1
+ {"ts": "2025-10-23T17:21:09Z", "run": "328ee5954ea34c76830340b8d2ddded5", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.18720899993240891, "p50": 0.19000999986928946, "p90": 0.1910489997953846, "mean": 0.1901993999581464, "iqr": 0.002129999757016776, "raw_times": [0.19381000015528116, 0.1889190000383678, 0.1910489997953846, 0.19000999986928946, 0.18720899993240891], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.3206559999853198, "peak_bytes": 143131648, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.8014183044433594e-06, "ref": "sdpa_math_fp32"}, "err": null}
2
+ {"ts": "2025-10-23T17:21:11Z", "run": "328ee5954ea34c76830340b8d2ddded5", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.1962300000286632, "p50": 0.19820000011350203, "p90": 0.20246899998710433, "mean": 0.19939980002163793, "iqr": 0.00514900011694408, "raw_times": [0.19731999987016025, 0.19820000011350203, 0.20246899998710433, 0.1962300000286632, 0.20278000010875985], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.30226499984564725, "peak_bytes": 147850240, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000347137451171875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
3
+ {"ts": "2025-10-23T17:21:13Z", "run": "328ee5954ea34c76830340b8d2ddded5", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2074609999453969, "p50": 0.20947000007254246, "p90": 0.21126000001459033, "mean": 0.2095743999689148, "iqr": 0.0030890000743966084, "raw_times": [0.2115099998718506, 0.21126000001459033, 0.20947000007254246, 0.2074609999453969, 0.20817099994019372], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.31840599990573537, "peak_bytes": 150209536, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000347137451171875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
4
+ {"ts": "2025-10-23T17:21:15Z", "run": "328ee5954ea34c76830340b8d2ddded5", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21225999989837874, "p50": 0.21317000005183218, "p90": 0.21427999990919488, "mean": 0.21412619998955051, "iqr": 0.0015599998732795939, "raw_times": [0.21225999989837874, 0.21317000005183218, 0.21427999990919488, 0.2127200000359153, 0.21820100005243148], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.3197160001491284, "peak_bytes": 152568832, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
5
+ {"ts": "2025-10-23T17:21:15Z", "run": "328ee5954ea34c76830340b8d2ddded5", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": null, "compile_ms": null, "peak_bytes": null, "ok": false, "absmax": null, "corr": {}, "err": {"type": "FailOnRecompileLimitHit", "msg": "recompile_limit reached with one_graph=True. Excessive recompilations can degrade performance due to the compilation overhead of each recompilation. To monitor recompilations, enable TORCH_LOGS=recompiles. If recompilations are expected, consider increasing torch._dynamo.config.cache_size_limit to an appropriate value."}}
6
+ {"ts": "2025-10-23T17:21:15Z", "run": "328ee5954ea34c76830340b8d2ddded5", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": null, "compile_ms": null, "peak_bytes": null, "ok": false, "absmax": null, "corr": {}, "err": {"type": "FailOnRecompileLimitHit", "msg": "recompile_limit reached with one_graph=True. Excessive recompilations can degrade performance due to the compilation overhead of each recompilation. To monitor recompilations, enable TORCH_LOGS=recompiles. If recompilations are expected, consider increasing torch._dynamo.config.cache_size_limit to an appropriate value."}}
flash_attn/impls/cells/benchmark.py CHANGED
@@ -2,38 +2,36 @@
2
  # requires-python = ">=3.10"
3
  # dependencies = [
4
  # "numpy",
5
- # "torch",
6
  # "kernels-benchmark-tools",
7
- # "xformers",
8
  # ]
9
  #
10
  # [tool.uv.sources]
11
- # kernels-benchmark-tools = { path = "/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools", editable = true }
12
  # ///
13
  import torch
14
  import sys
15
  import os
16
  import kernels_benchmark_tools as kbt
17
- import xformers.ops as xops
18
 
19
 
20
- def xformers_attention(q, k, v):
21
- """xFormers memory efficient attention"""
22
- # xFormers expects [batch, seq_len, heads, head_dim]
23
- return xops.memory_efficient_attention(q, k, v)
24
-
25
 
26
  kbt.add(
27
- "xformers_meff",
28
- xformers_attention,
29
- tags={"family": "xformers", "backend": "memory_efficient", "compile": "none"},
30
  )
31
 
32
  if __name__ == "__main__":
33
  device = "cuda" if torch.cuda.is_available() else "cpu"
34
  dtype = "float32" if device == "cpu" else "bfloat16"
35
 
36
- # Flux-like workloads
37
  base = 1024 if device == "cuda" else 512
38
  flux_sizes = (
39
  [128, 256, 320, 384, 448, 512] if device == "cuda" else [64, 128, 192, 256]
 
2
  # requires-python = ">=3.10"
3
  # dependencies = [
4
  # "numpy",
5
+ # "torch==2.8.0",
6
  # "kernels-benchmark-tools",
 
7
  # ]
8
  #
9
  # [tool.uv.sources]
10
+ # kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
11
  # ///
12
  import torch
13
  import sys
14
  import os
15
  import kernels_benchmark_tools as kbt
 
16
 
17
 
18
+ def torch_flash(q, k, v):
19
+ qt, kt, vt = (x.transpose(1, 2).contiguous() for x in (q, k, v))
20
+ with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.FLASH_ATTENTION):
21
+ o = torch.nn.functional.scaled_dot_product_attention(qt, kt, vt)
22
+ return o.transpose(1, 2).contiguous()
23
 
24
  kbt.add(
25
+ "torch_flash_ma",
26
+ torch_flash,
27
+ tags={"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"},
28
  )
29
 
30
  if __name__ == "__main__":
31
  device = "cuda" if torch.cuda.is_available() else "cpu"
32
  dtype = "float32" if device == "cpu" else "bfloat16"
33
 
34
+ # Flux-like workloads scaled down for CPU testing
35
  base = 1024 if device == "cuda" else 512
36
  flux_sizes = (
37
  [128, 256, 320, 384, 448, 512] if device == "cuda" else [64, 128, 192, 256]
flash_attn/impls/cells/benchmark_default.py CHANGED
@@ -2,12 +2,12 @@
2
  # requires-python = ">=3.10"
3
  # dependencies = [
4
  # "numpy",
5
- # "torch",
6
  # "kernels-benchmark-tools",
7
  # ]
8
  #
9
  # [tool.uv.sources]
10
- # kernels-benchmark-tools = { path = "/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools", editable = true }
11
  # ///
12
  import torch
13
  import sys
 
2
  # requires-python = ">=3.10"
3
  # dependencies = [
4
  # "numpy",
5
+ # "torch==2.8.0",
6
  # "kernels-benchmark-tools",
7
  # ]
8
  #
9
  # [tool.uv.sources]
10
+ # kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
11
  # ///
12
  import torch
13
  import sys
flash_attn/impls/cells/benchmark_max_autotune.py CHANGED
@@ -2,12 +2,12 @@
2
  # requires-python = ">=3.10"
3
  # dependencies = [
4
  # "numpy",
5
- # "torch",
6
  # "kernels-benchmark-tools",
7
  # ]
8
  #
9
  # [tool.uv.sources]
10
- # kernels-benchmark-tools = { git = "https://github.com/drbh/kernels-benchmark-tools.git", branch = "main" }
11
  # ///
12
  import torch
13
  import sys
 
2
  # requires-python = ">=3.10"
3
  # dependencies = [
4
  # "numpy",
5
+ # "torch==2.8.0",
6
  # "kernels-benchmark-tools",
7
  # ]
8
  #
9
  # [tool.uv.sources]
10
+ # kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
11
  # ///
12
  import torch
13
  import sys
flash_attn/impls/compiled_variants.html CHANGED
@@ -706,6 +706,29 @@
706
  white-space: pre-wrap;
707
  color: var(--text-primary);
708
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
709
  .cell-stderr {
710
  background: var(--bg-error);
711
  border-left: 2px solid var(--border-error);
@@ -3556,7 +3579,12 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3556
  if(output){
3557
  output.classList.remove('output-stale');
3558
  let html='';
3559
- if(data.stdout) html+='<div class="cell-stdout">'+escapeHtml(data.stdout)+'</div>';
 
 
 
 
 
3560
  console.log('UV Logs:', data);
3561
  if(data.stderr) {
3562
  // Split UV logs from regular stderr
@@ -3678,27 +3706,27 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3678
  }
3679
  }
3680
 
3681
- // Live reload functionality (robust SSE handling)
3682
- (function(){
3683
- if (!('EventSource' in window)) {
3684
- console.warn('SSE not supported in this browser');
3685
- return;
3686
- }
3687
- let source = new EventSource('/events');
3688
- let isOpen = false;
3689
- source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
3690
- source.onmessage = function(e){
3691
- const msg=(e.data||'').trim(); if(!msg) return;
3692
- console.log('SSE message:', msg);
3693
- if (msg==='reload' || msg==='incremental') { location.reload(); }
3694
- // Ignore 'loading' to avoid premature reload loops
3695
- };
3696
- source.onerror = function(e){
3697
- // Let EventSource auto-reconnect instead of forcing a reload
3698
- if (isOpen) console.warn('SSE error after open, retrying...', e);
3699
- };
3700
- window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
3701
- })();
3702
 
3703
 
3704
  document.addEventListener('DOMContentLoaded', function() {
@@ -3829,7 +3857,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3829
  <div class="system-info">
3830
  <div class="system-info-header">Generated on:</div>
3831
  <div class="system-info-content">
3832
- Linux x86_64 | Linux-5.10.244-240.965.amzn2.x86_64-x86_64-with-glibc2.35
3833
  </div>
3834
  </div>
3835
 
@@ -3837,14 +3865,14 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3837
  <h1>Torch Compile Variants!</h1>
3838
  <p>This file benchmarks Flash Attention with different torch.compile modes.</p>
3839
  <h2>Flash Attention with torch.compile(mode="default")</h2>
3840
- <div class="cell cell-failed" id="cell-benchmark_default">
3841
  <div class="cell-header">
3842
  <span class="collapse-indicators">
3843
  <span onclick="toggleCode('benchmark_default')" style="cursor: pointer;">▼ code</span>
3844
  <span onclick="toggleOutput('benchmark_default')" style="cursor: pointer;">▼ output</span>
3845
- <span id="uv-indicator-benchmark_default" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3846
  </span> |
3847
- Cell: benchmark_default | 0.02s | FAILED
3848
  | <button class="run-btn" onclick="runCell('benchmark_default')">▶ run</button>
3849
  <button class="copy-btn" onclick="copyCell('benchmark_default')">Copy</button>
3850
  <a href="cells/benchmark_default.py" target="_blank" class="raw-btn">Raw</a>
@@ -3856,12 +3884,12 @@ Cell: benchmark_default | 0.02s | FAILED
3856
  <span class="c1"># requires-python = &quot;&gt;=3.10&quot;</span>
3857
  <span class="c1"># dependencies = [</span>
3858
  <span class="c1"># &quot;numpy&quot;,</span>
3859
- <span class="c1"># &quot;torch&quot;,</span>
3860
  <span class="c1"># &quot;kernels-benchmark-tools&quot;,</span>
3861
  <span class="c1"># ]</span>
3862
  <span class="c1">#</span>
3863
  <span class="c1"># [tool.uv.sources]</span>
3864
- <span class="c1"># kernels-benchmark-tools = { path = &quot;/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools&quot;, editable = true }</span>
3865
  <span class="c1"># ///</span>
3866
  <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
3867
  <span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
@@ -3929,14 +3957,291 @@ Cell: benchmark_default | 0.02s | FAILED
3929
  </div>
3930
  </div>
3931
  <div id="output-benchmark_default" class="cell-output">
3932
- <div class="cell-stderr"> × Failed to resolve script requirement
3933
- ╰─▶ Distribution not found at:
3934
- file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3935
  </div>
3936
  </div>
3937
  </div>
3938
 
3939
  <h2>Flash Attention with torch.compile(mode="max-autotune")</h2>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3940
  </div>
3941
 
3942
  </body>
 
706
  white-space: pre-wrap;
707
  color: var(--text-primary);
708
  }
709
+
710
+ .cell-stdout {
711
+ background: var(--bg-tertiary);
712
+ padding: 0.75rem;
713
+ border-radius: 1px;
714
+ font-family: inherit;
715
+ font-size: 0.9rem;
716
+ color: var(--text-primary);
717
+
718
+ /* key bits */
719
+ overflow: auto; /* show scrollbars when needed */
720
+ max-width: 100%; /* respects whatever layout width you give it */
721
+ }
722
+
723
+ .cell-stdout .stdout-text {
724
+ margin: 0; /* reset pre default margin */
725
+ white-space: pre; /* keep line breaks, NO wrapping */
726
+ display: inline-block; /* shrink-to-content */
727
+ min-width: max-content; /* allow very long lines to define intrinsic width */
728
+ font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace;
729
+ tab-size: 2;
730
+ }
731
+
732
  .cell-stderr {
733
  background: var(--bg-error);
734
  border-left: 2px solid var(--border-error);
 
3579
  if(output){
3580
  output.classList.remove('output-stale');
3581
  let html='';
3582
+ if (data.stdout) {
3583
+ html += '<div class="cell-stdout"><pre class="stdout-text">'
3584
+ + escapeHtml(data.stdout)
3585
+ + '</pre></div>';
3586
+ }
3587
+
3588
  console.log('UV Logs:', data);
3589
  if(data.stderr) {
3590
  // Split UV logs from regular stderr
 
3706
  }
3707
  }
3708
 
3709
+ // // Live reload functionality (robust SSE handling)
3710
+ // (function(){
3711
+ // if (!('EventSource' in window)) {
3712
+ // console.warn('SSE not supported in this browser');
3713
+ // return;
3714
+ // }
3715
+ // let source = new EventSource('/events');
3716
+ // let isOpen = false;
3717
+ // source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
3718
+ // source.onmessage = function(e){
3719
+ // const msg=(e.data||'').trim(); if(!msg) return;
3720
+ // console.log('SSE message:', msg);
3721
+ // if (msg==='reload' || msg==='incremental') { location.reload(); }
3722
+ // // Ignore 'loading' to avoid premature reload loops
3723
+ // };
3724
+ // source.onerror = function(e){
3725
+ // // Let EventSource auto-reconnect instead of forcing a reload
3726
+ // if (isOpen) console.warn('SSE error after open, retrying...', e);
3727
+ // };
3728
+ // window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
3729
+ // })();
3730
 
3731
 
3732
  document.addEventListener('DOMContentLoaded', function() {
 
3857
  <div class="system-info">
3858
  <div class="system-info-header">Generated on:</div>
3859
  <div class="system-info-content">
3860
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
3861
  </div>
3862
  </div>
3863
 
 
3865
  <h1>Torch Compile Variants!</h1>
3866
  <p>This file benchmarks Flash Attention with different torch.compile modes.</p>
3867
  <h2>Flash Attention with torch.compile(mode="default")</h2>
3868
+ <div class="cell" id="cell-benchmark_default">
3869
  <div class="cell-header">
3870
  <span class="collapse-indicators">
3871
  <span onclick="toggleCode('benchmark_default')" style="cursor: pointer;">▼ code</span>
3872
  <span onclick="toggleOutput('benchmark_default')" style="cursor: pointer;">▼ output</span>
3873
+ <span id="uv-indicator-benchmark_default" onclick="toggleUvLogsFromHeader('benchmark_default')" style="cursor: pointer;">▶ uv-logs</span>
3874
  </span> |
3875
+ Cell: benchmark_default | 12.08s
3876
  | <button class="run-btn" onclick="runCell('benchmark_default')">▶ run</button>
3877
  <button class="copy-btn" onclick="copyCell('benchmark_default')">Copy</button>
3878
  <a href="cells/benchmark_default.py" target="_blank" class="raw-btn">Raw</a>
 
3884
  <span class="c1"># requires-python = &quot;&gt;=3.10&quot;</span>
3885
  <span class="c1"># dependencies = [</span>
3886
  <span class="c1"># &quot;numpy&quot;,</span>
3887
+ <span class="c1"># &quot;torch==2.8.0&quot;,</span>
3888
  <span class="c1"># &quot;kernels-benchmark-tools&quot;,</span>
3889
  <span class="c1"># ]</span>
3890
  <span class="c1">#</span>
3891
  <span class="c1"># [tool.uv.sources]</span>
3892
+ <span class="c1"># kernels-benchmark-tools = { path = &quot;../../../../../tools&quot;, editable = true }</span>
3893
  <span class="c1"># ///</span>
3894
  <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
3895
  <span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
 
3957
  </div>
3958
  </div>
3959
  <div id="output-benchmark_default" class="cell-output">
3960
+ <div class="cell-stdout"><pre class="stdout-text">
3961
+ ======================================================================
3962
+ PROFILE TRACE: torch_flash_compiled_default | flux_L128
3963
+ ======================================================================
3964
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3965
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3966
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3967
+ torch_flash_compiled_default 0.00% 0.000us 0.00% 0.000us 0.000us 967.332us 298.12% 967.332us 967.332us 1
3968
+ torch_flash_compiled_default 5.37% 154.798us 99.77% 2.878ms 2.878ms 0.000us 0.00% 324.481us 324.481us 1
3969
+ Torch-Compiled Region: 0/1 20.96% 604.478us 92.49% 2.668ms 889.236us 0.000us 0.00% 324.481us 108.160us 3
3970
+ aten::_scaled_dot_product_flash_attention 1.54% 44.432us 8.35% 240.853us 80.284us 0.000us 0.00% 276.257us 92.086us 3
3971
+ aten::_flash_attention_forward 1.64% 47.371us 5.29% 152.657us 50.886us 276.257us 85.14% 276.257us 92.086us 3
3972
+ void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 276.257us 85.14% 276.257us 92.086us 3
3973
+ triton_poi_fused__scaled_dot_product_flash_attention... 3.50% 100.807us 6.04% 174.309us 19.368us 36.704us 11.31% 36.704us 4.078us 9
3974
+ triton_poi_fused__scaled_dot_product_flash_attention... 0.00% 0.000us 0.00% 0.000us 0.000us 36.704us 11.31% 36.704us 4.078us 9
3975
+ triton_poi_fused_clone_1 1.27% 36.672us 2.17% 62.583us 20.861us 11.520us 3.55% 11.520us 3.840us 3
3976
+ triton_poi_fused_clone_1 0.00% 0.000us 0.00% 0.000us 0.000us 11.520us 3.55% 11.520us 3.840us 3
3977
+ TorchDynamo Cache Lookup 1.91% 55.093us 1.91% 55.093us 18.364us 0.000us 0.00% 0.000us 0.000us 3
3978
+ Pregraph bytecode 0.36% 10.400us 0.36% 10.400us 3.467us 0.000us 0.00% 0.000us 0.000us 3
3979
+ AOTDispatcher Runtime Wrapper Prologue 0.70% 20.280us 0.70% 20.280us 6.760us 0.000us 0.00% 0.000us 0.000us 3
3980
+ Activity Buffer Request 53.91% 1.555ms 53.91% 1.555ms 1.555ms 0.000us 0.00% 0.000us 0.000us 1
3981
+ cuLaunchKernel 3.45% 99.413us 3.45% 99.413us 8.284us 0.000us 0.00% 0.000us 0.000us 12
3982
+ aten::transpose 1.19% 34.395us 1.52% 43.764us 3.647us 0.000us 0.00% 0.000us 0.000us 12
3983
+ aten::as_strided 0.32% 9.369us 0.32% 9.369us 0.781us 0.000us 0.00% 0.000us 0.000us 12
3984
+ aten::empty_like 0.44% 12.621us 1.20% 34.732us 11.577us 0.000us 0.00% 0.000us 0.000us 3
3985
+ aten::empty_strided 0.77% 22.111us 0.77% 22.111us 7.370us 0.000us 0.00% 0.000us 0.000us 3
3986
+ aten::empty 1.24% 35.841us 1.24% 35.841us 2.987us 0.000us 0.00% 0.000us 0.000us 12
3987
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3988
+ Self CPU time total: 2.884ms
3989
+ Self CUDA time total: 324.481us
3990
+
3991
+
3992
+
3993
+ ======================================================================
3994
+ PROFILE TRACE: torch_flash_compiled_default | flux_L256
3995
+ ======================================================================
3996
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3997
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3998
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3999
+ torch_flash_compiled_default 0.00% 0.000us 0.00% 0.000us 0.000us 834.378us 233.60% 834.378us 834.378us 1
4000
+ torch_flash_compiled_default 4.04% 97.294us 99.68% 2.400ms 2.400ms 0.000us 0.00% 357.190us 357.190us 1
4001
+ Torch-Compiled Region: 0/3 19.97% 480.803us 94.43% 2.274ms 757.987us 0.000us 0.00% 357.190us 119.063us 3
4002
+ aten::_scaled_dot_product_flash_attention 1.08% 25.983us 7.33% 176.640us 58.880us 0.000us 0.00% 300.165us 100.055us 3
4003
+ aten::_flash_attention_forward 1.50% 36.164us 5.01% 120.717us 40.239us 300.165us 84.04% 300.165us 100.055us 3
4004
+ void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 300.165us 84.04% 300.165us 100.055us 3
4005
+ triton_poi_fused__scaled_dot_product_flash_attention... 3.30% 79.496us 6.27% 150.937us 16.771us 40.161us 11.24% 40.161us 4.462us 9
4006
+ triton_poi_fused__scaled_dot_product_flash_attention... 0.00% 0.000us 0.00% 0.000us 0.000us 40.161us 11.24% 40.161us 4.462us 9
4007
+ triton_poi_fused_clone_1 2.33% 56.123us 3.38% 81.404us 27.135us 16.864us 4.72% 16.864us 5.621us 3
4008
+ triton_poi_fused_clone_1 0.00% 0.000us 0.00% 0.000us 0.000us 16.864us 4.72% 16.864us 5.621us 3
4009
+ TorchDynamo Cache Lookup 1.21% 29.133us 1.21% 29.133us 9.711us 0.000us 0.00% 0.000us 0.000us 3
4010
+ Pregraph bytecode 0.32% 7.730us 0.32% 7.730us 2.577us 0.000us 0.00% 0.000us 0.000us 3
4011
+ AOTDispatcher Runtime Wrapper Prologue 0.49% 11.750us 0.49% 11.750us 3.917us 0.000us 0.00% 0.000us 0.000us 3
4012
+ Activity Buffer Request 56.67% 1.365ms 56.67% 1.365ms 1.365ms 0.000us 0.00% 0.000us 0.000us 1
4013
+ cuLaunchKernel 4.02% 96.722us 4.02% 96.722us 8.060us 0.000us 0.00% 0.000us 0.000us 12
4014
+ aten::transpose 0.90% 21.580us 1.24% 29.940us 2.495us 0.000us 0.00% 0.000us 0.000us 12
4015
+ aten::as_strided 0.35% 8.360us 0.35% 8.360us 0.697us 0.000us 0.00% 0.000us 0.000us 12
4016
+ aten::empty_like 0.27% 6.480us 1.00% 23.971us 7.990us 0.000us 0.00% 0.000us 0.000us 3
4017
+ aten::empty_strided 0.73% 17.491us 0.73% 17.491us 5.830us 0.000us 0.00% 0.000us 0.000us 3
4018
+ aten::empty 1.24% 29.800us 1.24% 29.800us 2.483us 0.000us 0.00% 0.000us 0.000us 12
4019
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4020
+ Self CPU time total: 2.408ms
4021
+ Self CUDA time total: 357.190us
4022
+
4023
+
4024
+
4025
+ ======================================================================
4026
+ PROFILE TRACE: torch_flash_compiled_default | flux_L320
4027
+ ======================================================================
4028
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4029
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4030
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4031
+ torch_flash_compiled_default 0.00% 0.000us 0.00% 0.000us 0.000us 876.295us 230.02% 876.295us 876.295us 1
4032
+ torch_flash_compiled_default 3.99% 99.235us 99.67% 2.477ms 2.477ms 0.000us 0.00% 380.963us 380.963us 1
4033
+ Torch-Compiled Region: 0/5 19.71% 489.623us 94.50% 2.348ms 782.708us 0.000us 0.00% 380.963us 126.988us 3
4034
+ aten::_scaled_dot_product_flash_attention 1.15% 28.583us 7.58% 188.458us 62.819us 0.000us 0.00% 323.107us 107.702us 3
4035
+ aten::_flash_attention_forward 1.61% 40.110us 5.06% 125.615us 41.872us 323.107us 84.81% 323.107us 107.702us 3
4036
+ void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 323.107us 84.81% 323.107us 107.702us 3
4037
+ triton_poi_fused__scaled_dot_product_flash_attention... 3.47% 86.344us 6.19% 153.807us 17.090us 44.448us 11.67% 44.448us 4.939us 9
4038
+ triton_poi_fused__scaled_dot_product_flash_attention... 0.00% 0.000us 0.00% 0.000us 0.000us 44.448us 11.67% 44.448us 4.939us 9
4039
+ triton_poi_fused_clone_1 1.44% 35.902us 2.40% 59.634us 19.878us 13.408us 3.52% 13.408us 4.469us 3
4040
+ triton_poi_fused_clone_1 0.00% 0.000us 0.00% 0.000us 0.000us 13.408us 3.52% 13.408us 4.469us 3
4041
+ TorchDynamo Cache Lookup 1.18% 29.223us 1.18% 29.223us 9.741us 0.000us 0.00% 0.000us 0.000us 3
4042
+ Pregraph bytecode 0.30% 7.450us 0.30% 7.450us 2.483us 0.000us 0.00% 0.000us 0.000us 3
4043
+ AOTDispatcher Runtime Wrapper Prologue 0.46% 11.502us 0.46% 11.502us 3.834us 0.000us 0.00% 0.000us 0.000us 3
4044
+ Activity Buffer Request 57.86% 1.438ms 57.86% 1.438ms 1.438ms 0.000us 0.00% 0.000us 0.000us 1
4045
+ cuLaunchKernel 3.67% 91.195us 3.67% 91.195us 7.600us 0.000us 0.00% 0.000us 0.000us 12
4046
+ aten::transpose 0.95% 23.681us 1.38% 34.260us 2.855us 0.000us 0.00% 0.000us 0.000us 12
4047
+ aten::as_strided 0.43% 10.579us 0.43% 10.579us 0.882us 0.000us 0.00% 0.000us 0.000us 12
4048
+ aten::empty_like 0.27% 6.811us 0.93% 23.051us 7.684us 0.000us 0.00% 0.000us 0.000us 3
4049
+ aten::empty_strided 0.65% 16.240us 0.65% 16.240us 5.413us 0.000us 0.00% 0.000us 0.000us 3
4050
+ aten::empty 1.30% 32.232us 1.30% 32.232us 2.686us 0.000us 0.00% 0.000us 0.000us 12
4051
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4052
+ Self CPU time total: 2.485ms
4053
+ Self CUDA time total: 380.963us
4054
+
4055
+
4056
+
4057
+ ======================================================================
4058
+ PROFILE TRACE: torch_flash_compiled_default | flux_L384
4059
+ ======================================================================
4060
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4061
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4062
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4063
+ torch_flash_compiled_default 0.00% 0.000us 0.00% 0.000us 0.000us 900.385us 224.95% 900.385us 900.385us 1
4064
+ torch_flash_compiled_default 3.56% 101.756us 99.74% 2.848ms 2.848ms 0.000us 0.00% 400.258us 400.258us 1
4065
+ Torch-Compiled Region: 0/7 18.27% 521.655us 95.19% 2.718ms 906.103us 0.000us 0.00% 400.258us 133.419us 3
4066
+ aten::_scaled_dot_product_flash_attention 0.99% 28.253us 6.33% 180.729us 60.243us 0.000us 0.00% 336.352us 112.117us 3
4067
+ aten::_flash_attention_forward 1.29% 36.890us 4.19% 119.565us 39.855us 336.352us 84.03% 336.352us 112.117us 3
4068
+ void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 336.352us 84.03% 336.352us 112.117us 3
4069
+ triton_poi_fused__scaled_dot_product_flash_attention... 3.07% 87.777us 16.12% 460.302us 51.145us 49.985us 12.49% 49.985us 5.554us 9
4070
+ triton_poi_fused__scaled_dot_product_flash_attention... 0.00% 0.000us 0.00% 0.000us 0.000us 49.985us 12.49% 49.985us 5.554us 9
4071
+ triton_poi_fused_clone_1 1.24% 35.330us 2.05% 58.492us 19.497us 13.921us 3.48% 13.921us 4.640us 3
4072
+ triton_poi_fused_clone_1 0.00% 0.000us 0.00% 0.000us 0.000us 13.921us 3.48% 13.921us 4.640us 3
4073
+ TorchDynamo Cache Lookup 0.99% 28.213us 0.99% 28.213us 9.404us 0.000us 0.00% 0.000us 0.000us 3
4074
+ Pregraph bytecode 0.25% 7.170us 0.25% 7.170us 2.390us 0.000us 0.00% 0.000us 0.000us 3
4075
+ AOTDispatcher Runtime Wrapper Prologue 0.43% 12.361us 0.43% 12.361us 4.120us 0.000us 0.00% 0.000us 0.000us 3
4076
+ Activity Buffer Request 51.74% 1.478ms 51.74% 1.478ms 1.478ms 0.000us 0.00% 0.000us 0.000us 1
4077
+ cuLaunchKernel 13.86% 395.687us 13.86% 395.687us 32.974us 0.000us 0.00% 0.000us 0.000us 12
4078
+ aten::transpose 0.83% 23.691us 1.15% 32.911us 2.743us 0.000us 0.00% 0.000us 0.000us 12
4079
+ aten::as_strided 0.32% 9.220us 0.32% 9.220us 0.768us 0.000us 0.00% 0.000us 0.000us 12
4080
+ aten::empty_like 0.23% 6.600us 0.78% 22.311us 7.437us 0.000us 0.00% 0.000us 0.000us 3
4081
+ aten::empty_strided 0.55% 15.711us 0.55% 15.711us 5.237us 0.000us 0.00% 0.000us 0.000us 3
4082
+ aten::empty 1.03% 29.502us 1.03% 29.502us 2.459us 0.000us 0.00% 0.000us 0.000us 12
4083
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4084
+ Self CPU time total: 2.856ms
4085
+ Self CUDA time total: 400.258us
4086
+
4087
+
4088
+ impl wl p50(ms) ok
4089
+ torch_flash_compiled_default flux_L128 0.20 True
4090
+ torch_flash_compiled_default flux_L256 0.23 True
4091
+ torch_flash_compiled_default flux_L320 0.24 True
4092
+ torch_flash_compiled_default flux_L384 0.24 True
4093
+ torch_flash_compiled_default flux_L448 FAIL False
4094
+ Error: recompile_limit reached with one_graph=True. Excessive recompilations can degrade performance due to the compilation overhead of each recompilation. To monitor recompilations, enable TORCH_LOGS=recompiles. If recompilations are expected, consider increasing torch._dynamo.config.cache_size_limit to an appropriate value.
4095
+ torch_flash_compiled_default flux_L512 FAIL False
4096
+ Error: recompile_limit reached with one_graph=True. Excessive recompilations can degrade performance due to the compilation overhead of each recompilation. To monitor recompilations, enable TORCH_LOGS=recompiles. If recompilations are expected, consider increasing torch._dynamo.config.cache_size_limit to an appropriate value.
4097
+ </pre></div>
4098
+ <div class="uv-install-logs" id="uv-logs-benchmark_default">
4099
+ <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4100
+ <div class="uv-logs-content" style="display: none;">
4101
+ Installed 37 packages in 247ms
4102
+ </div>
4103
+ </div>
4104
+ <div class="cell-stderr">W1023 17:21:27.942000 6833 torch/_dynamo/convert_frame.py:1016] [0/8] torch._dynamo hit config.recompile_limit (8)
4105
+ W1023 17:21:27.942000 6833 torch/_dynamo/convert_frame.py:1016] [0/8] function: &#x27;torch_flash_base&#x27; (/__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cells/benchmark_default.py:18)
4106
+ W1023 17:21:27.942000 6833 torch/_dynamo/convert_frame.py:1016] [0/8] last reason: 0/7: GLOBAL_STATE changed: num_threads
4107
+ W1023 17:21:27.942000 6833 torch/_dynamo/convert_frame.py:1016] [0/8] To log all recompilation reasons, use TORCH_LOGS=&quot;recompiles&quot;.
4108
+ W1023 17:21:27.942000 6833 torch/_dynamo/convert_frame.py:1016] [0/8] To diagnose recompilation issues, see https://pytorch.org/docs/main/torch.compiler_troubleshooting.html.
4109
+ W1023 17:21:27.948000 6833 torch/_dynamo/convert_frame.py:1016] [0/9] torch._dynamo hit config.recompile_limit (8)
4110
+ W1023 17:21:27.948000 6833 torch/_dynamo/convert_frame.py:1016] [0/9] function: &#x27;torch_flash_base&#x27; (/__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cells/benchmark_default.py:18)
4111
+ W1023 17:21:27.948000 6833 torch/_dynamo/convert_frame.py:1016] [0/9] last reason: 0/7: GLOBAL_STATE changed: num_threads
4112
+ W1023 17:21:27.948000 6833 torch/_dynamo/convert_frame.py:1016] [0/9] To log all recompilation reasons, use TORCH_LOGS=&quot;recompiles&quot;.
4113
+ W1023 17:21:27.948000 6833 torch/_dynamo/convert_frame.py:1016] [0/9] To diagnose recompilation issues, see https://pytorch.org/docs/main/torch.compiler_troubleshooting.html.</div>
4114
+ <div class="cell-artifacts">
4115
+ <h4>Artifacts:</h4>
4116
+ <a href="artifacts/benchmark_default/attn_default.jsonl" class="artifact" target="_blank">attn_default.jsonl</a>
4117
  </div>
4118
  </div>
4119
  </div>
4120
 
4121
  <h2>Flash Attention with torch.compile(mode="max-autotune")</h2>
4122
+ <div class="cell" id="cell-benchmark_max_autotune">
4123
+ <div class="cell-header">
4124
+ <span class="collapse-indicators">
4125
+ <span onclick="toggleCode('benchmark_max_autotune')" style="cursor: pointer;">▼ code</span>
4126
+ <span onclick="toggleOutput('benchmark_max_autotune')" style="cursor: pointer;">▼ output</span>
4127
+ <span id="uv-indicator-benchmark_max_autotune" onclick="toggleUvLogsFromHeader('benchmark_max_autotune')" style="cursor: pointer;">▶ uv-logs</span>
4128
+ </span> |
4129
+ Cell: benchmark_max_autotune | 18.98s
4130
+ | <button class="run-btn" onclick="runCell('benchmark_max_autotune')">▶ run</button>
4131
+ <button class="copy-btn" onclick="copyCell('benchmark_max_autotune')">Copy</button>
4132
+ <a href="cells/benchmark_max_autotune.py" target="_blank" class="raw-btn">Raw</a>
4133
+ <a href="https://github.com/huggingface/kernels-uvnotes/blob/main/flash_attn/impls/compiled_variants.md" target="_blank" class="github-btn">GitHub</a>
4134
+ </div>
4135
+ <div id="code-benchmark_max_autotune" class="cell-code" data-lines="70">
4136
+ <div class="code-wrap">
4137
+ <div class="highlight"><pre><span></span><span class="c1"># /// script</span>
4138
+ <span class="c1"># requires-python = &quot;&gt;=3.10&quot;</span>
4139
+ <span class="c1"># dependencies = [</span>
4140
+ <span class="c1"># &quot;numpy&quot;,</span>
4141
+ <span class="c1"># &quot;torch==2.8.0&quot;,</span>
4142
+ <span class="c1"># &quot;kernels-benchmark-tools&quot;,</span>
4143
+ <span class="c1"># ]</span>
4144
+ <span class="c1">#</span>
4145
+ <span class="c1"># [tool.uv.sources]</span>
4146
+ <span class="c1"># kernels-benchmark-tools = { path = &quot;../../../../../tools&quot;, editable = true }</span>
4147
+ <span class="c1"># ///</span>
4148
+ <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
4149
+ <span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
4150
+ <span class="kn">import</span><span class="w"> </span><span class="nn">os</span>
4151
+ <span class="kn">import</span><span class="w"> </span><span class="nn">kernels_benchmark_tools</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="nn">kbt</span>
4152
+
4153
+
4154
+ <span class="k">def</span><span class="w"> </span><span class="nf">torch_flash_base</span><span class="p">(</span><span class="n">q</span><span class="p">,</span> <span class="n">k</span><span class="p">,</span> <span class="n">v</span><span class="p">):</span>
4155
+ <span class="n">qt</span><span class="p">,</span> <span class="n">kt</span><span class="p">,</span> <span class="n">vt</span> <span class="o">=</span> <span class="p">(</span><span class="n">x</span><span class="o">.</span><span class="n">transpose</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">)</span><span class="o">.</span><span class="n">contiguous</span><span class="p">()</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="p">(</span><span class="n">q</span><span class="p">,</span> <span class="n">k</span><span class="p">,</span> <span class="n">v</span><span class="p">))</span>
4156
+ <span class="k">with</span> <span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">attention</span><span class="o">.</span><span class="n">sdpa_kernel</span><span class="p">(</span><span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">attention</span><span class="o">.</span><span class="n">SDPBackend</span><span class="o">.</span><span class="n">FLASH_ATTENTION</span><span class="p">):</span>
4157
+ <span class="n">o</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">functional</span><span class="o">.</span><span class="n">scaled_dot_product_attention</span><span class="p">(</span><span class="n">qt</span><span class="p">,</span> <span class="n">kt</span><span class="p">,</span> <span class="n">vt</span><span class="p">)</span>
4158
+ <span class="k">return</span> <span class="n">o</span><span class="o">.</span><span class="n">transpose</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">)</span><span class="o">.</span><span class="n">contiguous</span><span class="p">()</span>
4159
+
4160
+
4161
+ <span class="c1"># Compile with max-autotune mode</span>
4162
+ <span class="n">compiled_flash_max_autotune</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">compile</span><span class="p">(</span><span class="n">torch_flash_base</span><span class="p">,</span> <span class="n">mode</span><span class="o">=</span><span class="s2">&quot;max-autotune&quot;</span><span class="p">,</span> <span class="n">fullgraph</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">dynamic</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span>
4163
+
4164
+ <span class="n">kbt</span><span class="o">.</span><span class="n">add</span><span class="p">(</span>
4165
+ <span class="s2">&quot;torch_flash_compiled_max_autotune&quot;</span><span class="p">,</span>
4166
+ <span class="n">compiled_flash_max_autotune</span><span class="p">,</span>
4167
+ <span class="n">tags</span><span class="o">=</span><span class="p">{</span><span class="s2">&quot;family&quot;</span><span class="p">:</span> <span class="s2">&quot;torch-sdpa&quot;</span><span class="p">,</span> <span class="s2">&quot;backend&quot;</span><span class="p">:</span> <span class="s2">&quot;FLASH&quot;</span><span class="p">,</span> <span class="s2">&quot;compile&quot;</span><span class="p">:</span> <span class="s2">&quot;max-autotune&quot;</span><span class="p">},</span>
4168
+ <span class="p">)</span>
4169
+
4170
+ <span class="k">if</span> <span class="vm">__name__</span> <span class="o">==</span> <span class="s2">&quot;__main__&quot;</span><span class="p">:</span>
4171
+ <span class="n">device</span> <span class="o">=</span> <span class="s2">&quot;cuda&quot;</span> <span class="k">if</span> <span class="n">torch</span><span class="o">.</span><span class="n">cuda</span><span class="o">.</span><span class="n">is_available</span><span class="p">()</span> <span class="k">else</span> <span class="s2">&quot;cpu&quot;</span>
4172
+ <span class="n">dtype</span> <span class="o">=</span> <span class="s2">&quot;float32&quot;</span> <span class="k">if</span> <span class="n">device</span> <span class="o">==</span> <span class="s2">&quot;cpu&quot;</span> <span class="k">else</span> <span class="s2">&quot;bfloat16&quot;</span>
4173
+
4174
+ <span class="c1"># Flux-like workloads</span>
4175
+ <span class="n">base</span> <span class="o">=</span> <span class="mi">1024</span> <span class="k">if</span> <span class="n">device</span> <span class="o">==</span> <span class="s2">&quot;cuda&quot;</span> <span class="k">else</span> <span class="mi">512</span>
4176
+ <span class="n">flux_sizes</span> <span class="o">=</span> <span class="p">(</span>
4177
+ <span class="p">[</span><span class="mi">128</span><span class="p">,</span> <span class="mi">256</span><span class="p">,</span> <span class="mi">320</span><span class="p">,</span> <span class="mi">384</span><span class="p">,</span> <span class="mi">448</span><span class="p">,</span> <span class="mi">512</span><span class="p">]</span> <span class="k">if</span> <span class="n">device</span> <span class="o">==</span> <span class="s2">&quot;cuda&quot;</span> <span class="k">else</span> <span class="p">[</span><span class="mi">64</span><span class="p">,</span> <span class="mi">128</span><span class="p">,</span> <span class="mi">192</span><span class="p">,</span> <span class="mi">256</span><span class="p">]</span>
4178
+ <span class="p">)</span>
4179
+ <span class="n">heads</span> <span class="o">=</span> <span class="mi">24</span> <span class="k">if</span> <span class="n">device</span> <span class="o">==</span> <span class="s2">&quot;cuda&quot;</span> <span class="k">else</span> <span class="mi">8</span>
4180
+ <span class="n">head_dim</span> <span class="o">=</span> <span class="mi">128</span> <span class="k">if</span> <span class="n">device</span> <span class="o">==</span> <span class="s2">&quot;cuda&quot;</span> <span class="k">else</span> <span class="mi">64</span>
4181
+
4182
+ <span class="n">wl</span> <span class="o">=</span> <span class="p">[]</span>
4183
+ <span class="k">for</span> <span class="n">L</span> <span class="ow">in</span> <span class="n">flux_sizes</span><span class="p">:</span>
4184
+ <span class="n">wl</span><span class="o">.</span><span class="n">append</span><span class="p">(</span>
4185
+ <span class="p">{</span>
4186
+ <span class="s2">&quot;name&quot;</span><span class="p">:</span> <span class="sa">f</span><span class="s2">&quot;flux_L</span><span class="si">{</span><span class="n">L</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">,</span>
4187
+ <span class="s2">&quot;batch&quot;</span><span class="p">:</span> <span class="mi">1</span><span class="p">,</span>
4188
+ <span class="s2">&quot;seq_len&quot;</span><span class="p">:</span> <span class="n">base</span> <span class="o">+</span> <span class="n">L</span><span class="p">,</span>
4189
+ <span class="s2">&quot;heads&quot;</span><span class="p">:</span> <span class="n">heads</span><span class="p">,</span>
4190
+ <span class="s2">&quot;head_dim&quot;</span><span class="p">:</span> <span class="n">head_dim</span><span class="p">,</span>
4191
+ <span class="s2">&quot;dtype&quot;</span><span class="p">:</span> <span class="n">dtype</span><span class="p">,</span>
4192
+ <span class="s2">&quot;device&quot;</span><span class="p">:</span> <span class="n">device</span><span class="p">,</span>
4193
+ <span class="s2">&quot;seed&quot;</span><span class="p">:</span> <span class="mi">0</span><span class="p">,</span>
4194
+ <span class="p">}</span>
4195
+ <span class="p">)</span>
4196
+
4197
+ <span class="n">kbt</span><span class="o">.</span><span class="n">run</span><span class="p">(</span>
4198
+ <span class="n">wl</span><span class="p">,</span>
4199
+ <span class="n">jsonl</span><span class="o">=</span><span class="s2">&quot;attn_max_autotune.jsonl&quot;</span><span class="p">,</span>
4200
+ <span class="n">reps</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span>
4201
+ <span class="n">warmup</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span>
4202
+ <span class="n">gen</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">gen_qkv</span><span class="p">,</span>
4203
+ <span class="n">ref</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">ref_math</span><span class="p">,</span>
4204
+ <span class="n">cmp</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">cmp_allclose</span><span class="p">,</span>
4205
+ <span class="p">)</span>
4206
+ <span class="n">kbt</span><span class="o">.</span><span class="n">summarize</span><span class="p">([</span><span class="s2">&quot;attn_max_autotune.jsonl&quot;</span><span class="p">])</span>
4207
+ </pre></div>
4208
+
4209
+ <div class="code-line-highlight" id="line-highlight-benchmark_max_autotune"></div>
4210
+ </div>
4211
+ </div>
4212
+ <div id="output-benchmark_max_autotune" class="cell-output">
4213
+ <div class="cell-stdout"><pre class="stdout-text">impl wl p50(ms) ok
4214
+ torch_flash_compiled_max_autotune flux_L128 0.19 True
4215
+ torch_flash_compiled_max_autotune flux_L256 0.20 True
4216
+ torch_flash_compiled_max_autotune flux_L320 0.21 True
4217
+ torch_flash_compiled_max_autotune flux_L384 0.21 True
4218
+ torch_flash_compiled_max_autotune flux_L448 FAIL False
4219
+ Error: recompile_limit reached with one_graph=True. Excessive recompilations can degrade performance due to the compilation overhead of each recompilation. To monitor recompilations, enable TORCH_LOGS=recompiles. If recompilations are expected, consider increasing torch._dynamo.config.cache_size_limit to an appropriate value.
4220
+ torch_flash_compiled_max_autotune flux_L512 FAIL False
4221
+ Error: recompile_limit reached with one_graph=True. Excessive recompilations can degrade performance due to the compilation overhead of each recompilation. To monitor recompilations, enable TORCH_LOGS=recompiles. If recompilations are expected, consider increasing torch._dynamo.config.cache_size_limit to an appropriate value.
4222
+ </pre></div>
4223
+ <div class="uv-install-logs" id="uv-logs-benchmark_max_autotune">
4224
+ <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4225
+ <div class="uv-logs-content" style="display: none;">
4226
+ Installed 37 packages in 208ms
4227
+ </div>
4228
+ </div>
4229
+ <div class="cell-stderr">W1023 17:21:15.860000 6116 torch/_dynamo/convert_frame.py:1016] [0/8] torch._dynamo hit config.recompile_limit (8)
4230
+ W1023 17:21:15.860000 6116 torch/_dynamo/convert_frame.py:1016] [0/8] function: &#x27;torch_flash_base&#x27; (/__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cells/benchmark_max_autotune.py:18)
4231
+ W1023 17:21:15.860000 6116 torch/_dynamo/convert_frame.py:1016] [0/8] last reason: 0/7: GLOBAL_STATE changed: num_threads
4232
+ W1023 17:21:15.860000 6116 torch/_dynamo/convert_frame.py:1016] [0/8] To log all recompilation reasons, use TORCH_LOGS=&quot;recompiles&quot;.
4233
+ W1023 17:21:15.860000 6116 torch/_dynamo/convert_frame.py:1016] [0/8] To diagnose recompilation issues, see https://pytorch.org/docs/main/torch.compiler_troubleshooting.html.
4234
+ W1023 17:21:15.866000 6116 torch/_dynamo/convert_frame.py:1016] [0/9] torch._dynamo hit config.recompile_limit (8)
4235
+ W1023 17:21:15.866000 6116 torch/_dynamo/convert_frame.py:1016] [0/9] function: &#x27;torch_flash_base&#x27; (/__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cells/benchmark_max_autotune.py:18)
4236
+ W1023 17:21:15.866000 6116 torch/_dynamo/convert_frame.py:1016] [0/9] last reason: 0/7: GLOBAL_STATE changed: num_threads
4237
+ W1023 17:21:15.866000 6116 torch/_dynamo/convert_frame.py:1016] [0/9] To log all recompilation reasons, use TORCH_LOGS=&quot;recompiles&quot;.
4238
+ W1023 17:21:15.866000 6116 torch/_dynamo/convert_frame.py:1016] [0/9] To diagnose recompilation issues, see https://pytorch.org/docs/main/torch.compiler_troubleshooting.html.</div>
4239
+ <div class="cell-artifacts">
4240
+ <h4>Artifacts:</h4>
4241
+ <a href="artifacts/benchmark_max_autotune/attn_max_autotune.jsonl" class="artifact" target="_blank">attn_max_autotune.jsonl</a>
4242
+ </div>
4243
+ </div>
4244
+ </div>
4245
  </div>
4246
 
4247
  </body>
flash_attn/impls/flash_attention.html CHANGED
@@ -706,6 +706,29 @@
706
  white-space: pre-wrap;
707
  color: var(--text-primary);
708
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
709
  .cell-stderr {
710
  background: var(--bg-error);
711
  border-left: 2px solid var(--border-error);
@@ -3556,7 +3579,12 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3556
  if(output){
3557
  output.classList.remove('output-stale');
3558
  let html='';
3559
- if(data.stdout) html+='<div class="cell-stdout">'+escapeHtml(data.stdout)+'</div>';
 
 
 
 
 
3560
  console.log('UV Logs:', data);
3561
  if(data.stderr) {
3562
  // Split UV logs from regular stderr
@@ -3678,27 +3706,27 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3678
  }
3679
  }
3680
 
3681
- // Live reload functionality (robust SSE handling)
3682
- (function(){
3683
- if (!('EventSource' in window)) {
3684
- console.warn('SSE not supported in this browser');
3685
- return;
3686
- }
3687
- let source = new EventSource('/events');
3688
- let isOpen = false;
3689
- source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
3690
- source.onmessage = function(e){
3691
- const msg=(e.data||'').trim(); if(!msg) return;
3692
- console.log('SSE message:', msg);
3693
- if (msg==='reload' || msg==='incremental') { location.reload(); }
3694
- // Ignore 'loading' to avoid premature reload loops
3695
- };
3696
- source.onerror = function(e){
3697
- // Let EventSource auto-reconnect instead of forcing a reload
3698
- if (isOpen) console.warn('SSE error after open, retrying...', e);
3699
- };
3700
- window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
3701
- })();
3702
 
3703
 
3704
  document.addEventListener('DOMContentLoaded', function() {
@@ -3829,7 +3857,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3829
  <div class="system-info">
3830
  <div class="system-info-header">Generated on:</div>
3831
  <div class="system-info-content">
3832
- Linux x86_64 | Linux-5.10.244-240.965.amzn2.x86_64-x86_64-with-glibc2.35
3833
  </div>
3834
  </div>
3835
 
@@ -3843,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3843
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3844
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3845
  </span> |
3846
- Cell: nv | 0.23s
3847
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3848
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3849
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3860,7 +3888,7 @@ Cell: nv | 0.23s
3860
  </div>
3861
  </div>
3862
  <div id="output-nv" class="cell-output">
3863
- <div class="cell-stdout">Wed Oct 22 08:58:24 2025
3864
  +-----------------------------------------------------------------------------------------+
3865
  | NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
3866
  |-----------------------------------------+------------------------+----------------------+
@@ -3869,7 +3897,7 @@ Cell: nv | 0.23s
3869
  | | | MIG M. |
3870
  |=========================================+========================+======================|
3871
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3872
- | N/A 26C P8 22W / 350W | 0MiB / 46068MiB | 0% Default |
3873
  | | | N/A |
3874
  +-----------------------------------------+------------------------+----------------------+
3875
 
@@ -3881,19 +3909,19 @@ Cell: nv | 0.23s
3881
  | No running processes found |
3882
  +-----------------------------------------------------------------------------------------+
3883
 
3884
- </div>
3885
  </div>
3886
  </div>
3887
 
3888
  <h2>Flash Attention Benchmark</h2>
3889
- <div class="cell cell-failed" id="cell-benchmark">
3890
  <div class="cell-header">
3891
  <span class="collapse-indicators">
3892
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3893
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3894
  <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3895
  </span> |
3896
- Cell: benchmark | 0.01s | FAILED
3897
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3898
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3899
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3905,12 +3933,12 @@ Cell: benchmark | 0.01s | FAILED
3905
  <span class="c1"># requires-python = &quot;&gt;=3.10&quot;</span>
3906
  <span class="c1"># dependencies = [</span>
3907
  <span class="c1"># &quot;numpy&quot;,</span>
3908
- <span class="c1"># &quot;torch&quot;,</span>
3909
  <span class="c1"># &quot;kernels-benchmark-tools&quot;,</span>
3910
  <span class="c1"># ]</span>
3911
  <span class="c1">#</span>
3912
  <span class="c1"># [tool.uv.sources]</span>
3913
- <span class="c1"># kernels-benchmark-tools = { path = &quot;/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools&quot;, editable = true }</span>
3914
  <span class="c1"># ///</span>
3915
  <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
3916
  <span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
@@ -3974,9 +4002,209 @@ Cell: benchmark | 0.01s | FAILED
3974
  </div>
3975
  </div>
3976
  <div id="output-benchmark" class="cell-output">
3977
- <div class="cell-stderr"> × Failed to resolve script requirement
3978
- ╰─▶ Distribution not found at:
3979
- file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3980
  </div>
3981
  </div>
3982
  </div>
 
706
  white-space: pre-wrap;
707
  color: var(--text-primary);
708
  }
709
+
710
+ .cell-stdout {
711
+ background: var(--bg-tertiary);
712
+ padding: 0.75rem;
713
+ border-radius: 1px;
714
+ font-family: inherit;
715
+ font-size: 0.9rem;
716
+ color: var(--text-primary);
717
+
718
+ /* key bits */
719
+ overflow: auto; /* show scrollbars when needed */
720
+ max-width: 100%; /* respects whatever layout width you give it */
721
+ }
722
+
723
+ .cell-stdout .stdout-text {
724
+ margin: 0; /* reset pre default margin */
725
+ white-space: pre; /* keep line breaks, NO wrapping */
726
+ display: inline-block; /* shrink-to-content */
727
+ min-width: max-content; /* allow very long lines to define intrinsic width */
728
+ font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace;
729
+ tab-size: 2;
730
+ }
731
+
732
  .cell-stderr {
733
  background: var(--bg-error);
734
  border-left: 2px solid var(--border-error);
 
3579
  if(output){
3580
  output.classList.remove('output-stale');
3581
  let html='';
3582
+ if (data.stdout) {
3583
+ html += '<div class="cell-stdout"><pre class="stdout-text">'
3584
+ + escapeHtml(data.stdout)
3585
+ + '</pre></div>';
3586
+ }
3587
+
3588
  console.log('UV Logs:', data);
3589
  if(data.stderr) {
3590
  // Split UV logs from regular stderr
 
3706
  }
3707
  }
3708
 
3709
+ // // Live reload functionality (robust SSE handling)
3710
+ // (function(){
3711
+ // if (!('EventSource' in window)) {
3712
+ // console.warn('SSE not supported in this browser');
3713
+ // return;
3714
+ // }
3715
+ // let source = new EventSource('/events');
3716
+ // let isOpen = false;
3717
+ // source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
3718
+ // source.onmessage = function(e){
3719
+ // const msg=(e.data||'').trim(); if(!msg) return;
3720
+ // console.log('SSE message:', msg);
3721
+ // if (msg==='reload' || msg==='incremental') { location.reload(); }
3722
+ // // Ignore 'loading' to avoid premature reload loops
3723
+ // };
3724
+ // source.onerror = function(e){
3725
+ // // Let EventSource auto-reconnect instead of forcing a reload
3726
+ // if (isOpen) console.warn('SSE error after open, retrying...', e);
3727
+ // };
3728
+ // window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
3729
+ // })();
3730
 
3731
 
3732
  document.addEventListener('DOMContentLoaded', function() {
 
3857
  <div class="system-info">
3858
  <div class="system-info-header">Generated on:</div>
3859
  <div class="system-info-content">
3860
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
3861
  </div>
3862
  </div>
3863
 
 
3871
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3872
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3873
  </span> |
3874
+ Cell: nv | 0.21s
3875
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3877
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
 
3888
  </div>
3889
  </div>
3890
  <div id="output-nv" class="cell-output">
3891
+ <div class="cell-stdout"><pre class="stdout-text">Thu Oct 23 17:22:15 2025
3892
  +-----------------------------------------------------------------------------------------+
3893
  | NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
3894
  |-----------------------------------------+------------------------+----------------------+
 
3897
  | | | MIG M. |
3898
  |=========================================+========================+======================|
3899
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3900
+ | N/A 37C P0 91W / 350W | 0MiB / 46068MiB | 26% Default |
3901
  | | | N/A |
3902
  +-----------------------------------------+------------------------+----------------------+
3903
 
 
3909
  | No running processes found |
3910
  +-----------------------------------------------------------------------------------------+
3911
 
3912
+ </pre></div>
3913
  </div>
3914
  </div>
3915
 
3916
  <h2>Flash Attention Benchmark</h2>
3917
+ <div class="cell" id="cell-benchmark">
3918
  <div class="cell-header">
3919
  <span class="collapse-indicators">
3920
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3921
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3922
  <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3923
  </span> |
3924
+ Cell: benchmark | 3.60s
3925
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3926
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3927
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3933
  <span class="c1"># requires-python = &quot;&gt;=3.10&quot;</span>
3934
  <span class="c1"># dependencies = [</span>
3935
  <span class="c1"># &quot;numpy&quot;,</span>
3936
+ <span class="c1"># &quot;torch==2.8.0&quot;,</span>
3937
  <span class="c1"># &quot;kernels-benchmark-tools&quot;,</span>
3938
  <span class="c1"># ]</span>
3939
  <span class="c1">#</span>
3940
  <span class="c1"># [tool.uv.sources]</span>
3941
+ <span class="c1"># kernels-benchmark-tools = { path = &quot;../../../../../tools&quot;, editable = true }</span>
3942
  <span class="c1"># ///</span>
3943
  <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
3944
  <span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
 
4002
  </div>
4003
  </div>
4004
  <div id="output-benchmark" class="cell-output">
4005
+ <div class="cell-stdout"><pre class="stdout-text">
4006
+ ======================================================================
4007
+ PROFILE TRACE: torch_flash_ma | flux_L128
4008
+ ======================================================================
4009
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4010
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4011
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4012
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 799.070us 225.43% 799.070us 799.070us 1
4013
+ torch_flash_ma 14.65% 361.148us 99.74% 2.458ms 2.458ms 0.000us 0.00% 362.241us 362.241us 1
4014
+ aten::scaled_dot_product_attention 1.75% 43.042us 9.34% 230.141us 76.714us 0.000us 0.00% 266.207us 88.736us 3
4015
+ aten::_scaled_dot_product_flash_attention 1.09% 26.961us 7.59% 187.099us 62.366us 0.000us 0.00% 266.207us 88.736us 3
4016
+ aten::_flash_attention_forward 1.68% 41.361us 5.54% 136.527us 45.509us 266.207us 75.10% 266.207us 88.736us 3
4017
+ void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 266.207us 75.10% 266.207us 88.736us 3
4018
+ aten::contiguous 0.64% 15.860us 72.86% 1.796ms 149.661us 0.000us 0.00% 96.034us 8.003us 12
4019
+ aten::clone 1.71% 42.134us 72.21% 1.780ms 148.339us 0.000us 0.00% 96.034us 8.003us 12
4020
+ aten::copy_ 3.86% 95.153us 66.84% 1.648ms 137.298us 88.258us 24.90% 96.034us 8.003us 12
4021
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 88.258us 24.90% 88.258us 7.355us 12
4022
+ Activity Buffer Request 58.01% 1.430ms 58.01% 1.430ms 1.430ms 7.776us 2.19% 7.776us 7.776us 1
4023
+ aten::transpose 2.95% 72.712us 3.85% 94.884us 3.954us 0.000us 0.00% 0.000us 0.000us 24
4024
+ aten::as_strided 0.90% 22.172us 0.90% 22.172us 0.924us 0.000us 0.00% 0.000us 0.000us 24
4025
+ aten::empty_like 1.13% 27.832us 4.55% 112.245us 7.483us 0.000us 0.00% 0.000us 0.000us 15
4026
+ aten::empty 4.09% 100.886us 4.09% 100.886us 4.204us 0.000us 0.00% 0.000us 0.000us 24
4027
+ cudaLaunchKernel 5.96% 146.998us 5.96% 146.998us 9.800us 0.000us 0.00% 0.000us 0.000us 15
4028
+ aten::empty_strided 0.65% 15.960us 0.65% 15.960us 5.320us 0.000us 0.00% 0.000us 0.000us 3
4029
+ cudaDeviceGetAttribute 0.12% 2.850us 0.12% 2.850us 0.475us 0.000us 0.00% 0.000us 0.000us 6
4030
+ cudaFuncSetAttribute 0.54% 13.411us 0.54% 13.411us 4.470us 0.000us 0.00% 0.000us 0.000us 3
4031
+ cudaDeviceSynchronize 0.26% 6.530us 0.26% 6.530us 6.530us 0.000us 0.00% 0.000us 0.000us 1
4032
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4033
+ Self CPU time total: 2.465ms
4034
+ Self CUDA time total: 354.465us
4035
+
4036
+
4037
+
4038
+ ======================================================================
4039
+ PROFILE TRACE: torch_flash_ma | flux_L256
4040
+ ======================================================================
4041
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4042
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4043
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4044
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 680.541us 161.63% 680.541us 680.541us 1
4045
+ torch_flash_ma 11.51% 254.710us 99.74% 2.208ms 2.208ms 0.000us 0.00% 430.783us 430.783us 1
4046
+ aten::scaled_dot_product_attention 1.09% 24.080us 8.33% 184.408us 61.469us 0.000us 0.00% 312.064us 104.021us 3
4047
+ aten::_scaled_dot_product_flash_attention 0.81% 17.821us 7.24% 160.328us 53.443us 0.000us 0.00% 312.064us 104.021us 3
4048
+ aten::_flash_attention_forward 1.85% 41.011us 5.37% 118.956us 39.652us 312.064us 74.11% 312.064us 104.021us 3
4049
+ void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 312.064us 74.11% 312.064us 104.021us 3
4050
+ aten::contiguous 0.42% 9.258us 77.80% 1.722ms 143.509us 0.000us 0.00% 118.719us 9.893us 12
4051
+ aten::clone 1.32% 29.284us 77.38% 1.713ms 142.737us 0.000us 0.00% 118.719us 9.893us 12
4052
+ aten::copy_ 3.64% 80.568us 73.02% 1.616ms 134.703us 108.991us 25.89% 118.719us 9.893us 12
4053
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 108.991us 25.89% 108.991us 9.083us 12
4054
+ Activity Buffer Request 65.56% 1.451ms 65.56% 1.451ms 1.451ms 9.728us 2.31% 9.728us 9.728us 1
4055
+ aten::transpose 2.36% 52.224us 3.17% 70.126us 2.922us 0.000us 0.00% 0.000us 0.000us 24
4056
+ aten::as_strided 0.81% 17.902us 0.81% 17.902us 0.746us 0.000us 0.00% 0.000us 0.000us 24
4057
+ aten::empty_like 0.96% 21.191us 3.98% 88.123us 5.875us 0.000us 0.00% 0.000us 0.000us 15
4058
+ aten::empty 3.58% 79.273us 3.58% 79.273us 3.303us 0.000us 0.00% 0.000us 0.000us 24
4059
+ cudaLaunchKernel 4.85% 107.363us 4.85% 107.363us 7.158us 0.000us 0.00% 0.000us 0.000us 15
4060
+ aten::empty_strided 0.70% 15.410us 0.70% 15.410us 5.137us 0.000us 0.00% 0.000us 0.000us 3
4061
+ cudaDeviceGetAttribute 0.09% 2.071us 0.09% 2.071us 0.345us 0.000us 0.00% 0.000us 0.000us 6
4062
+ cudaFuncSetAttribute 0.20% 4.321us 0.20% 4.321us 1.440us 0.000us 0.00% 0.000us 0.000us 3
4063
+ cudaDeviceSynchronize 0.26% 5.841us 0.26% 5.841us 5.841us 0.000us 0.00% 0.000us 0.000us 1
4064
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4065
+ Self CPU time total: 2.214ms
4066
+ Self CUDA time total: 421.055us
4067
+
4068
+
4069
+
4070
+ ======================================================================
4071
+ PROFILE TRACE: torch_flash_ma | flux_L320
4072
+ ======================================================================
4073
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4074
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4075
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4076
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 690.203us 159.06% 690.203us 690.203us 1
4077
+ torch_flash_ma 11.42% 254.276us 99.18% 2.209ms 2.209ms 0.000us 0.00% 443.100us 443.100us 1
4078
+ aten::scaled_dot_product_attention 1.09% 24.201us 8.13% 181.079us 60.360us 0.000us 0.00% 330.557us 110.186us 3
4079
+ aten::_scaled_dot_product_flash_attention 0.78% 17.350us 7.04% 156.878us 52.293us 0.000us 0.00% 330.557us 110.186us 3
4080
+ aten::_flash_attention_forward 1.80% 40.093us 5.30% 118.035us 39.345us 330.557us 76.18% 330.557us 110.186us 3
4081
+ void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 330.557us 76.18% 330.557us 110.186us 3
4082
+ aten::contiguous 0.42% 9.369us 77.58% 1.728ms 143.991us 0.000us 0.00% 112.543us 9.379us 12
4083
+ aten::clone 1.34% 29.740us 77.16% 1.719ms 143.210us 0.000us 0.00% 112.543us 9.379us 12
4084
+ aten::copy_ 3.81% 84.905us 72.90% 1.624ms 135.305us 103.359us 23.82% 112.543us 9.379us 12
4085
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 103.359us 23.82% 103.359us 8.613us 12
4086
+ Activity Buffer Request 65.38% 1.456ms 65.38% 1.456ms 1.456ms 9.184us 2.12% 9.184us 9.184us 1
4087
+ aten::transpose 2.26% 50.400us 3.02% 67.214us 2.801us 0.000us 0.00% 0.000us 0.000us 24
4088
+ aten::as_strided 0.75% 16.814us 0.75% 16.814us 0.701us 0.000us 0.00% 0.000us 0.000us 24
4089
+ aten::empty_like 0.96% 21.489us 3.82% 85.044us 5.670us 0.000us 0.00% 0.000us 0.000us 15
4090
+ aten::empty 3.43% 76.464us 3.43% 76.464us 3.186us 0.000us 0.00% 0.000us 0.000us 24
4091
+ cudaLaunchKernel 4.82% 107.405us 4.82% 107.405us 7.160us 0.000us 0.00% 0.000us 0.000us 15
4092
+ aten::empty_strided 0.66% 14.631us 0.66% 14.631us 4.877us 0.000us 0.00% 0.000us 0.000us 3
4093
+ cudaDeviceGetAttribute 0.08% 1.710us 0.08% 1.710us 0.285us 0.000us 0.00% 0.000us 0.000us 6
4094
+ cudaFuncSetAttribute 0.18% 3.930us 0.18% 3.930us 1.310us 0.000us 0.00% 0.000us 0.000us 3
4095
+ cudaDeviceSynchronize 0.82% 18.331us 0.82% 18.331us 18.331us 0.000us 0.00% 0.000us 0.000us 1
4096
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4097
+ Self CPU time total: 2.227ms
4098
+ Self CUDA time total: 433.916us
4099
+
4100
+
4101
+
4102
+ ======================================================================
4103
+ PROFILE TRACE: torch_flash_ma | flux_L384
4104
+ ======================================================================
4105
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4106
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4107
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4108
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 691.645us 147.68% 691.645us 691.645us 1
4109
+ torch_flash_ma 10.40% 252.243us 99.18% 2.405ms 2.405ms 0.000us 0.00% 481.117us 481.117us 1
4110
+ aten::scaled_dot_product_attention 1.00% 24.352us 7.27% 176.289us 58.763us 0.000us 0.00% 341.277us 113.759us 3
4111
+ aten::_scaled_dot_product_flash_attention 0.73% 17.811us 6.27% 151.937us 50.646us 0.000us 0.00% 341.277us 113.759us 3
4112
+ aten::_flash_attention_forward 1.38% 33.540us 4.54% 110.186us 36.729us 341.277us 72.87% 341.277us 113.759us 3
4113
+ void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 341.277us 72.87% 341.277us 113.759us 3
4114
+ aten::contiguous 0.39% 9.522us 79.59% 1.930ms 160.818us 0.000us 0.00% 139.840us 11.653us 12
4115
+ aten::clone 1.25% 30.240us 79.20% 1.920ms 160.024us 0.000us 0.00% 139.840us 11.653us 12
4116
+ aten::copy_ 3.35% 81.274us 75.28% 1.825ms 152.111us 127.072us 27.13% 139.840us 11.653us 12
4117
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 127.072us 27.13% 127.072us 10.589us 12
4118
+ Activity Buffer Request 59.91% 1.453ms 59.91% 1.453ms 1.453ms 12.768us 2.73% 12.768us 12.768us 1
4119
+ aten::transpose 2.18% 52.871us 2.90% 70.271us 2.928us 0.000us 0.00% 0.000us 0.000us 24
4120
+ aten::as_strided 0.72% 17.400us 0.72% 17.400us 0.725us 0.000us 0.00% 0.000us 0.000us 24
4121
+ aten::empty_like 0.83% 20.083us 3.47% 84.148us 5.610us 0.000us 0.00% 0.000us 0.000us 15
4122
+ aten::empty 3.18% 77.125us 3.18% 77.125us 3.214us 0.000us 0.00% 0.000us 0.000us 24
4123
+ cudaLaunchKernel 13.00% 315.205us 13.00% 315.205us 21.014us 0.000us 0.00% 0.000us 0.000us 15
4124
+ aten::empty_strided 0.61% 14.781us 0.61% 14.781us 4.927us 0.000us 0.00% 0.000us 0.000us 3
4125
+ cudaDeviceGetAttribute 0.07% 1.670us 0.07% 1.670us 0.278us 0.000us 0.00% 0.000us 0.000us 6
4126
+ cudaFuncSetAttribute 0.16% 3.970us 0.16% 3.970us 1.323us 0.000us 0.00% 0.000us 0.000us 3
4127
+ cudaDeviceSynchronize 0.82% 19.911us 0.82% 19.911us 19.911us 0.000us 0.00% 0.000us 0.000us 1
4128
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4129
+ Self CPU time total: 2.425ms
4130
+ Self CUDA time total: 468.349us
4131
+
4132
+
4133
+
4134
+ ======================================================================
4135
+ PROFILE TRACE: torch_flash_ma | flux_L448
4136
+ ======================================================================
4137
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4138
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4139
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4140
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 799.966us 130.76% 799.966us 799.966us 1
4141
+ torch_flash_ma 12.25% 304.685us 97.28% 2.419ms 2.419ms 0.000us 0.00% 624.638us 624.638us 1
4142
+ aten::scaled_dot_product_attention 0.97% 24.122us 7.38% 183.559us 61.186us 0.000us 0.00% 485.886us 161.962us 3
4143
+ aten::_scaled_dot_product_flash_attention 0.71% 17.700us 6.41% 159.437us 53.146us 0.000us 0.00% 485.886us 161.962us 3
4144
+ aten::_flash_attention_forward 1.59% 39.459us 4.74% 117.796us 39.265us 485.886us 79.42% 485.886us 161.962us 3
4145
+ void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 485.886us 79.42% 485.886us 161.962us 3
4146
+ aten::contiguous 0.39% 9.743us 75.79% 1.885ms 157.075us 0.000us 0.00% 138.752us 11.563us 12
4147
+ aten::clone 1.21% 30.098us 75.40% 1.875ms 156.263us 0.000us 0.00% 138.752us 11.563us 12
4148
+ aten::copy_ 3.39% 84.237us 71.41% 1.776ms 147.998us 125.888us 20.58% 138.752us 11.563us 12
4149
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 125.888us 20.58% 125.888us 10.491us 12
4150
+ Activity Buffer Request 58.51% 1.455ms 58.51% 1.455ms 1.455ms 12.864us 2.10% 12.864us 12.864us 1
4151
+ aten::transpose 2.11% 52.456us 2.81% 69.984us 2.916us 0.000us 0.00% 0.000us 0.000us 24
4152
+ aten::as_strided 0.70% 17.528us 0.70% 17.528us 0.730us 0.000us 0.00% 0.000us 0.000us 24
4153
+ aten::empty_like 0.83% 20.690us 3.57% 88.794us 5.920us 0.000us 0.00% 0.000us 0.000us 15
4154
+ aten::empty 3.29% 81.917us 3.29% 81.917us 3.413us 0.000us 0.00% 0.000us 0.000us 24
4155
+ cudaLaunchKernel 10.48% 260.751us 10.48% 260.751us 17.383us 0.000us 0.00% 0.000us 0.000us 15
4156
+ aten::empty_strided 0.58% 14.540us 0.58% 14.540us 4.847us 0.000us 0.00% 0.000us 0.000us 3
4157
+ cudaDeviceGetAttribute 0.09% 2.170us 0.09% 2.170us 0.362us 0.000us 0.00% 0.000us 0.000us 6
4158
+ cudaFuncSetAttribute 0.16% 3.911us 0.16% 3.911us 1.304us 0.000us 0.00% 0.000us 0.000us 3
4159
+ cudaDeviceSynchronize 2.72% 67.754us 2.72% 67.754us 67.754us 0.000us 0.00% 0.000us 0.000us 1
4160
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4161
+ Self CPU time total: 2.487ms
4162
+ Self CUDA time total: 611.774us
4163
+
4164
+
4165
+
4166
+ ======================================================================
4167
+ PROFILE TRACE: torch_flash_ma | flux_L512
4168
+ ======================================================================
4169
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4170
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4171
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4172
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 754.076us 118.52% 754.076us 754.076us 1
4173
+ torch_flash_ma 10.33% 251.863us 96.72% 2.358ms 2.358ms 0.000us 0.00% 647.964us 647.964us 1
4174
+ aten::scaled_dot_product_attention 1.02% 24.850us 7.50% 182.789us 60.930us 0.000us 0.00% 507.517us 169.172us 3
4175
+ aten::_scaled_dot_product_flash_attention 0.72% 17.614us 6.48% 157.939us 52.646us 0.000us 0.00% 507.517us 169.172us 3
4176
+ aten::_flash_attention_forward 1.67% 40.594us 4.82% 117.465us 39.155us 507.517us 79.77% 507.517us 169.172us 3
4177
+ void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 507.517us 79.77% 507.517us 169.172us 3
4178
+ aten::contiguous 0.38% 9.202us 77.00% 1.877ms 156.434us 0.000us 0.00% 140.447us 11.704us 12
4179
+ aten::clone 1.22% 29.851us 76.63% 1.868ms 155.667us 0.000us 0.00% 140.447us 11.704us 12
4180
+ aten::copy_ 3.45% 84.032us 72.63% 1.771ms 147.547us 128.703us 20.23% 140.447us 11.704us 12
4181
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 128.703us 20.23% 128.703us 10.725us 12
4182
+ Activity Buffer Request 59.63% 1.454ms 59.63% 1.454ms 1.454ms 11.744us 1.85% 11.744us 11.744us 1
4183
+ aten::transpose 2.09% 51.002us 2.82% 68.782us 2.866us 0.000us 0.00% 0.000us 0.000us 24
4184
+ aten::as_strided 0.73% 17.780us 0.73% 17.780us 0.741us 0.000us 0.00% 0.000us 0.000us 24
4185
+ aten::empty_like 0.85% 20.819us 3.58% 87.161us 5.811us 0.000us 0.00% 0.000us 0.000us 15
4186
+ aten::empty 3.27% 79.813us 3.27% 79.813us 3.326us 0.000us 0.00% 0.000us 0.000us 24
4187
+ cudaLaunchKernel 10.50% 256.026us 10.50% 256.026us 17.068us 0.000us 0.00% 0.000us 0.000us 15
4188
+ aten::empty_strided 0.59% 14.340us 0.59% 14.340us 4.780us 0.000us 0.00% 0.000us 0.000us 3
4189
+ cudaDeviceGetAttribute 0.08% 1.949us 0.08% 1.949us 0.325us 0.000us 0.00% 0.000us 0.000us 6
4190
+ cudaFuncSetAttribute 0.18% 4.440us 0.18% 4.440us 1.480us 0.000us 0.00% 0.000us 0.000us 3
4191
+ cudaDeviceSynchronize 3.28% 80.003us 3.28% 80.003us 80.003us 0.000us 0.00% 0.000us 0.000us 1
4192
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4193
+ Self CPU time total: 2.438ms
4194
+ Self CUDA time total: 636.220us
4195
+
4196
+
4197
+ impl wl p50(ms) ok
4198
+ torch_flash_ma flux_L128 0.18 True
4199
+ torch_flash_ma flux_L256 0.21 True
4200
+ torch_flash_ma flux_L320 0.22 True
4201
+ torch_flash_ma flux_L384 0.22 True
4202
+ torch_flash_ma flux_L448 0.27 True
4203
+ torch_flash_ma flux_L512 0.28 True
4204
+ </pre></div>
4205
+ <div class="cell-artifacts">
4206
+ <h4>Artifacts:</h4>
4207
+ <a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>
4208
  </div>
4209
  </div>
4210
  </div>
flash_attn/impls/hf_kernels_flash_attn.html CHANGED
@@ -706,6 +706,29 @@
706
  white-space: pre-wrap;
707
  color: var(--text-primary);
708
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
709
  .cell-stderr {
710
  background: var(--bg-error);
711
  border-left: 2px solid var(--border-error);
@@ -3556,7 +3579,12 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3556
  if(output){
3557
  output.classList.remove('output-stale');
3558
  let html='';
3559
- if(data.stdout) html+='<div class="cell-stdout">'+escapeHtml(data.stdout)+'</div>';
 
 
 
 
 
3560
  console.log('UV Logs:', data);
3561
  if(data.stderr) {
3562
  // Split UV logs from regular stderr
@@ -3678,27 +3706,27 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3678
  }
3679
  }
3680
 
3681
- // Live reload functionality (robust SSE handling)
3682
- (function(){
3683
- if (!('EventSource' in window)) {
3684
- console.warn('SSE not supported in this browser');
3685
- return;
3686
- }
3687
- let source = new EventSource('/events');
3688
- let isOpen = false;
3689
- source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
3690
- source.onmessage = function(e){
3691
- const msg=(e.data||'').trim(); if(!msg) return;
3692
- console.log('SSE message:', msg);
3693
- if (msg==='reload' || msg==='incremental') { location.reload(); }
3694
- // Ignore 'loading' to avoid premature reload loops
3695
- };
3696
- source.onerror = function(e){
3697
- // Let EventSource auto-reconnect instead of forcing a reload
3698
- if (isOpen) console.warn('SSE error after open, retrying...', e);
3699
- };
3700
- window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
3701
- })();
3702
 
3703
 
3704
  document.addEventListener('DOMContentLoaded', function() {
@@ -3829,21 +3857,21 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3829
  <div class="system-info">
3830
  <div class="system-info-header">Generated on:</div>
3831
  <div class="system-info-content">
3832
- Linux x86_64 | Linux-5.10.244-240.965.amzn2.x86_64-x86_64-with-glibc2.35
3833
  </div>
3834
  </div>
3835
 
3836
  <div class="main-content">
3837
  <h1>HF Kernels - Flash Attention</h1>
3838
  <h2>HuggingFace Kernels Flash Attention Benchmark</h2>
3839
- <div class="cell cell-failed" id="cell-benchmark">
3840
  <div class="cell-header">
3841
  <span class="collapse-indicators">
3842
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3843
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3844
  <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3845
  </span> |
3846
- Cell: benchmark | 0.01s | FAILED
3847
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3848
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3849
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3862,7 +3890,7 @@ Cell: benchmark | 0.01s | FAILED
3862
  <span class="c1"># ]</span>
3863
  <span class="c1">#</span>
3864
  <span class="c1"># [tool.uv.sources]</span>
3865
- <span class="c1"># kernels-benchmark-tools = { path = &quot;/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools&quot;, editable = true }</span>
3866
  <span class="c1"># ///</span>
3867
  <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
3868
  <span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
@@ -3931,9 +3959,166 @@ Cell: benchmark | 0.01s | FAILED
3931
  </div>
3932
  </div>
3933
  <div id="output-benchmark" class="cell-output">
3934
- <div class="cell-stderr"> × Failed to resolve script requirement
3935
- ╰─▶ Distribution not found at:
3936
- file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3937
  </div>
3938
  </div>
3939
  </div>
 
706
  white-space: pre-wrap;
707
  color: var(--text-primary);
708
  }
709
+
710
+ .cell-stdout {
711
+ background: var(--bg-tertiary);
712
+ padding: 0.75rem;
713
+ border-radius: 1px;
714
+ font-family: inherit;
715
+ font-size: 0.9rem;
716
+ color: var(--text-primary);
717
+
718
+ /* key bits */
719
+ overflow: auto; /* show scrollbars when needed */
720
+ max-width: 100%; /* respects whatever layout width you give it */
721
+ }
722
+
723
+ .cell-stdout .stdout-text {
724
+ margin: 0; /* reset pre default margin */
725
+ white-space: pre; /* keep line breaks, NO wrapping */
726
+ display: inline-block; /* shrink-to-content */
727
+ min-width: max-content; /* allow very long lines to define intrinsic width */
728
+ font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace;
729
+ tab-size: 2;
730
+ }
731
+
732
  .cell-stderr {
733
  background: var(--bg-error);
734
  border-left: 2px solid var(--border-error);
 
3579
  if(output){
3580
  output.classList.remove('output-stale');
3581
  let html='';
3582
+ if (data.stdout) {
3583
+ html += '<div class="cell-stdout"><pre class="stdout-text">'
3584
+ + escapeHtml(data.stdout)
3585
+ + '</pre></div>';
3586
+ }
3587
+
3588
  console.log('UV Logs:', data);
3589
  if(data.stderr) {
3590
  // Split UV logs from regular stderr
 
3706
  }
3707
  }
3708
 
3709
+ // // Live reload functionality (robust SSE handling)
3710
+ // (function(){
3711
+ // if (!('EventSource' in window)) {
3712
+ // console.warn('SSE not supported in this browser');
3713
+ // return;
3714
+ // }
3715
+ // let source = new EventSource('/events');
3716
+ // let isOpen = false;
3717
+ // source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
3718
+ // source.onmessage = function(e){
3719
+ // const msg=(e.data||'').trim(); if(!msg) return;
3720
+ // console.log('SSE message:', msg);
3721
+ // if (msg==='reload' || msg==='incremental') { location.reload(); }
3722
+ // // Ignore 'loading' to avoid premature reload loops
3723
+ // };
3724
+ // source.onerror = function(e){
3725
+ // // Let EventSource auto-reconnect instead of forcing a reload
3726
+ // if (isOpen) console.warn('SSE error after open, retrying...', e);
3727
+ // };
3728
+ // window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
3729
+ // })();
3730
 
3731
 
3732
  document.addEventListener('DOMContentLoaded', function() {
 
3857
  <div class="system-info">
3858
  <div class="system-info-header">Generated on:</div>
3859
  <div class="system-info-content">
3860
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
3861
  </div>
3862
  </div>
3863
 
3864
  <div class="main-content">
3865
  <h1>HF Kernels - Flash Attention</h1>
3866
  <h2>HuggingFace Kernels Flash Attention Benchmark</h2>
3867
+ <div class="cell" id="cell-benchmark">
3868
  <div class="cell-header">
3869
  <span class="collapse-indicators">
3870
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3871
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3872
  <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3873
  </span> |
3874
+ Cell: benchmark | 5.95s
3875
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3877
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3890
  <span class="c1"># ]</span>
3891
  <span class="c1">#</span>
3892
  <span class="c1"># [tool.uv.sources]</span>
3893
+ <span class="c1"># kernels-benchmark-tools = { path = &quot;../../../../../tools&quot;, editable = true }</span>
3894
  <span class="c1"># ///</span>
3895
  <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
3896
  <span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
 
3959
  </div>
3960
  </div>
3961
  <div id="output-benchmark" class="cell-output">
3962
+ <div class="cell-stdout"><pre class="stdout-text">
3963
+ ======================================================================
3964
+ PROFILE TRACE: hf_kernels_flash_attn | flux_L128
3965
+ ======================================================================
3966
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3967
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3968
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3969
+ hf_kernels_flash_attn 8.36% 154.078us 96.88% 1.786ms 1.786ms 0.000us 0.00% 362.493us 362.493us 1
3970
+ _flash_attn_9e27194::fwd 3.99% 73.523us 88.52% 1.632ms 543.906us 271.102us 100.00% 362.493us 120.831us 3
3971
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 272.638us 100.57% 272.638us 272.638us 1
3972
+ void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 271.102us 100.00% 271.102us 90.367us 3
3973
+ Activity Buffer Request 76.97% 1.419ms 76.97% 1.419ms 1.419ms 91.391us 33.71% 91.391us 91.391us 1
3974
+ cudaDeviceGetAttribute 0.25% 4.549us 0.25% 4.549us 0.303us 0.000us 0.00% 0.000us 0.000us 15
3975
+ aten::empty_like 0.95% 17.511us 2.83% 52.153us 17.384us 0.000us 0.00% 0.000us 0.000us 3
3976
+ aten::empty_strided 1.88% 34.642us 1.88% 34.642us 11.547us 0.000us 0.00% 0.000us 0.000us 3
3977
+ aten::empty 1.44% 26.603us 1.44% 26.603us 2.956us 0.000us 0.00% 0.000us 0.000us 9
3978
+ cudaFuncSetAttribute 0.78% 14.320us 0.78% 14.320us 4.773us 0.000us 0.00% 0.000us 0.000us 3
3979
+ cudaLaunchKernel 2.27% 41.882us 2.27% 41.882us 13.961us 0.000us 0.00% 0.000us 0.000us 3
3980
+ cudaDeviceSynchronize 3.12% 57.433us 3.12% 57.433us 57.433us 0.000us 0.00% 0.000us 0.000us 1
3981
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3982
+ Self CPU time total: 1.843ms
3983
+ Self CUDA time total: 271.102us
3984
+
3985
+
3986
+
3987
+ ======================================================================
3988
+ PROFILE TRACE: hf_kernels_flash_attn | flux_L256
3989
+ ======================================================================
3990
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3991
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3992
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3993
+ hf_kernels_flash_attn 6.38% 115.656us 91.71% 1.662ms 1.662ms 0.000us 0.00% 396.671us 396.671us 1
3994
+ _flash_attn_9e27194::fwd 2.82% 51.131us 85.33% 1.547ms 515.555us 298.303us 100.00% 396.671us 132.224us 3
3995
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 299.743us 100.48% 299.743us 299.743us 1
3996
+ void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 298.303us 100.00% 298.303us 99.434us 3
3997
+ Activity Buffer Request 77.99% 1.414ms 77.99% 1.414ms 1.414ms 98.368us 32.98% 98.368us 98.368us 1
3998
+ cudaDeviceGetAttribute 0.22% 3.931us 0.22% 3.931us 0.262us 0.000us 0.00% 0.000us 0.000us 15
3999
+ aten::empty_like 0.40% 7.190us 1.33% 24.041us 8.014us 0.000us 0.00% 0.000us 0.000us 3
4000
+ aten::empty_strided 0.93% 16.851us 0.93% 16.851us 5.617us 0.000us 0.00% 0.000us 0.000us 3
4001
+ aten::empty 1.25% 22.681us 1.25% 22.681us 2.520us 0.000us 0.00% 0.000us 0.000us 9
4002
+ cudaFuncSetAttribute 0.21% 3.730us 0.21% 3.730us 1.243us 0.000us 0.00% 0.000us 0.000us 3
4003
+ cudaLaunchKernel 1.51% 27.451us 1.51% 27.451us 9.150us 0.000us 0.00% 0.000us 0.000us 3
4004
+ cudaDeviceSynchronize 8.29% 150.237us 8.29% 150.237us 150.237us 0.000us 0.00% 0.000us 0.000us 1
4005
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4006
+ Self CPU time total: 1.813ms
4007
+ Self CUDA time total: 298.303us
4008
+
4009
+
4010
+
4011
+ ======================================================================
4012
+ PROFILE TRACE: hf_kernels_flash_attn | flux_L320
4013
+ ======================================================================
4014
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4015
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4016
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4017
+ hf_kernels_flash_attn 6.16% 112.885us 90.78% 1.663ms 1.663ms 0.000us 0.00% 427.613us 427.613us 1
4018
+ _flash_attn_9e27194::fwd 2.80% 51.281us 84.62% 1.550ms 516.788us 318.526us 100.00% 427.613us 142.538us 3
4019
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 319.901us 100.43% 319.901us 319.901us 1
4020
+ void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 318.526us 100.00% 318.526us 106.175us 3
4021
+ Activity Buffer Request 77.28% 1.416ms 77.28% 1.416ms 1.416ms 109.087us 34.25% 109.087us 109.087us 1
4022
+ cudaDeviceGetAttribute 0.21% 3.930us 0.21% 3.930us 0.262us 0.000us 0.00% 0.000us 0.000us 15
4023
+ aten::empty_like 0.41% 7.431us 1.40% 25.731us 8.577us 0.000us 0.00% 0.000us 0.000us 3
4024
+ aten::empty_strided 1.00% 18.300us 1.00% 18.300us 6.100us 0.000us 0.00% 0.000us 0.000us 3
4025
+ aten::empty 1.26% 23.051us 1.26% 23.051us 2.561us 0.000us 0.00% 0.000us 0.000us 9
4026
+ cudaFuncSetAttribute 0.22% 4.001us 0.22% 4.001us 1.334us 0.000us 0.00% 0.000us 0.000us 3
4027
+ cudaLaunchKernel 1.45% 26.532us 1.45% 26.532us 8.844us 0.000us 0.00% 0.000us 0.000us 3
4028
+ cudaDeviceSynchronize 9.22% 168.858us 9.22% 168.858us 168.858us 0.000us 0.00% 0.000us 0.000us 1
4029
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4030
+ Self CPU time total: 1.832ms
4031
+ Self CUDA time total: 318.526us
4032
+
4033
+
4034
+
4035
+ ======================================================================
4036
+ PROFILE TRACE: hf_kernels_flash_attn | flux_L384
4037
+ ======================================================================
4038
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4039
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4040
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4041
+ hf_kernels_flash_attn 5.43% 111.055us 91.19% 1.866ms 1.866ms 0.000us 0.00% 446.776us 446.776us 1
4042
+ _flash_attn_9e27194::fwd 2.54% 51.901us 85.76% 1.755ms 584.928us 331.162us 100.00% 446.776us 148.925us 3
4043
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 332.667us 100.45% 332.667us 332.667us 1
4044
+ void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 331.162us 100.00% 331.162us 110.387us 3
4045
+ Activity Buffer Request 69.78% 1.428ms 69.78% 1.428ms 1.428ms 115.614us 34.91% 115.614us 115.614us 1
4046
+ cudaDeviceGetAttribute 0.19% 3.942us 0.19% 3.942us 0.263us 0.000us 0.00% 0.000us 0.000us 15
4047
+ aten::empty_like 0.39% 8.070us 1.24% 25.461us 8.487us 0.000us 0.00% 0.000us 0.000us 3
4048
+ aten::empty_strided 0.85% 17.391us 0.85% 17.391us 5.797us 0.000us 0.00% 0.000us 0.000us 3
4049
+ aten::empty 1.08% 22.080us 1.08% 22.080us 2.453us 0.000us 0.00% 0.000us 0.000us 9
4050
+ cudaFuncSetAttribute 0.19% 3.861us 0.19% 3.861us 1.287us 0.000us 0.00% 0.000us 0.000us 3
4051
+ cudaLaunchKernel 10.75% 219.880us 10.75% 219.880us 73.293us 0.000us 0.00% 0.000us 0.000us 3
4052
+ cudaDeviceSynchronize 8.81% 180.219us 8.81% 180.219us 180.219us 0.000us 0.00% 0.000us 0.000us 1
4053
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4054
+ Self CPU time total: 2.046ms
4055
+ Self CUDA time total: 331.162us
4056
+
4057
+
4058
+
4059
+ ======================================================================
4060
+ PROFILE TRACE: hf_kernels_flash_attn | flux_L448
4061
+ ======================================================================
4062
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4063
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4064
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4065
+ hf_kernels_flash_attn 4.92% 108.784us 84.29% 1.864ms 1.864ms 0.000us 0.00% 663.288us 663.288us 1
4066
+ _flash_attn_9e27194::fwd 2.26% 49.951us 79.37% 1.755ms 585.135us 493.882us 100.00% 663.288us 221.096us 3
4067
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 495.418us 100.31% 495.418us 495.418us 1
4068
+ void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 493.882us 100.00% 493.882us 164.627us 3
4069
+ Activity Buffer Request 65.22% 1.442ms 65.22% 1.442ms 1.442ms 169.406us 34.30% 169.406us 169.406us 1
4070
+ cudaDeviceGetAttribute 0.18% 3.990us 0.18% 3.990us 0.266us 0.000us 0.00% 0.000us 0.000us 15
4071
+ aten::empty_like 0.34% 7.522us 1.12% 24.742us 8.247us 0.000us 0.00% 0.000us 0.000us 3
4072
+ aten::empty_strided 0.78% 17.220us 0.78% 17.220us 5.740us 0.000us 0.00% 0.000us 0.000us 3
4073
+ aten::empty 0.96% 21.140us 0.96% 21.140us 2.349us 0.000us 0.00% 0.000us 0.000us 9
4074
+ cudaFuncSetAttribute 0.19% 4.121us 0.19% 4.121us 1.374us 0.000us 0.00% 0.000us 0.000us 3
4075
+ cudaLaunchKernel 9.45% 209.092us 9.45% 209.092us 69.697us 0.000us 0.00% 0.000us 0.000us 3
4076
+ cudaDeviceSynchronize 15.71% 347.407us 15.71% 347.407us 347.407us 0.000us 0.00% 0.000us 0.000us 1
4077
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4078
+ Self CPU time total: 2.212ms
4079
+ Self CUDA time total: 493.882us
4080
+
4081
+
4082
+
4083
+ ======================================================================
4084
+ PROFILE TRACE: hf_kernels_flash_attn | flux_L512
4085
+ ======================================================================
4086
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4087
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4088
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4089
+ hf_kernels_flash_attn 4.96% 110.355us 83.23% 1.852ms 1.852ms 0.000us 0.00% 697.540us 697.540us 1
4090
+ _flash_attn_9e27194::fwd 2.27% 50.469us 78.28% 1.742ms 580.665us 518.659us 100.00% 697.540us 232.513us 3
4091
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 520.068us 100.27% 520.068us 520.068us 1
4092
+ void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 518.659us 100.00% 518.659us 172.886us 3
4093
+ Activity Buffer Request 64.27% 1.430ms 64.27% 1.430ms 1.430ms 178.881us 34.49% 178.881us 178.881us 1
4094
+ cudaDeviceGetAttribute 0.17% 3.832us 0.17% 3.832us 0.255us 0.000us 0.00% 0.000us 0.000us 15
4095
+ aten::empty_like 0.33% 7.341us 1.15% 25.571us 8.524us 0.000us 0.00% 0.000us 0.000us 3
4096
+ aten::empty_strided 0.82% 18.230us 0.82% 18.230us 6.077us 0.000us 0.00% 0.000us 0.000us 3
4097
+ aten::empty 0.94% 20.812us 0.94% 20.812us 2.312us 0.000us 0.00% 0.000us 0.000us 9
4098
+ cudaFuncSetAttribute 0.19% 4.171us 0.19% 4.171us 1.390us 0.000us 0.00% 0.000us 0.000us 3
4099
+ cudaLaunchKernel 9.29% 206.809us 9.29% 206.809us 68.936us 0.000us 0.00% 0.000us 0.000us 3
4100
+ cudaDeviceSynchronize 16.77% 373.119us 16.77% 373.119us 373.119us 0.000us 0.00% 0.000us 0.000us 1
4101
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4102
+ Self CPU time total: 2.225ms
4103
+ Self CUDA time total: 518.659us
4104
+
4105
+
4106
+ impl wl p50(ms) ok
4107
+ hf_kernels_flash_attn flux_L128 0.12 True
4108
+ hf_kernels_flash_attn flux_L256 0.14 True
4109
+ hf_kernels_flash_attn flux_L320 0.14 True
4110
+ hf_kernels_flash_attn flux_L384 0.15 True
4111
+ hf_kernels_flash_attn flux_L448 0.20 True
4112
+ hf_kernels_flash_attn flux_L512 0.20 True
4113
+ </pre></div>
4114
+ <div class="cell-stderr">
4115
+ Fetching 20 files: 0%| | 0/20 [00:00&lt;?, ?it/s]
4116
+ Fetching 20 files: 10%|█ | 2/20 [00:01&lt;00:16, 1.08it/s]
4117
+ Fetching 20 files: 100%|██████████| 20/20 [00:01&lt;00:00, 10.78it/s]
4118
+ </div>
4119
+ <div class="cell-artifacts">
4120
+ <h4>Artifacts:</h4>
4121
+ <a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>
4122
  </div>
4123
  </div>
4124
  </div>
flash_attn/impls/hf_kernels_flash_attn3.html CHANGED
@@ -706,6 +706,29 @@
706
  white-space: pre-wrap;
707
  color: var(--text-primary);
708
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
709
  .cell-stderr {
710
  background: var(--bg-error);
711
  border-left: 2px solid var(--border-error);
@@ -3556,7 +3579,12 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3556
  if(output){
3557
  output.classList.remove('output-stale');
3558
  let html='';
3559
- if(data.stdout) html+='<div class="cell-stdout">'+escapeHtml(data.stdout)+'</div>';
 
 
 
 
 
3560
  console.log('UV Logs:', data);
3561
  if(data.stderr) {
3562
  // Split UV logs from regular stderr
@@ -3678,27 +3706,27 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3678
  }
3679
  }
3680
 
3681
- // Live reload functionality (robust SSE handling)
3682
- (function(){
3683
- if (!('EventSource' in window)) {
3684
- console.warn('SSE not supported in this browser');
3685
- return;
3686
- }
3687
- let source = new EventSource('/events');
3688
- let isOpen = false;
3689
- source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
3690
- source.onmessage = function(e){
3691
- const msg=(e.data||'').trim(); if(!msg) return;
3692
- console.log('SSE message:', msg);
3693
- if (msg==='reload' || msg==='incremental') { location.reload(); }
3694
- // Ignore 'loading' to avoid premature reload loops
3695
- };
3696
- source.onerror = function(e){
3697
- // Let EventSource auto-reconnect instead of forcing a reload
3698
- if (isOpen) console.warn('SSE error after open, retrying...', e);
3699
- };
3700
- window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
3701
- })();
3702
 
3703
 
3704
  document.addEventListener('DOMContentLoaded', function() {
@@ -3829,21 +3857,21 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3829
  <div class="system-info">
3830
  <div class="system-info-header">Generated on:</div>
3831
  <div class="system-info-content">
3832
- Linux x86_64 | Linux-5.10.244-240.965.amzn2.x86_64-x86_64-with-glibc2.35
3833
  </div>
3834
  </div>
3835
 
3836
  <div class="main-content">
3837
  <h1>HF Kernels - Flash Attention 3</h1>
3838
  <h2>HuggingFace Kernels Flash Attention 3 Benchmark</h2>
3839
- <div class="cell cell-failed" id="cell-benchmark">
3840
  <div class="cell-header">
3841
  <span class="collapse-indicators">
3842
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3843
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3844
  <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3845
  </span> |
3846
- Cell: benchmark | 0.05s | FAILED
3847
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3848
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3849
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3856,13 +3884,13 @@ Cell: benchmark | 0.05s | FAILED
3856
  <span class="c1"># requires-python = &quot;&gt;=3.10&quot;</span>
3857
  <span class="c1"># dependencies = [</span>
3858
  <span class="c1"># &quot;numpy&quot;,</span>
3859
- <span class="c1"># &quot;torch&quot;,</span>
3860
  <span class="c1"># &quot;kernels-benchmark-tools&quot;,</span>
3861
  <span class="c1"># &quot;kernels&quot;,</span>
3862
  <span class="c1"># ]</span>
3863
  <span class="c1">#</span>
3864
  <span class="c1"># [tool.uv.sources]</span>
3865
- <span class="c1"># kernels-benchmark-tools = { path = &quot;/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools&quot;, editable = true }</span>
3866
  <span class="c1"># ///</span>
3867
  <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
3868
  <span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
@@ -3930,9 +3958,154 @@ Cell: benchmark | 0.05s | FAILED
3930
  </div>
3931
  </div>
3932
  <div id="output-benchmark" class="cell-output">
3933
- <div class="cell-stderr"> × Failed to resolve script requirement
3934
- ╰─▶ Distribution not found at:
3935
- file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3936
  </div>
3937
  </div>
3938
  </div>
 
706
  white-space: pre-wrap;
707
  color: var(--text-primary);
708
  }
709
+
710
+ .cell-stdout {
711
+ background: var(--bg-tertiary);
712
+ padding: 0.75rem;
713
+ border-radius: 1px;
714
+ font-family: inherit;
715
+ font-size: 0.9rem;
716
+ color: var(--text-primary);
717
+
718
+ /* key bits */
719
+ overflow: auto; /* show scrollbars when needed */
720
+ max-width: 100%; /* respects whatever layout width you give it */
721
+ }
722
+
723
+ .cell-stdout .stdout-text {
724
+ margin: 0; /* reset pre default margin */
725
+ white-space: pre; /* keep line breaks, NO wrapping */
726
+ display: inline-block; /* shrink-to-content */
727
+ min-width: max-content; /* allow very long lines to define intrinsic width */
728
+ font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace;
729
+ tab-size: 2;
730
+ }
731
+
732
  .cell-stderr {
733
  background: var(--bg-error);
734
  border-left: 2px solid var(--border-error);
 
3579
  if(output){
3580
  output.classList.remove('output-stale');
3581
  let html='';
3582
+ if (data.stdout) {
3583
+ html += '<div class="cell-stdout"><pre class="stdout-text">'
3584
+ + escapeHtml(data.stdout)
3585
+ + '</pre></div>';
3586
+ }
3587
+
3588
  console.log('UV Logs:', data);
3589
  if(data.stderr) {
3590
  // Split UV logs from regular stderr
 
3706
  }
3707
  }
3708
 
3709
+ // // Live reload functionality (robust SSE handling)
3710
+ // (function(){
3711
+ // if (!('EventSource' in window)) {
3712
+ // console.warn('SSE not supported in this browser');
3713
+ // return;
3714
+ // }
3715
+ // let source = new EventSource('/events');
3716
+ // let isOpen = false;
3717
+ // source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
3718
+ // source.onmessage = function(e){
3719
+ // const msg=(e.data||'').trim(); if(!msg) return;
3720
+ // console.log('SSE message:', msg);
3721
+ // if (msg==='reload' || msg==='incremental') { location.reload(); }
3722
+ // // Ignore 'loading' to avoid premature reload loops
3723
+ // };
3724
+ // source.onerror = function(e){
3725
+ // // Let EventSource auto-reconnect instead of forcing a reload
3726
+ // if (isOpen) console.warn('SSE error after open, retrying...', e);
3727
+ // };
3728
+ // window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
3729
+ // })();
3730
 
3731
 
3732
  document.addEventListener('DOMContentLoaded', function() {
 
3857
  <div class="system-info">
3858
  <div class="system-info-header">Generated on:</div>
3859
  <div class="system-info-content">
3860
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
3861
  </div>
3862
  </div>
3863
 
3864
  <div class="main-content">
3865
  <h1>HF Kernels - Flash Attention 3</h1>
3866
  <h2>HuggingFace Kernels Flash Attention 3 Benchmark</h2>
3867
+ <div class="cell" id="cell-benchmark">
3868
  <div class="cell-header">
3869
  <span class="collapse-indicators">
3870
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3871
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3872
  <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3873
  </span> |
3874
+ Cell: benchmark | 5.65s
3875
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3877
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3884
  <span class="c1"># requires-python = &quot;&gt;=3.10&quot;</span>
3885
  <span class="c1"># dependencies = [</span>
3886
  <span class="c1"># &quot;numpy&quot;,</span>
3887
+ <span class="c1"># &quot;torch==2.8.0&quot;,</span>
3888
  <span class="c1"># &quot;kernels-benchmark-tools&quot;,</span>
3889
  <span class="c1"># &quot;kernels&quot;,</span>
3890
  <span class="c1"># ]</span>
3891
  <span class="c1">#</span>
3892
  <span class="c1"># [tool.uv.sources]</span>
3893
+ <span class="c1"># kernels-benchmark-tools = { path = &quot;../../../../../tools&quot;, editable = true }</span>
3894
  <span class="c1"># ///</span>
3895
  <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
3896
  <span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
 
3958
  </div>
3959
  </div>
3960
  <div id="output-benchmark" class="cell-output">
3961
+ <div class="cell-stdout"><pre class="stdout-text">
3962
+ ======================================================================
3963
+ PROFILE TRACE: hf_kernels_flash_attn3 | flux_L128
3964
+ ======================================================================
3965
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3966
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3967
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3968
+ hf_kernels_flash_attn3 9.00% 178.129us 99.63% 1.971ms 1.971ms 0.000us 0.00% 345.823us 345.823us 1
3969
+ FlashAttnFunc 6.66% 131.797us 90.63% 1.793ms 597.659us 0.000us 0.00% 345.823us 115.274us 3
3970
+ _flash_attn3_48fe103_dirty::fwd 4.56% 90.256us 83.97% 1.661ms 553.727us 259.583us 100.00% 345.823us 115.274us 3
3971
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 292.158us 112.55% 292.158us 292.158us 1
3972
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 259.583us 100.00% 259.583us 86.528us 3
3973
+ Activity Buffer Request 73.82% 1.460ms 73.82% 1.460ms 1.460ms 86.240us 33.22% 86.240us 86.240us 1
3974
+ aten::empty 2.53% 50.052us 2.53% 50.052us 8.342us 0.000us 0.00% 0.000us 0.000us 6
3975
+ cudaFuncSetAttribute 0.86% 16.921us 0.86% 16.921us 5.640us 0.000us 0.00% 0.000us 0.000us 3
3976
+ cudaLaunchKernel 2.20% 43.551us 2.20% 43.551us 14.517us 0.000us 0.00% 0.000us 0.000us 3
3977
+ cudaDeviceSynchronize 0.37% 7.311us 0.37% 7.311us 7.311us 0.000us 0.00% 0.000us 0.000us 1
3978
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3979
+ Self CPU time total: 1.978ms
3980
+ Self CUDA time total: 259.583us
3981
+
3982
+
3983
+
3984
+ ======================================================================
3985
+ PROFILE TRACE: hf_kernels_flash_attn3 | flux_L256
3986
+ ======================================================================
3987
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3988
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3989
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3990
+ hf_kernels_flash_attn3 7.20% 133.787us 96.41% 1.793ms 1.793ms 0.000us 0.00% 393.753us 393.753us 1
3991
+ FlashAttnFunc 5.05% 93.854us 89.22% 1.659ms 552.953us 0.000us 0.00% 393.753us 131.251us 3
3992
+ _flash_attn3_48fe103_dirty::fwd 2.68% 49.913us 84.17% 1.565ms 521.669us 293.595us 100.00% 393.753us 131.251us 3
3993
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 295.003us 100.48% 295.003us 295.003us 1
3994
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 293.595us 100.00% 293.595us 97.865us 3
3995
+ Activity Buffer Request 78.08% 1.452ms 78.08% 1.452ms 1.452ms 100.158us 34.11% 100.158us 100.158us 1
3996
+ aten::empty 1.44% 26.770us 1.44% 26.770us 4.462us 0.000us 0.00% 0.000us 0.000us 6
3997
+ cudaFuncSetAttribute 0.31% 5.680us 0.31% 5.680us 1.893us 0.000us 0.00% 0.000us 0.000us 3
3998
+ cudaLaunchKernel 1.66% 30.852us 1.66% 30.852us 10.284us 0.000us 0.00% 0.000us 0.000us 3
3999
+ cudaDeviceSynchronize 3.59% 66.713us 3.59% 66.713us 66.713us 0.000us 0.00% 0.000us 0.000us 1
4000
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4001
+ Self CPU time total: 1.859ms
4002
+ Self CUDA time total: 293.595us
4003
+
4004
+
4005
+
4006
+ ======================================================================
4007
+ PROFILE TRACE: hf_kernels_flash_attn3 | flux_L320
4008
+ ======================================================================
4009
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4010
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4011
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4012
+ hf_kernels_flash_attn3 6.76% 125.695us 94.13% 1.750ms 1.750ms 0.000us 0.00% 430.748us 430.748us 1
4013
+ FlashAttnFunc 4.90% 91.016us 87.37% 1.624ms 541.277us 0.000us 0.00% 430.748us 143.583us 3
4014
+ _flash_attn3_48fe103_dirty::fwd 2.79% 51.770us 82.47% 1.533ms 510.938us 324.541us 100.00% 430.748us 143.583us 3
4015
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 325.948us 100.43% 325.948us 325.948us 1
4016
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 324.541us 100.00% 324.541us 108.180us 3
4017
+ Activity Buffer Request 76.46% 1.421ms 76.46% 1.421ms 1.421ms 106.207us 32.73% 106.207us 106.207us 1
4018
+ aten::empty 1.41% 26.162us 1.41% 26.162us 4.360us 0.000us 0.00% 0.000us 0.000us 6
4019
+ cudaFuncSetAttribute 0.27% 5.061us 0.27% 5.061us 1.687us 0.000us 0.00% 0.000us 0.000us 3
4020
+ cudaLaunchKernel 1.55% 28.862us 1.55% 28.862us 9.621us 0.000us 0.00% 0.000us 0.000us 3
4021
+ cudaDeviceSynchronize 5.87% 109.015us 5.87% 109.015us 109.015us 0.000us 0.00% 0.000us 0.000us 1
4022
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4023
+ Self CPU time total: 1.859ms
4024
+ Self CUDA time total: 324.541us
4025
+
4026
+
4027
+
4028
+ ======================================================================
4029
+ PROFILE TRACE: hf_kernels_flash_attn3 | flux_L384
4030
+ ======================================================================
4031
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4032
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4033
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4034
+ hf_kernels_flash_attn3 6.04% 124.874us 95.07% 1.964ms 1.964ms 0.000us 0.00% 429.567us 429.567us 1
4035
+ FlashAttnFunc 4.57% 94.345us 89.03% 1.840ms 613.174us 0.000us 0.00% 429.567us 143.189us 3
4036
+ _flash_attn3_48fe103_dirty::fwd 2.60% 53.754us 84.46% 1.745ms 581.725us 322.591us 100.00% 429.567us 143.189us 3
4037
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 324.063us 100.46% 324.063us 324.063us 1
4038
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 322.591us 100.00% 322.591us 107.530us 3
4039
+ Activity Buffer Request 69.43% 1.434ms 69.43% 1.434ms 1.434ms 106.976us 33.16% 106.976us 106.976us 1
4040
+ aten::empty 1.29% 26.591us 1.29% 26.591us 4.432us 0.000us 0.00% 0.000us 0.000us 6
4041
+ cudaFuncSetAttribute 0.25% 5.220us 0.25% 5.220us 1.740us 0.000us 0.00% 0.000us 0.000us 3
4042
+ cudaLaunchKernel 10.90% 225.141us 10.90% 225.141us 75.047us 0.000us 0.00% 0.000us 0.000us 3
4043
+ cudaDeviceSynchronize 4.93% 101.805us 4.93% 101.805us 101.805us 0.000us 0.00% 0.000us 0.000us 1
4044
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4045
+ Self CPU time total: 2.066ms
4046
+ Self CUDA time total: 322.591us
4047
+
4048
+
4049
+
4050
+ ======================================================================
4051
+ PROFILE TRACE: hf_kernels_flash_attn3 | flux_L448
4052
+ ======================================================================
4053
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4054
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4055
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4056
+ hf_kernels_flash_attn3 5.77% 124.745us 87.87% 1.900ms 1.900ms 0.000us 0.00% 654.301us 654.301us 1
4057
+ FlashAttnFunc 4.37% 94.576us 82.10% 1.775ms 591.589us 0.000us 0.00% 654.301us 218.100us 3
4058
+ _flash_attn3_48fe103_dirty::fwd 2.37% 51.203us 77.72% 1.680ms 560.064us 488.670us 100.00% 654.301us 218.100us 3
4059
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 490.142us 100.30% 490.142us 490.142us 1
4060
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 488.670us 100.00% 488.670us 162.890us 3
4061
+ Activity Buffer Request 66.37% 1.435ms 66.37% 1.435ms 1.435ms 165.631us 33.89% 165.631us 165.631us 1
4062
+ aten::empty 1.25% 26.990us 1.25% 26.990us 4.498us 0.000us 0.00% 0.000us 0.000us 6
4063
+ cudaFuncSetAttribute 0.24% 5.250us 0.24% 5.250us 1.750us 0.000us 0.00% 0.000us 0.000us 3
4064
+ cudaLaunchKernel 7.49% 161.858us 7.49% 161.858us 53.953us 0.000us 0.00% 0.000us 0.000us 3
4065
+ cudaDeviceSynchronize 12.13% 262.313us 12.13% 262.313us 262.313us 0.000us 0.00% 0.000us 0.000us 1
4066
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4067
+ Self CPU time total: 2.162ms
4068
+ Self CUDA time total: 488.670us
4069
+
4070
+
4071
+
4072
+ ======================================================================
4073
+ PROFILE TRACE: hf_kernels_flash_attn3 | flux_L512
4074
+ ======================================================================
4075
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4076
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4077
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4078
+ hf_kernels_flash_attn3 5.69% 119.216us 86.59% 1.815ms 1.815ms 0.000us 0.00% 666.625us 666.625us 1
4079
+ FlashAttnFunc 4.40% 92.224us 80.91% 1.696ms 565.401us 0.000us 0.00% 666.625us 222.208us 3
4080
+ _flash_attn3_48fe103_dirty::fwd 2.44% 51.234us 76.51% 1.604ms 534.659us 497.473us 100.00% 666.625us 222.208us 3
4081
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 498.849us 100.28% 498.849us 498.849us 1
4082
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 497.473us 100.00% 497.473us 165.824us 3
4083
+ Activity Buffer Request 64.99% 1.363ms 64.99% 1.363ms 1.363ms 169.152us 34.00% 169.152us 169.152us 1
4084
+ aten::empty 1.25% 26.300us 1.25% 26.300us 4.383us 0.000us 0.00% 0.000us 0.000us 6
4085
+ cudaFuncSetAttribute 0.27% 5.600us 0.27% 5.600us 1.867us 0.000us 0.00% 0.000us 0.000us 3
4086
+ cudaLaunchKernel 7.55% 158.288us 7.55% 158.288us 52.763us 0.000us 0.00% 0.000us 0.000us 3
4087
+ cudaDeviceSynchronize 13.41% 281.113us 13.41% 281.113us 281.113us 0.000us 0.00% 0.000us 0.000us 1
4088
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4089
+ Self CPU time total: 2.097ms
4090
+ Self CUDA time total: 497.473us
4091
+
4092
+
4093
+ impl wl p50(ms) ok
4094
+ hf_kernels_flash_attn3 flux_L128 0.13 True
4095
+ hf_kernels_flash_attn3 flux_L256 0.15 True
4096
+ hf_kernels_flash_attn3 flux_L320 0.16 True
4097
+ hf_kernels_flash_attn3 flux_L384 0.16 True
4098
+ hf_kernels_flash_attn3 flux_L448 0.21 True
4099
+ hf_kernels_flash_attn3 flux_L512 0.21 True
4100
+ </pre></div>
4101
+ <div class="cell-stderr">
4102
+ Fetching 4 files: 0%| | 0/4 [00:00&lt;?, ?it/s]
4103
+ Fetching 4 files: 50%|█████ | 2/4 [00:01&lt;00:01, 1.23it/s]
4104
+ Fetching 4 files: 100%|██████████| 4/4 [00:01&lt;00:00, 2.46it/s]
4105
+ </div>
4106
+ <div class="cell-artifacts">
4107
+ <h4>Artifacts:</h4>
4108
+ <a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>
4109
  </div>
4110
  </div>
4111
  </div>
flash_attn/impls/mem_efficient_attention.html CHANGED
@@ -706,6 +706,29 @@
706
  white-space: pre-wrap;
707
  color: var(--text-primary);
708
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
709
  .cell-stderr {
710
  background: var(--bg-error);
711
  border-left: 2px solid var(--border-error);
@@ -3556,7 +3579,12 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3556
  if(output){
3557
  output.classList.remove('output-stale');
3558
  let html='';
3559
- if(data.stdout) html+='<div class="cell-stdout">'+escapeHtml(data.stdout)+'</div>';
 
 
 
 
 
3560
  console.log('UV Logs:', data);
3561
  if(data.stderr) {
3562
  // Split UV logs from regular stderr
@@ -3678,27 +3706,27 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3678
  }
3679
  }
3680
 
3681
- // Live reload functionality (robust SSE handling)
3682
- (function(){
3683
- if (!('EventSource' in window)) {
3684
- console.warn('SSE not supported in this browser');
3685
- return;
3686
- }
3687
- let source = new EventSource('/events');
3688
- let isOpen = false;
3689
- source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
3690
- source.onmessage = function(e){
3691
- const msg=(e.data||'').trim(); if(!msg) return;
3692
- console.log('SSE message:', msg);
3693
- if (msg==='reload' || msg==='incremental') { location.reload(); }
3694
- // Ignore 'loading' to avoid premature reload loops
3695
- };
3696
- source.onerror = function(e){
3697
- // Let EventSource auto-reconnect instead of forcing a reload
3698
- if (isOpen) console.warn('SSE error after open, retrying...', e);
3699
- };
3700
- window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
3701
- })();
3702
 
3703
 
3704
  document.addEventListener('DOMContentLoaded', function() {
@@ -3829,21 +3857,21 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3829
  <div class="system-info">
3830
  <div class="system-info-header">Generated on:</div>
3831
  <div class="system-info-content">
3832
- Linux x86_64 | Linux-5.10.244-240.965.amzn2.x86_64-x86_64-with-glibc2.35
3833
  </div>
3834
  </div>
3835
 
3836
  <div class="main-content">
3837
  <h1>Memory Efficient Attention Implementation</h1>
3838
  <h2>Memory Efficient SDPA Benchmark</h2>
3839
- <div class="cell cell-failed" id="cell-benchmark">
3840
  <div class="cell-header">
3841
  <span class="collapse-indicators">
3842
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3843
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3844
  <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3845
  </span> |
3846
- Cell: benchmark | 0.01s | FAILED
3847
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3848
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3849
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3855,12 +3883,12 @@ Cell: benchmark | 0.01s | FAILED
3855
  <span class="c1"># requires-python = &quot;&gt;=3.10&quot;</span>
3856
  <span class="c1"># dependencies = [</span>
3857
  <span class="c1"># &quot;numpy&quot;,</span>
3858
- <span class="c1"># &quot;torch&quot;,</span>
3859
  <span class="c1"># &quot;kernels-benchmark-tools&quot;,</span>
3860
  <span class="c1"># ]</span>
3861
  <span class="c1">#</span>
3862
  <span class="c1"># [tool.uv.sources]</span>
3863
- <span class="c1"># kernels-benchmark-tools = { path = &quot;/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools&quot;, editable = true }</span>
3864
  <span class="c1"># ///</span>
3865
  <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
3866
  <span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
@@ -3926,9 +3954,203 @@ Cell: benchmark | 0.01s | FAILED
3926
  </div>
3927
  </div>
3928
  <div id="output-benchmark" class="cell-output">
3929
- <div class="cell-stderr"> × Failed to resolve script requirement
3930
- ╰─▶ Distribution not found at:
3931
- file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3932
  </div>
3933
  </div>
3934
  </div>
 
706
  white-space: pre-wrap;
707
  color: var(--text-primary);
708
  }
709
+
710
+ .cell-stdout {
711
+ background: var(--bg-tertiary);
712
+ padding: 0.75rem;
713
+ border-radius: 1px;
714
+ font-family: inherit;
715
+ font-size: 0.9rem;
716
+ color: var(--text-primary);
717
+
718
+ /* key bits */
719
+ overflow: auto; /* show scrollbars when needed */
720
+ max-width: 100%; /* respects whatever layout width you give it */
721
+ }
722
+
723
+ .cell-stdout .stdout-text {
724
+ margin: 0; /* reset pre default margin */
725
+ white-space: pre; /* keep line breaks, NO wrapping */
726
+ display: inline-block; /* shrink-to-content */
727
+ min-width: max-content; /* allow very long lines to define intrinsic width */
728
+ font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace;
729
+ tab-size: 2;
730
+ }
731
+
732
  .cell-stderr {
733
  background: var(--bg-error);
734
  border-left: 2px solid var(--border-error);
 
3579
  if(output){
3580
  output.classList.remove('output-stale');
3581
  let html='';
3582
+ if (data.stdout) {
3583
+ html += '<div class="cell-stdout"><pre class="stdout-text">'
3584
+ + escapeHtml(data.stdout)
3585
+ + '</pre></div>';
3586
+ }
3587
+
3588
  console.log('UV Logs:', data);
3589
  if(data.stderr) {
3590
  // Split UV logs from regular stderr
 
3706
  }
3707
  }
3708
 
3709
+ // // Live reload functionality (robust SSE handling)
3710
+ // (function(){
3711
+ // if (!('EventSource' in window)) {
3712
+ // console.warn('SSE not supported in this browser');
3713
+ // return;
3714
+ // }
3715
+ // let source = new EventSource('/events');
3716
+ // let isOpen = false;
3717
+ // source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
3718
+ // source.onmessage = function(e){
3719
+ // const msg=(e.data||'').trim(); if(!msg) return;
3720
+ // console.log('SSE message:', msg);
3721
+ // if (msg==='reload' || msg==='incremental') { location.reload(); }
3722
+ // // Ignore 'loading' to avoid premature reload loops
3723
+ // };
3724
+ // source.onerror = function(e){
3725
+ // // Let EventSource auto-reconnect instead of forcing a reload
3726
+ // if (isOpen) console.warn('SSE error after open, retrying...', e);
3727
+ // };
3728
+ // window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
3729
+ // })();
3730
 
3731
 
3732
  document.addEventListener('DOMContentLoaded', function() {
 
3857
  <div class="system-info">
3858
  <div class="system-info-header">Generated on:</div>
3859
  <div class="system-info-content">
3860
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
3861
  </div>
3862
  </div>
3863
 
3864
  <div class="main-content">
3865
  <h1>Memory Efficient Attention Implementation</h1>
3866
  <h2>Memory Efficient SDPA Benchmark</h2>
3867
+ <div class="cell" id="cell-benchmark">
3868
  <div class="cell-header">
3869
  <span class="collapse-indicators">
3870
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3871
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3872
  <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3873
  </span> |
3874
+ Cell: benchmark | 3.60s
3875
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3877
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3883
  <span class="c1"># requires-python = &quot;&gt;=3.10&quot;</span>
3884
  <span class="c1"># dependencies = [</span>
3885
  <span class="c1"># &quot;numpy&quot;,</span>
3886
+ <span class="c1"># &quot;torch==2.8.0&quot;,</span>
3887
  <span class="c1"># &quot;kernels-benchmark-tools&quot;,</span>
3888
  <span class="c1"># ]</span>
3889
  <span class="c1">#</span>
3890
  <span class="c1"># [tool.uv.sources]</span>
3891
+ <span class="c1"># kernels-benchmark-tools = { path = &quot;../../../../../tools&quot;, editable = true }</span>
3892
  <span class="c1"># ///</span>
3893
  <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
3894
  <span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
 
3954
  </div>
3955
  </div>
3956
  <div id="output-benchmark" class="cell-output">
3957
+ <div class="cell-stdout"><pre class="stdout-text">
3958
+ ======================================================================
3959
+ PROFILE TRACE: torch_mem_eff | flux_L128
3960
+ ======================================================================
3961
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3962
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3963
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3964
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 743.839us 143.68% 743.839us 743.839us 1
3965
+ torch_mem_eff 14.97% 353.534us 98.94% 2.336ms 2.336ms 0.000us 0.00% 525.535us 525.535us 1
3966
+ aten::scaled_dot_product_attention 1.34% 31.582us 7.53% 177.879us 59.293us 0.000us 0.00% 451.039us 150.346us 3
3967
+ aten::_scaled_dot_product_efficient_attention 0.99% 23.447us 6.20% 146.297us 48.766us 0.000us 0.00% 451.039us 150.346us 3
3968
+ aten::_efficient_attention_forward 1.49% 35.270us 4.27% 100.806us 33.602us 451.039us 87.12% 451.039us 150.346us 3
3969
+ fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 451.039us 87.12% 451.039us 150.346us 3
3970
+ aten::contiguous 0.56% 13.241us 73.52% 1.736ms 192.899us 0.000us 0.00% 74.496us 8.277us 9
3971
+ aten::clone 1.47% 34.702us 72.96% 1.723ms 191.428us 0.000us 0.00% 74.496us 8.277us 9
3972
+ aten::copy_ 3.23% 76.247us 68.33% 1.614ms 179.290us 66.656us 12.88% 74.496us 8.277us 9
3973
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 66.656us 12.88% 66.656us 7.406us 9
3974
+ Activity Buffer Request 61.73% 1.458ms 61.73% 1.458ms 1.458ms 7.840us 1.51% 7.840us 7.840us 1
3975
+ aten::transpose 2.92% 68.989us 3.85% 90.910us 3.788us 0.000us 0.00% 0.000us 0.000us 24
3976
+ aten::as_strided 0.93% 21.921us 0.93% 21.921us 0.913us 0.000us 0.00% 0.000us 0.000us 24
3977
+ aten::empty_like 0.77% 18.239us 3.16% 74.542us 8.282us 0.000us 0.00% 0.000us 0.000us 9
3978
+ aten::empty 3.59% 84.706us 3.59% 84.706us 4.034us 0.000us 0.00% 0.000us 0.000us 21
3979
+ cudaLaunchKernel 4.35% 102.715us 4.35% 102.715us 8.560us 0.000us 0.00% 0.000us 0.000us 12
3980
+ cudaStreamIsCapturing 0.16% 3.710us 0.16% 3.710us 1.237us 0.000us 0.00% 0.000us 0.000us 3
3981
+ cudaFuncSetAttribute 0.44% 10.440us 0.44% 10.440us 3.480us 0.000us 0.00% 0.000us 0.000us 3
3982
+ cudaDeviceSynchronize 1.06% 24.961us 1.06% 24.961us 24.961us 0.000us 0.00% 0.000us 0.000us 1
3983
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3984
+ Self CPU time total: 2.361ms
3985
+ Self CUDA time total: 517.695us
3986
+
3987
+
3988
+
3989
+ ======================================================================
3990
+ PROFILE TRACE: torch_mem_eff | flux_L256
3991
+ ======================================================================
3992
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3993
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3994
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3995
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 704.155us 121.71% 704.155us 704.155us 1
3996
+ torch_mem_eff 11.29% 250.325us 93.54% 2.073ms 2.073ms 0.000us 0.00% 586.972us 586.972us 1
3997
+ aten::scaled_dot_product_attention 0.83% 18.299us 6.32% 139.996us 46.665us 0.000us 0.00% 507.229us 169.076us 3
3998
+ aten::_scaled_dot_product_efficient_attention 0.91% 20.123us 5.49% 121.697us 40.566us 0.000us 0.00% 507.229us 169.076us 3
3999
+ aten::_efficient_attention_forward 1.32% 29.201us 3.61% 80.034us 26.678us 507.229us 87.67% 507.229us 169.076us 3
4000
+ fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 507.229us 87.67% 507.229us 169.076us 3
4001
+ aten::contiguous 0.32% 7.068us 74.05% 1.641ms 182.386us 0.000us 0.00% 79.743us 8.860us 9
4002
+ aten::clone 1.01% 22.352us 73.73% 1.634ms 181.601us 0.000us 0.00% 79.743us 8.860us 9
4003
+ aten::copy_ 2.89% 63.964us 70.44% 1.562ms 173.503us 71.327us 12.33% 79.743us 8.860us 9
4004
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 71.327us 12.33% 71.327us 7.925us 9
4005
+ Activity Buffer Request 64.67% 1.433ms 64.67% 1.433ms 1.433ms 8.416us 1.45% 8.416us 8.416us 1
4006
+ aten::transpose 2.15% 47.759us 2.85% 63.231us 2.635us 0.000us 0.00% 0.000us 0.000us 24
4007
+ aten::as_strided 0.70% 15.472us 0.70% 15.472us 0.645us 0.000us 0.00% 0.000us 0.000us 24
4008
+ aten::empty_like 0.52% 11.480us 2.28% 50.532us 5.615us 0.000us 0.00% 0.000us 0.000us 9
4009
+ aten::empty 2.90% 64.203us 2.90% 64.203us 3.057us 0.000us 0.00% 0.000us 0.000us 21
4010
+ cudaLaunchKernel 3.80% 84.195us 3.80% 84.195us 7.016us 0.000us 0.00% 0.000us 0.000us 12
4011
+ cudaStreamIsCapturing 0.10% 2.170us 0.10% 2.170us 0.723us 0.000us 0.00% 0.000us 0.000us 3
4012
+ cudaFuncSetAttribute 0.15% 3.380us 0.15% 3.380us 1.127us 0.000us 0.00% 0.000us 0.000us 3
4013
+ cudaDeviceSynchronize 6.46% 143.197us 6.46% 143.197us 143.197us 0.000us 0.00% 0.000us 0.000us 1
4014
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4015
+ Self CPU time total: 2.217ms
4016
+ Self CUDA time total: 578.556us
4017
+
4018
+
4019
+
4020
+ ======================================================================
4021
+ PROFILE TRACE: torch_mem_eff | flux_L320
4022
+ ======================================================================
4023
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4024
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4025
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4026
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 741.345us 118.22% 741.345us 741.345us 1
4027
+ torch_mem_eff 10.83% 244.352us 91.98% 2.075ms 2.075ms 0.000us 0.00% 636.768us 636.768us 1
4028
+ aten::scaled_dot_product_attention 0.80% 18.001us 6.18% 139.437us 46.479us 0.000us 0.00% 543.969us 181.323us 3
4029
+ aten::_scaled_dot_product_efficient_attention 0.80% 18.160us 5.38% 121.436us 40.479us 0.000us 0.00% 543.969us 181.323us 3
4030
+ aten::_efficient_attention_forward 1.26% 28.484us 3.53% 79.573us 26.524us 543.969us 86.74% 543.969us 181.323us 3
4031
+ fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 543.969us 86.74% 543.969us 181.323us 3
4032
+ aten::contiguous 0.34% 7.591us 72.87% 1.644ms 182.689us 0.000us 0.00% 92.799us 10.311us 9
4033
+ aten::clone 1.02% 22.973us 72.53% 1.637ms 181.846us 0.000us 0.00% 92.799us 10.311us 9
4034
+ aten::copy_ 2.84% 64.004us 69.28% 1.563ms 173.686us 83.135us 13.26% 92.799us 10.311us 9
4035
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 83.135us 13.26% 83.135us 9.237us 9
4036
+ Activity Buffer Request 63.58% 1.435ms 63.58% 1.435ms 1.435ms 9.664us 1.54% 9.664us 9.664us 1
4037
+ aten::transpose 2.42% 54.684us 3.15% 71.104us 2.963us 0.000us 0.00% 0.000us 0.000us 24
4038
+ aten::as_strided 0.73% 16.420us 0.73% 16.420us 0.684us 0.000us 0.00% 0.000us 0.000us 24
4039
+ aten::empty_like 0.53% 12.038us 2.24% 50.461us 5.607us 0.000us 0.00% 0.000us 0.000us 9
4040
+ aten::empty 2.78% 62.772us 2.78% 62.772us 2.989us 0.000us 0.00% 0.000us 0.000us 21
4041
+ cudaLaunchKernel 3.80% 85.752us 3.80% 85.752us 7.146us 0.000us 0.00% 0.000us 0.000us 12
4042
+ cudaStreamIsCapturing 0.10% 2.260us 0.10% 2.260us 0.753us 0.000us 0.00% 0.000us 0.000us 3
4043
+ cudaFuncSetAttribute 0.15% 3.330us 0.15% 3.330us 1.110us 0.000us 0.00% 0.000us 0.000us 3
4044
+ cudaDeviceSynchronize 8.02% 181.009us 8.02% 181.009us 181.009us 0.000us 0.00% 0.000us 0.000us 1
4045
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4046
+ Self CPU time total: 2.256ms
4047
+ Self CUDA time total: 627.104us
4048
+
4049
+
4050
+
4051
+ ======================================================================
4052
+ PROFILE TRACE: torch_mem_eff | flux_L384
4053
+ ======================================================================
4054
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4055
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4056
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4057
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 762.814us 117.08% 762.814us 762.814us 1
4058
+ torch_mem_eff 10.94% 270.925us 93.63% 2.319ms 2.319ms 0.000us 0.00% 663.068us 663.068us 1
4059
+ aten::scaled_dot_product_attention 0.75% 18.610us 6.03% 149.368us 49.789us 0.000us 0.00% 560.285us 186.762us 3
4060
+ aten::_scaled_dot_product_efficient_attention 0.84% 20.750us 5.28% 130.758us 43.586us 0.000us 0.00% 560.285us 186.762us 3
4061
+ aten::_efficient_attention_forward 1.24% 30.680us 3.47% 85.933us 28.644us 560.285us 85.99% 560.285us 186.762us 3
4062
+ fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 560.285us 85.99% 560.285us 186.762us 3
4063
+ aten::contiguous 0.34% 8.310us 74.76% 1.851ms 205.718us 0.000us 0.00% 102.783us 11.420us 9
4064
+ aten::clone 0.93% 23.120us 74.43% 1.843ms 204.794us 0.000us 0.00% 102.783us 11.420us 9
4065
+ aten::copy_ 2.76% 68.243us 71.46% 1.770ms 196.615us 91.263us 14.01% 102.783us 11.420us 9
4066
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 91.263us 14.01% 91.263us 10.140us 9
4067
+ Activity Buffer Request 57.69% 1.429ms 57.69% 1.429ms 1.429ms 11.520us 1.77% 11.520us 11.520us 1
4068
+ aten::transpose 2.18% 53.884us 2.86% 70.837us 2.952us 0.000us 0.00% 0.000us 0.000us 24
4069
+ aten::as_strided 0.68% 16.953us 0.68% 16.953us 0.706us 0.000us 0.00% 0.000us 0.000us 24
4070
+ aten::empty_like 0.46% 11.381us 2.04% 50.492us 5.610us 0.000us 0.00% 0.000us 0.000us 9
4071
+ aten::empty 2.62% 64.842us 2.62% 64.842us 3.088us 0.000us 0.00% 0.000us 0.000us 21
4072
+ cudaLaunchKernel 11.97% 296.414us 11.97% 296.414us 24.701us 0.000us 0.00% 0.000us 0.000us 12
4073
+ cudaStreamIsCapturing 0.10% 2.540us 0.10% 2.540us 0.847us 0.000us 0.00% 0.000us 0.000us 3
4074
+ cudaFuncSetAttribute 0.13% 3.261us 0.13% 3.261us 1.087us 0.000us 0.00% 0.000us 0.000us 3
4075
+ cudaDeviceSynchronize 6.37% 157.857us 6.37% 157.857us 157.857us 0.000us 0.00% 0.000us 0.000us 1
4076
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4077
+ Self CPU time total: 2.476ms
4078
+ Self CUDA time total: 651.548us
4079
+
4080
+
4081
+
4082
+ ======================================================================
4083
+ PROFILE TRACE: torch_mem_eff | flux_L448
4084
+ ======================================================================
4085
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4086
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4087
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4088
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 811.582us 115.69% 811.582us 811.582us 1
4089
+ torch_mem_eff 10.28% 258.922us 90.15% 2.271ms 2.271ms 0.000us 0.00% 712.095us 712.095us 1
4090
+ aten::scaled_dot_product_attention 0.74% 18.760us 5.47% 137.886us 45.962us 0.000us 0.00% 611.487us 203.829us 3
4091
+ aten::_scaled_dot_product_efficient_attention 0.72% 18.189us 4.73% 119.126us 39.709us 0.000us 0.00% 611.487us 203.829us 3
4092
+ aten::_efficient_attention_forward 1.11% 28.033us 3.12% 78.704us 26.235us 611.487us 87.16% 611.487us 203.829us 3
4093
+ fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 611.487us 87.16% 611.487us 203.829us 3
4094
+ aten::contiguous 0.29% 7.191us 72.68% 1.831ms 203.401us 0.000us 0.00% 100.608us 11.179us 9
4095
+ aten::clone 0.89% 22.393us 72.40% 1.823ms 202.602us 0.000us 0.00% 100.608us 11.179us 9
4096
+ aten::copy_ 2.57% 64.604us 69.47% 1.750ms 194.423us 90.048us 12.84% 100.608us 11.179us 9
4097
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 90.048us 12.84% 90.048us 10.005us 9
4098
+ Activity Buffer Request 58.13% 1.464ms 58.13% 1.464ms 1.464ms 10.560us 1.51% 10.560us 10.560us 1
4099
+ aten::transpose 1.95% 49.033us 2.60% 65.375us 2.724us 0.000us 0.00% 0.000us 0.000us 24
4100
+ aten::as_strided 0.65% 16.342us 0.65% 16.342us 0.681us 0.000us 0.00% 0.000us 0.000us 24
4101
+ aten::empty_like 0.51% 12.912us 2.03% 51.223us 5.691us 0.000us 0.00% 0.000us 0.000us 9
4102
+ aten::empty 2.50% 62.890us 2.50% 62.890us 2.995us 0.000us 0.00% 0.000us 0.000us 21
4103
+ cudaLaunchKernel 9.59% 241.441us 9.59% 241.441us 20.120us 0.000us 0.00% 0.000us 0.000us 12
4104
+ cudaStreamIsCapturing 0.09% 2.220us 0.09% 2.220us 0.740us 0.000us 0.00% 0.000us 0.000us 3
4105
+ cudaFuncSetAttribute 0.14% 3.650us 0.14% 3.650us 1.217us 0.000us 0.00% 0.000us 0.000us 3
4106
+ cudaDeviceSynchronize 9.85% 248.062us 9.85% 248.062us 248.062us 0.000us 0.00% 0.000us 0.000us 1
4107
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4108
+ Self CPU time total: 2.519ms
4109
+ Self CUDA time total: 701.535us
4110
+
4111
+
4112
+
4113
+ ======================================================================
4114
+ PROFILE TRACE: torch_mem_eff | flux_L512
4115
+ ======================================================================
4116
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4117
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4118
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4119
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 955.976us 112.33% 955.976us 955.976us 1
4120
+ torch_mem_eff 9.37% 248.255us 85.34% 2.262ms 2.262ms 0.000us 0.00% 865.703us 865.703us 1
4121
+ aten::scaled_dot_product_attention 0.68% 17.990us 5.29% 140.316us 46.772us 0.000us 0.00% 738.854us 246.285us 3
4122
+ aten::_scaled_dot_product_efficient_attention 0.72% 19.111us 4.61% 122.326us 40.775us 0.000us 0.00% 738.854us 246.285us 3
4123
+ aten::_efficient_attention_forward 1.10% 29.141us 2.98% 78.926us 26.309us 738.854us 86.81% 738.854us 246.285us 3
4124
+ fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 738.854us 86.81% 738.854us 246.285us 3
4125
+ aten::contiguous 0.28% 7.521us 68.87% 1.825ms 202.832us 0.000us 0.00% 126.849us 14.094us 9
4126
+ aten::clone 0.86% 22.848us 68.58% 1.818ms 201.996us 0.000us 0.00% 126.849us 14.094us 9
4127
+ aten::copy_ 2.53% 66.983us 65.79% 1.744ms 193.757us 112.225us 13.19% 126.849us 14.094us 9
4128
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 112.225us 13.19% 112.225us 12.469us 9
4129
+ Activity Buffer Request 55.19% 1.463ms 55.19% 1.463ms 1.463ms 14.624us 1.72% 14.624us 14.624us 1
4130
+ aten::transpose 2.08% 55.231us 2.73% 72.342us 3.014us 0.000us 0.00% 0.000us 0.000us 24
4131
+ aten::as_strided 0.65% 17.111us 0.65% 17.111us 0.713us 0.000us 0.00% 0.000us 0.000us 24
4132
+ aten::empty_like 0.44% 11.730us 1.94% 51.302us 5.700us 0.000us 0.00% 0.000us 0.000us 9
4133
+ aten::empty 2.40% 63.653us 2.40% 63.653us 3.031us 0.000us 0.00% 0.000us 0.000us 21
4134
+ cudaLaunchKernel 8.85% 234.503us 8.85% 234.503us 19.542us 0.000us 0.00% 0.000us 0.000us 12
4135
+ cudaStreamIsCapturing 0.08% 2.150us 0.08% 2.150us 0.717us 0.000us 0.00% 0.000us 0.000us 3
4136
+ cudaFuncSetAttribute 0.11% 2.981us 0.11% 2.981us 0.994us 0.000us 0.00% 0.000us 0.000us 3
4137
+ cudaDeviceSynchronize 14.66% 388.669us 14.66% 388.669us 388.669us 0.000us 0.00% 0.000us 0.000us 1
4138
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4139
+ Self CPU time total: 2.651ms
4140
+ Self CUDA time total: 851.079us
4141
+
4142
+
4143
+ impl wl p50(ms) ok
4144
+ torch_mem_eff flux_L128 0.23 True
4145
+ torch_mem_eff flux_L256 0.26 True
4146
+ torch_mem_eff flux_L320 0.28 True
4147
+ torch_mem_eff flux_L384 0.28 True
4148
+ torch_mem_eff flux_L448 0.30 True
4149
+ torch_mem_eff flux_L512 0.34 True
4150
+ </pre></div>
4151
+ <div class="cell-artifacts">
4152
+ <h4>Artifacts:</h4>
4153
+ <a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>
4154
  </div>
4155
  </div>
4156
  </div>
flash_attn/impls/sage_attention.html CHANGED
@@ -706,6 +706,29 @@
706
  white-space: pre-wrap;
707
  color: var(--text-primary);
708
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
709
  .cell-stderr {
710
  background: var(--bg-error);
711
  border-left: 2px solid var(--border-error);
@@ -3556,7 +3579,12 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3556
  if(output){
3557
  output.classList.remove('output-stale');
3558
  let html='';
3559
- if(data.stdout) html+='<div class="cell-stdout">'+escapeHtml(data.stdout)+'</div>';
 
 
 
 
 
3560
  console.log('UV Logs:', data);
3561
  if(data.stderr) {
3562
  // Split UV logs from regular stderr
@@ -3678,27 +3706,27 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3678
  }
3679
  }
3680
 
3681
- // Live reload functionality (robust SSE handling)
3682
- (function(){
3683
- if (!('EventSource' in window)) {
3684
- console.warn('SSE not supported in this browser');
3685
- return;
3686
- }
3687
- let source = new EventSource('/events');
3688
- let isOpen = false;
3689
- source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
3690
- source.onmessage = function(e){
3691
- const msg=(e.data||'').trim(); if(!msg) return;
3692
- console.log('SSE message:', msg);
3693
- if (msg==='reload' || msg==='incremental') { location.reload(); }
3694
- // Ignore 'loading' to avoid premature reload loops
3695
- };
3696
- source.onerror = function(e){
3697
- // Let EventSource auto-reconnect instead of forcing a reload
3698
- if (isOpen) console.warn('SSE error after open, retrying...', e);
3699
- };
3700
- window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
3701
- })();
3702
 
3703
 
3704
  document.addEventListener('DOMContentLoaded', function() {
@@ -3829,21 +3857,21 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3829
  <div class="system-info">
3830
  <div class="system-info-header">Generated on:</div>
3831
  <div class="system-info-content">
3832
- Linux x86_64 | Linux-5.10.244-240.965.amzn2.x86_64-x86_64-with-glibc2.35
3833
  </div>
3834
  </div>
3835
 
3836
  <div class="main-content">
3837
  <h1>SageAttention Implementation</h1>
3838
  <h2>SageAttention Benchmark (INT8 Quantized)</h2>
3839
- <div class="cell cell-failed" id="cell-benchmark">
3840
  <div class="cell-header">
3841
  <span class="collapse-indicators">
3842
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3843
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3844
- <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3845
  </span> |
3846
- Cell: benchmark | 0.05s | FAILED
3847
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3848
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3849
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3855,14 +3883,14 @@ Cell: benchmark | 0.05s | FAILED
3855
  <span class="c1"># requires-python = &quot;&gt;=3.10&quot;</span>
3856
  <span class="c1"># dependencies = [</span>
3857
  <span class="c1"># &quot;numpy&quot;,</span>
3858
- <span class="c1"># &quot;torch&quot;,</span>
3859
  <span class="c1"># &quot;kernels&quot;,</span>
3860
  <span class="c1"># &quot;kernels-benchmark-tools&quot;,</span>
3861
  <span class="c1"># &quot;sageattention&quot;,</span>
3862
  <span class="c1"># ]</span>
3863
  <span class="c1">#</span>
3864
  <span class="c1"># [tool.uv.sources]</span>
3865
- <span class="c1"># kernels-benchmark-tools = { path = &quot;/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools&quot;, editable = true }</span>
3866
  <span class="c1"># ///</span>
3867
  <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
3868
  <span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
@@ -3937,9 +3965,80 @@ Cell: benchmark | 0.05s | FAILED
3937
  </div>
3938
  </div>
3939
  <div id="output-benchmark" class="cell-output">
3940
- <div class="cell-stderr"> × Failed to resolve script requirement
3941
- ╰─▶ Distribution not found at:
3942
- file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3943
  </div>
3944
  </div>
3945
  </div>
 
706
  white-space: pre-wrap;
707
  color: var(--text-primary);
708
  }
709
+
710
+ .cell-stdout {
711
+ background: var(--bg-tertiary);
712
+ padding: 0.75rem;
713
+ border-radius: 1px;
714
+ font-family: inherit;
715
+ font-size: 0.9rem;
716
+ color: var(--text-primary);
717
+
718
+ /* key bits */
719
+ overflow: auto; /* show scrollbars when needed */
720
+ max-width: 100%; /* respects whatever layout width you give it */
721
+ }
722
+
723
+ .cell-stdout .stdout-text {
724
+ margin: 0; /* reset pre default margin */
725
+ white-space: pre; /* keep line breaks, NO wrapping */
726
+ display: inline-block; /* shrink-to-content */
727
+ min-width: max-content; /* allow very long lines to define intrinsic width */
728
+ font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace;
729
+ tab-size: 2;
730
+ }
731
+
732
  .cell-stderr {
733
  background: var(--bg-error);
734
  border-left: 2px solid var(--border-error);
 
3579
  if(output){
3580
  output.classList.remove('output-stale');
3581
  let html='';
3582
+ if (data.stdout) {
3583
+ html += '<div class="cell-stdout"><pre class="stdout-text">'
3584
+ + escapeHtml(data.stdout)
3585
+ + '</pre></div>';
3586
+ }
3587
+
3588
  console.log('UV Logs:', data);
3589
  if(data.stderr) {
3590
  // Split UV logs from regular stderr
 
3706
  }
3707
  }
3708
 
3709
+ // // Live reload functionality (robust SSE handling)
3710
+ // (function(){
3711
+ // if (!('EventSource' in window)) {
3712
+ // console.warn('SSE not supported in this browser');
3713
+ // return;
3714
+ // }
3715
+ // let source = new EventSource('/events');
3716
+ // let isOpen = false;
3717
+ // source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
3718
+ // source.onmessage = function(e){
3719
+ // const msg=(e.data||'').trim(); if(!msg) return;
3720
+ // console.log('SSE message:', msg);
3721
+ // if (msg==='reload' || msg==='incremental') { location.reload(); }
3722
+ // // Ignore 'loading' to avoid premature reload loops
3723
+ // };
3724
+ // source.onerror = function(e){
3725
+ // // Let EventSource auto-reconnect instead of forcing a reload
3726
+ // if (isOpen) console.warn('SSE error after open, retrying...', e);
3727
+ // };
3728
+ // window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
3729
+ // })();
3730
 
3731
 
3732
  document.addEventListener('DOMContentLoaded', function() {
 
3857
  <div class="system-info">
3858
  <div class="system-info-header">Generated on:</div>
3859
  <div class="system-info-content">
3860
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
3861
  </div>
3862
  </div>
3863
 
3864
  <div class="main-content">
3865
  <h1>SageAttention Implementation</h1>
3866
  <h2>SageAttention Benchmark (INT8 Quantized)</h2>
3867
+ <div class="cell" id="cell-benchmark">
3868
  <div class="cell-header">
3869
  <span class="collapse-indicators">
3870
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3871
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3872
+ <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3873
  </span> |
3874
+ Cell: benchmark | 34.80s
3875
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3877
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3883
  <span class="c1"># requires-python = &quot;&gt;=3.10&quot;</span>
3884
  <span class="c1"># dependencies = [</span>
3885
  <span class="c1"># &quot;numpy&quot;,</span>
3886
+ <span class="c1"># &quot;torch==2.8.0&quot;,</span>
3887
  <span class="c1"># &quot;kernels&quot;,</span>
3888
  <span class="c1"># &quot;kernels-benchmark-tools&quot;,</span>
3889
  <span class="c1"># &quot;sageattention&quot;,</span>
3890
  <span class="c1"># ]</span>
3891
  <span class="c1">#</span>
3892
  <span class="c1"># [tool.uv.sources]</span>
3893
+ <span class="c1"># kernels-benchmark-tools = { path = &quot;../../../../../tools&quot;, editable = true }</span>
3894
  <span class="c1"># ///</span>
3895
  <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
3896
  <span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
 
3965
  </div>
3966
  </div>
3967
  <div id="output-benchmark" class="cell-output">
3968
+ <div class="cell-stdout"><pre class="stdout-text">impl wl p50(ms) ok
3969
+ sage_int8_fp16 flux_L128 FAIL False
3970
+ Error: module &#x27;sage_attention_c88aae76123df82b&#x27; has no attribute &#x27;fwd&#x27;
3971
+ sage_int8_fp16 flux_L256 FAIL False
3972
+ Error: module &#x27;sage_attention_c88aae76123df82b&#x27; has no attribute &#x27;fwd&#x27;
3973
+ sage_int8_fp16 flux_L320 FAIL False
3974
+ Error: module &#x27;sage_attention_c88aae76123df82b&#x27; has no attribute &#x27;fwd&#x27;
3975
+ sage_int8_fp16 flux_L384 FAIL False
3976
+ Error: module &#x27;sage_attention_c88aae76123df82b&#x27; has no attribute &#x27;fwd&#x27;
3977
+ sage_int8_fp16 flux_L448 FAIL False
3978
+ Error: module &#x27;sage_attention_c88aae76123df82b&#x27; has no attribute &#x27;fwd&#x27;
3979
+ sage_int8_fp16 flux_L512 FAIL False
3980
+ Error: module &#x27;sage_attention_c88aae76123df82b&#x27; has no attribute &#x27;fwd&#x27;
3981
+ </pre></div>
3982
+ <div class="uv-install-logs" id="uv-logs-benchmark">
3983
+ <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
3984
+ <div class="uv-logs-content" style="display: none;">
3985
+ Building kernels-benchmark-tools @ file:///__w/kernels-benchmarks/kernels-benchmarks/tools
3986
+ Downloading nvidia-cusolver-cu12 (255.1MiB)
3987
+ Downloading nvidia-nvjitlink-cu12 (37.4MiB)
3988
+ Downloading sympy (6.0MiB)
3989
+ Downloading nvidia-cufile-cu12 (1.1MiB)
3990
+ Downloading nvidia-cublas-cu12 (566.8MiB)
3991
+ Downloading nvidia-cudnn-cu12 (674.0MiB)
3992
+ Downloading matplotlib (8.3MiB)
3993
+ Downloading nvidia-cufft-cu12 (184.2MiB)
3994
+ Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
3995
+ Downloading networkx (1.9MiB)
3996
+ Downloading nvidia-nccl-cu12 (307.4MiB)
3997
+ Downloading setuptools (1.1MiB)
3998
+ Downloading kiwisolver (1.4MiB)
3999
+ Downloading nvidia-cusparse-cu12 (274.9MiB)
4000
+ Downloading nvidia-curand-cu12 (60.7MiB)
4001
+ Downloading numpy (16.2MiB)
4002
+ Downloading nvidia-cusparselt-cu12 (273.9MiB)
4003
+ Downloading torch (846.9MiB)
4004
+ Downloading triton (148.3MiB)
4005
+ Downloading fonttools (4.7MiB)
4006
+ Downloading pillow (6.7MiB)
4007
+ Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
4008
+ Downloading hf-xet (3.0MiB)
4009
+ Downloading nvidia-cufile-cu12
4010
+ Downloading kiwisolver
4011
+ Downloading hf-xet
4012
+ Downloading setuptools
4013
+ Downloading networkx
4014
+ Downloading fonttools
4015
+ Downloading pillow
4016
+ Built kernels-benchmark-tools @ file:///__w/kernels-benchmarks/kernels-benchmarks/tools
4017
+ Downloading nvidia-cuda-cupti-cu12
4018
+ Downloading matplotlib
4019
+ Downloading numpy
4020
+ Downloading sympy
4021
+ Downloading nvidia-nvjitlink-cu12
4022
+ Downloading nvidia-curand-cu12
4023
+ Downloading nvidia-cuda-nvrtc-cu12
4024
+ Downloading triton
4025
+ Downloading nvidia-cufft-cu12
4026
+ Downloading nvidia-cusolver-cu12
4027
+ Downloading nvidia-cusparse-cu12
4028
+ Downloading nvidia-cusparselt-cu12
4029
+ Downloading nvidia-nccl-cu12
4030
+ Downloading nvidia-cublas-cu12
4031
+ Downloading nvidia-cudnn-cu12
4032
+ Downloading torch
4033
+ Installed 48 packages in 236ms
4034
+ </div>
4035
+ </div>
4036
+ <div class="cell-stderr">Fetching 11 files: 0%| | 0/11 [00:00&lt;?, ?it/s]
4037
+ Fetching 11 files: 73%|███████▎ | 8/11 [00:00&lt;00:00, 9.16it/s]
4038
+ Fetching 11 files: 100%|██████████| 11/11 [00:00&lt;00:00, 12.59it/s]</div>
4039
+ <div class="cell-artifacts">
4040
+ <h4>Artifacts:</h4>
4041
+ <a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>
4042
  </div>
4043
  </div>
4044
  </div>
flash_attn/impls/xformers.html CHANGED
@@ -706,6 +706,29 @@
706
  white-space: pre-wrap;
707
  color: var(--text-primary);
708
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
709
  .cell-stderr {
710
  background: var(--bg-error);
711
  border-left: 2px solid var(--border-error);
@@ -3556,7 +3579,12 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3556
  if(output){
3557
  output.classList.remove('output-stale');
3558
  let html='';
3559
- if(data.stdout) html+='<div class="cell-stdout">'+escapeHtml(data.stdout)+'</div>';
 
 
 
 
 
3560
  console.log('UV Logs:', data);
3561
  if(data.stderr) {
3562
  // Split UV logs from regular stderr
@@ -3678,27 +3706,27 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3678
  }
3679
  }
3680
 
3681
- // Live reload functionality (robust SSE handling)
3682
- (function(){
3683
- if (!('EventSource' in window)) {
3684
- console.warn('SSE not supported in this browser');
3685
- return;
3686
- }
3687
- let source = new EventSource('/events');
3688
- let isOpen = false;
3689
- source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
3690
- source.onmessage = function(e){
3691
- const msg=(e.data||'').trim(); if(!msg) return;
3692
- console.log('SSE message:', msg);
3693
- if (msg==='reload' || msg==='incremental') { location.reload(); }
3694
- // Ignore 'loading' to avoid premature reload loops
3695
- };
3696
- source.onerror = function(e){
3697
- // Let EventSource auto-reconnect instead of forcing a reload
3698
- if (isOpen) console.warn('SSE error after open, retrying...', e);
3699
- };
3700
- window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
3701
- })();
3702
 
3703
 
3704
  document.addEventListener('DOMContentLoaded', function() {
@@ -3829,21 +3857,21 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3829
  <div class="system-info">
3830
  <div class="system-info-header">Generated on:</div>
3831
  <div class="system-info-content">
3832
- Linux x86_64 | Linux-5.10.244-240.965.amzn2.x86_64-x86_64-with-glibc2.35
3833
  </div>
3834
  </div>
3835
 
3836
  <div class="main-content">
3837
  <h1>xFormers Memory Efficient Attention</h1>
3838
  <h2>xFormers Benchmark</h2>
3839
- <div class="cell cell-failed" id="cell-benchmark">
3840
  <div class="cell-header">
3841
  <span class="collapse-indicators">
3842
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3843
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3844
- <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3845
  </span> |
3846
- Cell: benchmark | 0.01s | FAILED
3847
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3848
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3849
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3855,13 +3883,13 @@ Cell: benchmark | 0.01s | FAILED
3855
  <span class="c1"># requires-python = &quot;&gt;=3.10&quot;</span>
3856
  <span class="c1"># dependencies = [</span>
3857
  <span class="c1"># &quot;numpy&quot;,</span>
3858
- <span class="c1"># &quot;torch&quot;,</span>
3859
  <span class="c1"># &quot;kernels-benchmark-tools&quot;,</span>
3860
  <span class="c1"># &quot;xformers&quot;,</span>
3861
  <span class="c1"># ]</span>
3862
  <span class="c1">#</span>
3863
  <span class="c1"># [tool.uv.sources]</span>
3864
- <span class="c1"># kernels-benchmark-tools = { path = &quot;/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools&quot;, editable = true }</span>
3865
  <span class="c1"># ///</span>
3866
  <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
3867
  <span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
@@ -3926,9 +3954,169 @@ Cell: benchmark | 0.01s | FAILED
3926
  </div>
3927
  </div>
3928
  <div id="output-benchmark" class="cell-output">
3929
- <div class="cell-stderr"> × Failed to resolve script requirement
3930
- ╰─▶ Distribution not found at:
3931
- file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3932
  </div>
3933
  </div>
3934
  </div>
 
706
  white-space: pre-wrap;
707
  color: var(--text-primary);
708
  }
709
+
710
+ .cell-stdout {
711
+ background: var(--bg-tertiary);
712
+ padding: 0.75rem;
713
+ border-radius: 1px;
714
+ font-family: inherit;
715
+ font-size: 0.9rem;
716
+ color: var(--text-primary);
717
+
718
+ /* key bits */
719
+ overflow: auto; /* show scrollbars when needed */
720
+ max-width: 100%; /* respects whatever layout width you give it */
721
+ }
722
+
723
+ .cell-stdout .stdout-text {
724
+ margin: 0; /* reset pre default margin */
725
+ white-space: pre; /* keep line breaks, NO wrapping */
726
+ display: inline-block; /* shrink-to-content */
727
+ min-width: max-content; /* allow very long lines to define intrinsic width */
728
+ font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace;
729
+ tab-size: 2;
730
+ }
731
+
732
  .cell-stderr {
733
  background: var(--bg-error);
734
  border-left: 2px solid var(--border-error);
 
3579
  if(output){
3580
  output.classList.remove('output-stale');
3581
  let html='';
3582
+ if (data.stdout) {
3583
+ html += '<div class="cell-stdout"><pre class="stdout-text">'
3584
+ + escapeHtml(data.stdout)
3585
+ + '</pre></div>';
3586
+ }
3587
+
3588
  console.log('UV Logs:', data);
3589
  if(data.stderr) {
3590
  // Split UV logs from regular stderr
 
3706
  }
3707
  }
3708
 
3709
+ // // Live reload functionality (robust SSE handling)
3710
+ // (function(){
3711
+ // if (!('EventSource' in window)) {
3712
+ // console.warn('SSE not supported in this browser');
3713
+ // return;
3714
+ // }
3715
+ // let source = new EventSource('/events');
3716
+ // let isOpen = false;
3717
+ // source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
3718
+ // source.onmessage = function(e){
3719
+ // const msg=(e.data||'').trim(); if(!msg) return;
3720
+ // console.log('SSE message:', msg);
3721
+ // if (msg==='reload' || msg==='incremental') { location.reload(); }
3722
+ // // Ignore 'loading' to avoid premature reload loops
3723
+ // };
3724
+ // source.onerror = function(e){
3725
+ // // Let EventSource auto-reconnect instead of forcing a reload
3726
+ // if (isOpen) console.warn('SSE error after open, retrying...', e);
3727
+ // };
3728
+ // window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
3729
+ // })();
3730
 
3731
 
3732
  document.addEventListener('DOMContentLoaded', function() {
 
3857
  <div class="system-info">
3858
  <div class="system-info-header">Generated on:</div>
3859
  <div class="system-info-content">
3860
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
3861
  </div>
3862
  </div>
3863
 
3864
  <div class="main-content">
3865
  <h1>xFormers Memory Efficient Attention</h1>
3866
  <h2>xFormers Benchmark</h2>
3867
+ <div class="cell" id="cell-benchmark">
3868
  <div class="cell-header">
3869
  <span class="collapse-indicators">
3870
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3871
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3872
+ <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3873
  </span> |
3874
+ Cell: benchmark | 4.83s
3875
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3877
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3883
  <span class="c1"># requires-python = &quot;&gt;=3.10&quot;</span>
3884
  <span class="c1"># dependencies = [</span>
3885
  <span class="c1"># &quot;numpy&quot;,</span>
3886
+ <span class="c1"># &quot;torch==2.8.0&quot;,</span>
3887
  <span class="c1"># &quot;kernels-benchmark-tools&quot;,</span>
3888
  <span class="c1"># &quot;xformers&quot;,</span>
3889
  <span class="c1"># ]</span>
3890
  <span class="c1">#</span>
3891
  <span class="c1"># [tool.uv.sources]</span>
3892
+ <span class="c1"># kernels-benchmark-tools = { path = &quot;../../../../../tools&quot;, editable = true }</span>
3893
  <span class="c1"># ///</span>
3894
  <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
3895
  <span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
 
3954
  </div>
3955
  </div>
3956
  <div id="output-benchmark" class="cell-output">
3957
+ <div class="cell-stdout"><pre class="stdout-text">
3958
+ ======================================================================
3959
+ PROFILE TRACE: xformers_meff | flux_L128
3960
+ ======================================================================
3961
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3962
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3963
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3964
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 506.718us 193.09% 506.718us 506.718us 1
3965
+ xformers_meff 20.33% 479.463us 99.70% 2.351ms 2.351ms 0.000us 0.00% 351.872us 351.872us 1
3966
+ xformers_flash3::flash_fwd 8.78% 206.960us 77.92% 1.837ms 612.487us 0.000us 0.00% 351.872us 117.291us 3
3967
+ flash_attn_3::fwd 3.33% 78.433us 69.14% 1.631ms 543.500us 262.432us 100.00% 351.872us 117.291us 3
3968
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 262.432us 100.00% 262.432us 87.477us 3
3969
+ Activity Buffer Request 61.85% 1.459ms 61.85% 1.459ms 1.459ms 89.440us 34.08% 89.440us 89.440us 1
3970
+ aten::empty 1.44% 34.032us 1.44% 34.032us 5.672us 0.000us 0.00% 0.000us 0.000us 6
3971
+ cudaFuncSetAttribute 0.62% 14.682us 0.62% 14.682us 4.894us 0.000us 0.00% 0.000us 0.000us 3
3972
+ cudaLaunchKernel 1.89% 44.672us 1.89% 44.672us 14.891us 0.000us 0.00% 0.000us 0.000us 3
3973
+ aten::reshape 0.50% 11.821us 1.45% 34.232us 5.705us 0.000us 0.00% 0.000us 0.000us 6
3974
+ aten::view 0.95% 22.411us 0.95% 22.411us 3.735us 0.000us 0.00% 0.000us 0.000us 6
3975
+ cudaDeviceSynchronize 0.30% 7.110us 0.30% 7.110us 7.110us 0.000us 0.00% 0.000us 0.000us 1
3976
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3977
+ Self CPU time total: 2.358ms
3978
+ Self CUDA time total: 262.432us
3979
+
3980
+
3981
+
3982
+ ======================================================================
3983
+ PROFILE TRACE: xformers_meff | flux_L256
3984
+ ======================================================================
3985
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3986
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3987
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3988
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 457.756us 155.59% 457.756us 457.756us 1
3989
+ xformers_meff 14.84% 310.507us 99.07% 2.072ms 2.072ms 0.000us 0.00% 391.132us 391.132us 1
3990
+ xformers_flash3::flash_fwd 7.41% 154.907us 83.06% 1.737ms 579.115us 0.000us 0.00% 391.132us 130.377us 3
3991
+ flash_attn_3::fwd 2.73% 57.112us 75.65% 1.582ms 527.479us 294.205us 100.00% 391.132us 130.377us 3
3992
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 294.205us 100.00% 294.205us 98.068us 3
3993
+ Activity Buffer Request 69.53% 1.454ms 69.53% 1.454ms 1.454ms 96.927us 32.95% 96.927us 96.927us 1
3994
+ aten::empty 1.38% 28.932us 1.38% 28.932us 4.822us 0.000us 0.00% 0.000us 0.000us 6
3995
+ cudaFuncSetAttribute 0.38% 7.960us 0.38% 7.960us 2.653us 0.000us 0.00% 0.000us 0.000us 3
3996
+ cudaLaunchKernel 1.63% 34.022us 1.63% 34.022us 11.341us 0.000us 0.00% 0.000us 0.000us 3
3997
+ aten::reshape 0.48% 10.060us 1.17% 24.410us 4.068us 0.000us 0.00% 0.000us 0.000us 6
3998
+ aten::view 0.69% 14.350us 0.69% 14.350us 2.392us 0.000us 0.00% 0.000us 0.000us 6
3999
+ cudaDeviceSynchronize 0.93% 19.421us 0.93% 19.421us 19.421us 0.000us 0.00% 0.000us 0.000us 1
4000
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4001
+ Self CPU time total: 2.092ms
4002
+ Self CUDA time total: 294.205us
4003
+
4004
+
4005
+
4006
+ ======================================================================
4007
+ PROFILE TRACE: xformers_meff | flux_L320
4008
+ ======================================================================
4009
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4010
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4011
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4012
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 455.327us 140.30% 455.327us 455.327us 1
4013
+ xformers_meff 14.70% 303.895us 98.43% 2.034ms 2.034ms 0.000us 0.00% 429.791us 429.791us 1
4014
+ xformers_flash3::flash_fwd 7.05% 145.707us 82.60% 1.707ms 568.998us 0.000us 0.00% 429.791us 143.264us 3
4015
+ flash_attn_3::fwd 2.62% 54.152us 75.55% 1.561ms 520.429us 324.543us 100.00% 429.791us 143.264us 3
4016
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 324.543us 100.00% 324.543us 108.181us 3
4017
+ Activity Buffer Request 69.54% 1.437ms 69.54% 1.437ms 1.437ms 105.248us 32.43% 105.248us 105.248us 1
4018
+ aten::empty 1.47% 30.342us 1.47% 30.342us 5.057us 0.000us 0.00% 0.000us 0.000us 6
4019
+ cudaFuncSetAttribute 0.27% 5.580us 0.27% 5.580us 1.860us 0.000us 0.00% 0.000us 0.000us 3
4020
+ cudaLaunchKernel 1.65% 34.132us 1.65% 34.132us 11.377us 0.000us 0.00% 0.000us 0.000us 3
4021
+ aten::reshape 0.42% 8.741us 1.13% 23.401us 3.900us 0.000us 0.00% 0.000us 0.000us 6
4022
+ aten::view 0.71% 14.660us 0.71% 14.660us 2.443us 0.000us 0.00% 0.000us 0.000us 6
4023
+ cudaDeviceSynchronize 1.57% 32.391us 1.57% 32.391us 32.391us 0.000us 0.00% 0.000us 0.000us 1
4024
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4025
+ Self CPU time total: 2.067ms
4026
+ Self CUDA time total: 324.543us
4027
+
4028
+
4029
+
4030
+ ======================================================================
4031
+ PROFILE TRACE: xformers_meff | flux_L384
4032
+ ======================================================================
4033
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4034
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4035
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4036
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 460.189us 141.34% 460.189us 460.189us 1
4037
+ xformers_meff 13.29% 304.067us 98.75% 2.259ms 2.259ms 0.000us 0.00% 433.468us 433.468us 1
4038
+ xformers_flash3::flash_fwd 6.63% 151.806us 84.43% 1.932ms 643.925us 0.000us 0.00% 433.468us 144.489us 3
4039
+ flash_attn_3::fwd 2.38% 54.492us 77.79% 1.780ms 593.323us 325.597us 100.00% 433.468us 144.489us 3
4040
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 325.597us 100.00% 325.597us 108.532us 3
4041
+ Activity Buffer Request 63.32% 1.449ms 63.32% 1.449ms 1.449ms 107.871us 33.13% 107.871us 107.871us 1
4042
+ aten::empty 1.26% 28.813us 1.26% 28.813us 4.802us 0.000us 0.00% 0.000us 0.000us 6
4043
+ cudaFuncSetAttribute 0.27% 6.140us 0.27% 6.140us 2.047us 0.000us 0.00% 0.000us 0.000us 3
4044
+ cudaLaunchKernel 10.56% 241.573us 10.56% 241.573us 80.524us 0.000us 0.00% 0.000us 0.000us 3
4045
+ aten::reshape 0.41% 9.348us 1.03% 23.589us 3.931us 0.000us 0.00% 0.000us 0.000us 6
4046
+ aten::view 0.62% 14.241us 0.62% 14.241us 2.374us 0.000us 0.00% 0.000us 0.000us 6
4047
+ cudaDeviceSynchronize 1.25% 28.691us 1.25% 28.691us 28.691us 0.000us 0.00% 0.000us 0.000us 1
4048
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4049
+ Self CPU time total: 2.288ms
4050
+ Self CUDA time total: 325.597us
4051
+
4052
+
4053
+
4054
+ ======================================================================
4055
+ PROFILE TRACE: xformers_meff | flux_L448
4056
+ ======================================================================
4057
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4058
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4059
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4060
+ xformers_meff 14.32% 335.208us 96.41% 2.256ms 2.256ms 0.000us 0.00% 650.207us 650.207us 1
4061
+ xformers_flash3::flash_fwd 6.57% 153.746us 81.05% 1.897ms 632.294us 0.000us 0.00% 650.207us 216.736us 3
4062
+ flash_attn_3::fwd 2.39% 56.024us 74.48% 1.743ms 581.045us 487.359us 100.00% 650.207us 216.736us 3
4063
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 545.022us 111.83% 545.022us 545.022us 1
4064
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 487.359us 100.00% 487.359us 162.453us 3
4065
+ Activity Buffer Request 62.65% 1.466ms 62.65% 1.466ms 1.466ms 162.848us 33.41% 162.848us 162.848us 1
4066
+ aten::empty 1.29% 30.110us 1.29% 30.110us 5.018us 0.000us 0.00% 0.000us 0.000us 6
4067
+ cudaFuncSetAttribute 0.25% 5.800us 0.25% 5.800us 1.933us 0.000us 0.00% 0.000us 0.000us 3
4068
+ cudaLaunchKernel 7.91% 185.030us 7.91% 185.030us 61.677us 0.000us 0.00% 0.000us 0.000us 3
4069
+ aten::reshape 0.42% 9.770us 1.04% 24.390us 4.065us 0.000us 0.00% 0.000us 0.000us 6
4070
+ aten::view 0.62% 14.620us 0.62% 14.620us 2.437us 0.000us 0.00% 0.000us 0.000us 6
4071
+ cudaDeviceSynchronize 3.59% 83.934us 3.59% 83.934us 83.934us 0.000us 0.00% 0.000us 0.000us 1
4072
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4073
+ Self CPU time total: 2.340ms
4074
+ Self CUDA time total: 487.359us
4075
+
4076
+
4077
+
4078
+ ======================================================================
4079
+ PROFILE TRACE: xformers_meff | flux_L512
4080
+ ======================================================================
4081
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4082
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4083
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4084
+ xformers_meff 13.07% 298.846us 95.47% 2.183ms 2.183ms 0.000us 0.00% 676.610us 676.610us 1
4085
+ xformers_flash3::flash_fwd 6.50% 148.626us 81.42% 1.862ms 620.693us 0.000us 0.00% 676.610us 225.537us 3
4086
+ flash_attn_3::fwd 2.33% 53.191us 74.93% 1.713ms 571.151us 505.889us 100.00% 676.610us 225.537us 3
4087
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 520.769us 102.94% 520.769us 520.769us 1
4088
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 505.889us 100.00% 505.889us 168.630us 3
4089
+ Activity Buffer Request 63.62% 1.455ms 63.62% 1.455ms 1.455ms 170.721us 33.75% 170.721us 170.721us 1
4090
+ aten::empty 1.23% 28.092us 1.23% 28.092us 4.682us 0.000us 0.00% 0.000us 0.000us 6
4091
+ cudaFuncSetAttribute 0.25% 5.790us 0.25% 5.790us 1.930us 0.000us 0.00% 0.000us 0.000us 3
4092
+ cudaLaunchKernel 7.50% 171.540us 7.50% 171.540us 57.180us 0.000us 0.00% 0.000us 0.000us 3
4093
+ aten::reshape 0.38% 8.590us 0.98% 22.470us 3.745us 0.000us 0.00% 0.000us 0.000us 6
4094
+ aten::view 0.61% 13.880us 0.61% 13.880us 2.313us 0.000us 0.00% 0.000us 0.000us 6
4095
+ cudaDeviceSynchronize 4.53% 103.496us 4.53% 103.496us 103.496us 0.000us 0.00% 0.000us 0.000us 1
4096
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4097
+ Self CPU time total: 2.287ms
4098
+ Self CUDA time total: 505.889us
4099
+
4100
+
4101
+ impl wl p50(ms) ok
4102
+ xformers_meff flux_L128 0.20 True
4103
+ xformers_meff flux_L256 0.21 True
4104
+ xformers_meff flux_L320 0.22 True
4105
+ xformers_meff flux_L384 0.22 True
4106
+ xformers_meff flux_L448 0.28 True
4107
+ xformers_meff flux_L512 0.27 True
4108
+ </pre></div>
4109
+ <div class="uv-install-logs" id="uv-logs-benchmark">
4110
+ <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4111
+ <div class="uv-logs-content" style="display: none;">
4112
+ Downloading xformers (111.8MiB)
4113
+ Downloading xformers
4114
+ Installed 1 package in 14ms
4115
+ </div>
4116
+ </div>
4117
+ <div class="cell-artifacts">
4118
+ <h4>Artifacts:</h4>
4119
+ <a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>
4120
  </div>
4121
  </div>
4122
  </div>
flash_attn/results/artifacts/combine/latency.svg CHANGED

Git LFS Details

  • SHA256: af1280c87fa60ce034a98afb4f52eca9686cbe35c7ed7a9fc31248f6d6c05ea2
  • Pointer size: 130 Bytes
  • Size of remote file: 29.8 kB

Git LFS Details

  • SHA256: dcf3186873eba7261121e895010b1119e477b4cfef20b846ea699f8779951f5d
  • Pointer size: 130 Bytes
  • Size of remote file: 27 kB
flash_attn/results/cells/combine.py CHANGED
@@ -1,319 +1,69 @@
1
  # /// script
2
  # requires-python = ">=3.10"
3
- # dependencies = [
4
- # "numpy",
5
- # "torch",
6
- # "kernels-benchmark-tools",
7
- # "matplotlib",
8
- # ]
9
- #
10
  # [tool.uv.sources]
11
- # kernels-benchmark-tools = { git = "https://github.com/drbh/kernels-benchmark-tools.git", branch = "main" }
12
  # ///
13
- import os
14
- import sys
15
- from pathlib import Path
16
- import json
17
- import torch # noqa: F401 # imported because upstream may expect torch to be importable
18
- import kernels_benchmark_tools as kbt
19
-
20
- # --- Matplotlib setup and helpers ------------------------------------------------
21
- import matplotlib as mpl
22
- import matplotlib.pyplot as plt
23
- import csv
24
-
25
-
26
- # Keep text as text (not paths) so CSS can style fonts, size, etc.
27
- mpl.rcParams["svg.fonttype"] = "none"
28
- # Make ids deterministic across builds
29
- mpl.rcParams["svg.hashsalt"] = "latency-benchmark-combined"
30
- # Avoid auto-closed figures interfering with our tagging
31
- mpl.rcParams["figure.autolayout"] = True
32
- # Make background transparent
33
- mpl.rcParams["figure.facecolor"] = "none"
34
- mpl.rcParams["axes.facecolor"] = "none"
35
- mpl.rcParams["savefig.facecolor"] = "none"
36
- mpl.rcParams["savefig.edgecolor"] = "none"
37
-
38
- def _slugify(s: str) -> str:
39
- s = (s or "").strip().lower()
40
- keep = []
41
- for ch in s:
42
- if ch.isalnum():
43
- keep.append(ch)
44
- elif ch in (" ", "-", "_", "/", ".", ":"):
45
- keep.append("-")
46
- else:
47
- keep.append("")
48
- out = "".join(keep)
49
- while "--" in out:
50
- out = out.replace("--", "-")
51
- return out.strip("-") or "unnamed"
52
-
53
- def _tag_current_figure(default_series_prefix="series"):
54
- """Attach SVG ids (gid) to key artists so they can be targeted from CSS."""
55
- fig = plt.gcf()
56
- if fig is None:
57
- return
58
-
59
- # Tag the figure itself
60
- fig.set_gid("figure--latency")
61
-
62
- for ax_idx, ax in enumerate(fig.get_axes(), start=1):
63
- ax.set_gid(f"axes--{ax_idx}")
64
-
65
- # Axis labels & title
66
- if ax.get_title():
67
- for t in ax.texts:
68
- if t.get_text() == ax.get_title():
69
- t.set_gid("title--main")
70
- if ax.xaxis and ax.xaxis.get_label():
71
- ax.xaxis.label.set_gid("label--x")
72
- if ax.yaxis and ax.yaxis.get_label():
73
- ax.yaxis.label.set_gid("label--y")
74
-
75
- # Gridlines
76
- for i, gl in enumerate(ax.get_xgridlines(), start=1):
77
- gl.set_gid(f"grid-x--{i}")
78
- for i, gl in enumerate(ax.get_ygridlines(), start=1):
79
- gl.set_gid(f"grid-y--{i}")
80
-
81
- # Legend block & entries
82
- leg = ax.get_legend()
83
- if leg is not None:
84
- leg.set_gid("legend")
85
- for i, txt in enumerate(leg.get_texts(), start=1):
86
- label_slug = _slugify(txt.get_text())
87
- txt.set_gid(f"legend-label--{label_slug or i}")
88
-
89
- # Series (lines, patches)
90
- # Lines
91
- line_seen = {}
92
- for ln in getattr(ax, "lines", []):
93
- raw_label = ln.get_label() or ""
94
- # Matplotlib uses labels beginning with "_" for non-legendable items
95
- label = raw_label if not raw_label.startswith("_") else f"{default_series_prefix}"
96
- slug = _slugify(label)
97
- line_seen[slug] = line_seen.get(slug, 0) + 1
98
- suffix = "" if line_seen[slug] == 1 else f"-{line_seen[slug]}"
99
- ln.set_gid(f"series--{slug}{suffix}")
100
-
101
- # Patches (bars, areas)
102
- patch_seen = {}
103
- for pt in getattr(ax, "patches", []):
104
- label = getattr(pt, "get_label", lambda: "")() or f"{default_series_prefix}"
105
- if isinstance(label, str) and label.startswith("_"):
106
- label = default_series_prefix
107
- slug = _slugify(label)
108
- patch_seen[slug] = patch_seen.get(slug, 0) + 1
109
- suffix = "" if patch_seen[slug] == 1 else f"-{patch_seen[slug]}"
110
- pt.set_gid(f"series--{slug}{suffix}")
111
-
112
- def _postprocess_svg_add_classes(svg_path: Path):
113
- """Add convenient CSS classes alongside ids (e.g., class='series grid grid-x')."""
114
- try:
115
- import xml.etree.ElementTree as ET
116
- ET.register_namespace("", "http://www.w3.org/2000/svg")
117
- tree = ET.parse(svg_path)
118
- root = tree.getroot()
119
- for el in root.iter():
120
- el_id = el.attrib.get("id", "")
121
- if not el_id:
122
- continue
123
- cls = []
124
- if el_id.startswith("figure--"):
125
- cls.append("figure")
126
- elif el_id.startswith("axes--"):
127
- cls.append("axes")
128
- elif el_id.startswith("grid-x--"):
129
- cls += ["grid", "grid-x"]
130
- elif el_id.startswith("grid-y--"):
131
- cls += ["grid", "grid-y"]
132
- elif el_id.startswith("legend"):
133
- cls.append("legend")
134
- elif el_id.startswith("label--x"):
135
- cls.append("xlabel")
136
- elif el_id.startswith("label--y"):
137
- cls.append("ylabel")
138
- elif el_id.startswith("title--"):
139
- cls.append("title")
140
- elif el_id.startswith("series--"):
141
- cls.append("series")
142
- if cls:
143
- # Preserve any existing class (unlikely from Matplotlib)
144
- existing = el.attrib.get("class", "")
145
- el.set("class", (existing + " " + " ".join(cls)).strip())
146
- tree.write(svg_path, encoding="utf-8", xml_declaration=True)
147
- except Exception as e:
148
- print(f"✗ SVG postprocess (classes) skipped: {e}")
149
-
150
- # Monkey-patch savefig to force SVG & ensure tagging occurs even if kbt.viz saves internally.
151
- _orig_savefig = plt.savefig
152
- def _savefig_svg(fname, *args, **kwargs):
153
- # Always save as SVG at a stable path for the artifact system
154
- out = Path("latency.svg")
155
- kwargs["format"] = "svg"
156
- # Ensure everything we care about has ids before export
157
- _tag_current_figure()
158
- res = _orig_savefig(out, *args, **kwargs)
159
- # Add helpful CSS classes on top of ids
160
- _postprocess_svg_add_classes(out)
161
- print(f"✓ Combined visualization saved as {out}")
162
- return res
163
-
164
- plt.savefig = _savefig_svg # apply patch
165
-
166
- # Capture close calls in case kbt.viz() closes figures before we re-save
167
- _orig_close = plt.close
168
- _last_closed = {"fig": None}
169
- def _capture_close(arg=None):
170
- try:
171
- if hasattr(arg, "savefig"): # looks like a Figure
172
- _last_closed["fig"] = arg
173
- else:
174
- _last_closed["fig"] = plt.gcf()
175
- finally:
176
- return _orig_close(arg)
177
- plt.close = _capture_close
178
-
179
- # --- Locate benchmark artifacts --------------------------------------------------
180
- cache_dirs = {
181
- "Flash (PyTorch SDPA)": os.environ.get('UVNOTE_FILE_FLASH_ATTENTION_BENCHMARK'),
182
- "MemEff (PyTorch SDPA)": os.environ.get('UVNOTE_FILE_MEM_EFFICIENT_ATTENTION_BENCHMARK'),
183
- "Flash Attn 2": os.environ.get('UVNOTE_FILE_FLASH_ATTN2_BENCHMARK'),
184
- "xFormers": os.environ.get('UVNOTE_FILE_XFORMERS_BENCHMARK'),
185
- "SageAttention": os.environ.get('UVNOTE_FILE_SAGE_ATTENTION_BENCHMARK'),
186
- "Compiled (default)": os.environ.get('UVNOTE_FILE_COMPILED_VARIANTS_BENCHMARK_DEFAULT'),
187
- "Compiled (max-autotune)": os.environ.get('UVNOTE_FILE_COMPILED_VARIANTS_BENCHMARK_MAX_AUTOTUNE'),
188
- "HF Kernels Flash Attn": os.environ.get('UVNOTE_FILE_HF_KERNELS_FLASH_ATTN_BENCHMARK'),
189
- "HF Kernels Flash Attn3": os.environ.get('UVNOTE_FILE_HF_KERNELS_FLASH_ATTN3_BENCHMARK'),
190
  }
191
 
192
- print("LOADING BENCHMARK DATA")
193
- for name, cache_dir in cache_dirs.items():
194
- print(f"{name:30s}: {cache_dir}")
195
- print()
196
 
197
  file_mapping = {
198
- "Flash (PyTorch SDPA)": "attn.jsonl",
199
- "MemEff (PyTorch SDPA)": "attn.jsonl",
200
- "Flash Attn 2": "attn.jsonl",
201
- "xFormers": "attn.jsonl",
202
- "SageAttention": "attn.jsonl",
203
  "Compiled (default)": "attn_default.jsonl",
204
  "Compiled (max-autotune)": "attn_max_autotune.jsonl",
205
- "HF Kernels Flash Attn": "attn.jsonl",
206
- "HF Kernels Flash Attn3": "attn.jsonl",
207
  }
208
 
 
209
  all_paths = []
210
- for name, cache_dir in cache_dirs.items():
 
211
  if cache_dir:
212
- path = Path(cache_dir) / file_mapping[name]
 
213
  if path.exists() and path.stat().st_size > 0:
214
  all_paths.append(str(path))
215
  print(f"✓ Found {name}: {path}")
216
  else:
217
- print(f"⊘ Empty/Missing {name}: {path}")
218
  else:
219
- print(f"✗ No cache dir for {name}")
220
- print()
221
 
222
  if not all_paths:
223
  print("ERROR: No benchmark data files found!")
224
- # restore patched functions before exiting
225
- plt.savefig = _orig_savefig
226
- plt.close = _orig_close
227
  sys.exit(1)
228
 
229
- # --- Summary + Visualization -----------------------------------------------------
230
- print("COMBINED BENCHMARK SUMMARY\n")
231
- kbt.summarize(all_paths)
232
- print("\nGENERATING COMBINED VISUALIZATION\n")
 
 
233
 
234
  try:
235
- # If kbt.viz saves internally, our patched savefig ensures SVG gets written,
236
- # and it will carry ids/classes for CSS styling.
237
- kbt.viz(all_paths)
238
- # Safety net: if kbt.viz didn't save, save now.
239
- # if not Path("latency.svg").exists():
240
- # _tag_current_figure()
241
- # plt.savefig("latency.svg")
242
 
243
- plt.savefig("latency.svg") # ensure saved with tagging
 
244
 
245
- print("✓ SVG visualization ready: latency.svg!")
246
- except ImportError as e:
247
- print(f" Visualization requires matplotlib: {e}")
248
- except Exception as e:
249
- print(f"✗ Visualization failed: {e}")
250
  finally:
251
- # Clean up patches to avoid side effects in later cells
252
  plt.savefig = _orig_savefig
253
- plt.close = _orig_close
254
-
255
- print()
256
- print("ANALYSIS COMPLETE")
257
- print(f"Total implementations analyzed: {len(all_paths)}")
258
- print(f"\nImplementations included:")
259
- for name, cache_dir in cache_dirs.items():
260
- if cache_dir:
261
- path = Path(cache_dir) / file_mapping[name]
262
- if path.exists() and path.stat().st_size > 0:
263
- print(f" ✓ {name}")
264
-
265
-
266
-
267
- # Collect all benchmark data and export to CSV
268
- all_data = {}
269
- for name, cache_dir in cache_dirs.items():
270
- if cache_dir:
271
- path = Path(cache_dir) / file_mapping[name]
272
- if path.exists() and path.stat().st_size > 0:
273
- with open(path, 'r') as f:
274
- records = [json.loads(line) for line in f]
275
- all_data[name] = records
276
-
277
- # Export to CSV
278
- csv_path = Path("latency.csv")
279
- with open(csv_path, 'w', newline='') as csvfile:
280
- writer = csv.writer(csvfile)
281
-
282
- # Write header
283
- header = ["Implementation", "Impl ID", "Workload", "Batch", "Seq Length", "Heads", "Head Dim", "Dtype",
284
- "Mean (ms)", "P10 (ms)", "P50 (ms)", "P90 (ms)", "Reps",
285
- # "Compile (ms)",
286
- "Peak Mem (MB)", "Backend", "Family"]
287
- writer.writerow(header)
288
-
289
- # Write data rows
290
- for impl_name, records in all_data.items():
291
- for record in records:
292
- wl = record.get('wl', {})
293
- lat = record.get('lat_ms', {})
294
- tags = record.get('tags', {})
295
-
296
- row = [
297
- impl_name,
298
- record.get('impl', ''),
299
- wl.get('name', ''),
300
- wl.get('batch', ''),
301
- wl.get('seq_len', ''),
302
- wl.get('heads', ''),
303
- wl.get('head_dim', ''),
304
- wl.get('dtype', ''),
305
- lat.get('mean', ''),
306
- lat.get('p10', ''),
307
- lat.get('p50', ''),
308
- lat.get('p90', ''),
309
- lat.get('reps', ''),
310
- # record.get('compile_ms', ''),
311
- round(record.get('peak_bytes', 0) / 1024 / 1024, 2) if record.get('peak_bytes') else '',
312
- tags.get('backend', ''),
313
- tags.get('family', ''),
314
- ]
315
- writer.writerow(row)
316
-
317
- print(f"✓ CSV export complete: {csv_path}")
318
- print(f"Total implementations: {len(all_data)}")
319
- print(f"Total records: {sum(len(records) for records in all_data.values())}")
 
1
  # /// script
2
  # requires-python = ">=3.10"
3
+ # dependencies = ["torch", "kernels-benchmark-tools", "matplotlib"]
 
 
 
 
 
 
4
  # [tool.uv.sources]
5
+ # kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
6
  # ///
7
+ from kernels_benchmark_tools.core.visuals import generate_combined_results
8
+
9
+ # Note: Flash attention has multiple implementations with different output files
10
+ # Some use attn.jsonl, compiled variants use attn_default.jsonl and attn_max_autotune.jsonl
11
+ cache_env_map = {
12
+ "Flash (PyTorch SDPA)": "UVNOTE_FILE_FLASH_ATTENTION_BENCHMARK",
13
+ "MemEff (PyTorch SDPA)": "UVNOTE_FILE_MEM_EFFICIENT_ATTENTION_BENCHMARK",
14
+ "xFormers": "UVNOTE_FILE_XFORMERS_BENCHMARK",
15
+ "Compiled (default)": "UVNOTE_FILE_COMPILED_VARIANTS_BENCHMARK_DEFAULT",
16
+ "Compiled (max-autotune)": "UVNOTE_FILE_COMPILED_VARIANTS_BENCHMARK_MAX_AUTOTUNE",
17
+ "HF Kernels Flash Attn": "UVNOTE_FILE_HF_KERNELS_FLASH_ATTN_BENCHMARK",
18
+ "HF Kernels Flash Attn3": "UVNOTE_FILE_HF_KERNELS_FLASH_ATTN3_BENCHMARK",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  }
20
 
21
+ # For flash attention, we need custom file mapping
22
+ import os
23
+ from pathlib import Path
 
24
 
25
  file_mapping = {
 
 
 
 
 
26
  "Compiled (default)": "attn_default.jsonl",
27
  "Compiled (max-autotune)": "attn_max_autotune.jsonl",
 
 
28
  }
29
 
30
+ # Collect paths with custom file names for compiled variants
31
  all_paths = []
32
+ for name, env_var in cache_env_map.items():
33
+ cache_dir = os.environ.get(env_var)
34
  if cache_dir:
35
+ filename = file_mapping.get(name, "attn.jsonl")
36
+ path = Path(cache_dir) / filename
37
  if path.exists() and path.stat().st_size > 0:
38
  all_paths.append(str(path))
39
  print(f"✓ Found {name}: {path}")
40
  else:
41
+ print(f"⊘ Skipped {name}: {path}")
42
  else:
43
+ print(f"✗ Missing {name}")
 
44
 
45
  if not all_paths:
46
  print("ERROR: No benchmark data files found!")
47
+ import sys
 
 
48
  sys.exit(1)
49
 
50
+ # Use the simplified visualization
51
+ from kernels_benchmark_tools.core import tools
52
+ from kernels_benchmark_tools.core.visuals import setup_svg_matplotlib, create_svg_with_tagging
53
+
54
+ setup_svg_matplotlib()
55
+ _orig_savefig, _orig_close = create_svg_with_tagging("latency.svg", "flash-attention")
56
 
57
  try:
58
+ print("\nCOMBINED BENCHMARK SUMMARY\n")
59
+ tools.summarize(all_paths)
 
 
 
 
 
60
 
61
+ print("\nGENERATING COMBINED VISUALIZATION\n")
62
+ tools.viz(all_paths)
63
 
64
+ import matplotlib.pyplot as plt
65
+ plt.savefig("latency.svg")
66
+ print(" SVG visualization ready!")
 
 
67
  finally:
 
68
  plt.savefig = _orig_savefig
69
+ plt.close = _orig_close
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
flash_attn/results/combined_results.html CHANGED
The diff for this file is too large to render. See raw diff
 
layer_norm/impls/artifacts/benchmark/ln.jsonl ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {"ts": "2025-10-23T17:21:54Z", "run": "f94366cee8c44bb99e36bc03ca51ad49", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "llama_S512_D4096", "batch": 1, "seq_len": 512, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.039122000089264475, "p50": 0.04020200003651553, "p90": 0.04062199991494708, "mean": 0.040302199977304554, "iqr": 0.00047999992602854036, "raw_times": [0.04020200003651553, 0.04142299985687714, 0.04062199991494708, 0.04014199998891854, 0.039122000089264475], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.049882999974215636, "peak_bytes": 37765120, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015716552734375, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null}
2
+ {"ts": "2025-10-23T17:21:54Z", "run": "f94366cee8c44bb99e36bc03ca51ad49", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "llama_S512_D8192", "batch": 1, "seq_len": 512, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.03869200008921325, "p50": 0.039361000062854146, "p90": 0.03952199995183037, "mean": 0.039353600004687905, "iqr": 0.0002899998889915878, "raw_times": [0.03923200006283878, 0.03996099985670298, 0.03952199995183037, 0.039361000062854146, 0.03869200008921325], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04323200005273975, "peak_bytes": 75530240, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00157928466796875, "mse": 1.150369644165039e-05, "ref": "layer_norm_fp32"}, "err": null}
3
+ {"ts": "2025-10-23T17:21:54Z", "run": "f94366cee8c44bb99e36bc03ca51ad49", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "llama_S1024_D4096", "batch": 1, "seq_len": 1024, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.038322000136759016, "p50": 0.039080999840734876, "p90": 0.03983200008406129, "mean": 0.03918759998668975, "iqr": 0.0012000000424450263, "raw_times": [0.038322000136759016, 0.038632000041616266, 0.03983200008406129, 0.039080999840734876, 0.04007099983027729], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04126199996790092, "peak_bytes": 75513856, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0016021728515625, "mse": 1.1682510375976562e-05, "ref": "layer_norm_fp32"}, "err": null}
4
+ {"ts": "2025-10-23T17:21:54Z", "run": "f94366cee8c44bb99e36bc03ca51ad49", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "llama_S1024_D8192", "batch": 1, "seq_len": 1024, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.038531999962287955, "p50": 0.03957200010518136, "p90": 0.040011999999478576, "mean": 0.040755799955149996, "iqr": 0.0013210001270635985, "raw_times": [0.04697199983638711, 0.03957200010518136, 0.040011999999478576, 0.03869099987241498, 0.038531999962287955], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04860299986830796, "peak_bytes": 151027712, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00156402587890625, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null}
5
+ {"ts": "2025-10-23T17:21:54Z", "run": "f94366cee8c44bb99e36bc03ca51ad49", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "llama_S2048_D4096", "batch": 1, "seq_len": 2048, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.03818100003627478, "p50": 0.039942000057635596, "p90": 0.04086199987796135, "mean": 0.044605999983104994, "iqr": 0.0025399997412023367, "raw_times": [0.06572299980689422, 0.04086199987796135, 0.03818100003627478, 0.039942000057635596, 0.038322000136759016], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.046752999878663104, "peak_bytes": 151011328, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00156402587890625, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null}
6
+ {"ts": "2025-10-23T17:21:54Z", "run": "f94366cee8c44bb99e36bc03ca51ad49", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "llama_S2048_D8192", "batch": 1, "seq_len": 2048, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.046271999963209964, "p50": 0.046712000084880856, "p90": 0.0469120000161638, "mean": 0.04688640001404565, "iqr": 0.00020900006347801536, "raw_times": [0.046712000084880856, 0.04783300005328783, 0.046271999963209964, 0.046702999952685786, 0.0469120000161638], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.049573000069358386, "peak_bytes": 302022656, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00152587890625, "mse": 1.1146068572998047e-05, "ref": "layer_norm_fp32"}, "err": null}
7
+ {"ts": "2025-10-23T17:21:54Z", "run": "f94366cee8c44bb99e36bc03ca51ad49", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "llama_S4096_D4096", "batch": 1, "seq_len": 4096, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.044022000110999215, "p50": 0.04556199996841315, "p90": 0.045742000111204106, "mean": 0.04578840002977813, "iqr": 0.000619000047663576, "raw_times": [0.04512300006354053, 0.04556199996841315, 0.045742000111204106, 0.044022000110999215, 0.04849299989473366], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04771299995809386, "peak_bytes": 302006272, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null}
8
+ {"ts": "2025-10-23T17:21:54Z", "run": "f94366cee8c44bb99e36bc03ca51ad49", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "llama_S4096_D8192", "batch": 1, "seq_len": 4096, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.20399000004545087, "p50": 0.20584999992934172, "p90": 0.20648999998229556, "mean": 0.20627999997486768, "iqr": 0.0007099999947968172, "raw_times": [0.20399000004545087, 0.2092899999297515, 0.20577999998749874, 0.20648999998229556, 0.20584999992934172], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.20653999990827288, "peak_bytes": 604012544, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00152587890625, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null}
layer_norm/impls/cells/benchmark.py CHANGED
@@ -2,13 +2,13 @@
2
  # requires-python = ">=3.10"
3
  # dependencies = [
4
  # "numpy",
5
- # "torch",
6
  # "kernels",
7
  # "kernels-benchmark-tools",
8
  # ]
9
  #
10
  # [tool.uv.sources]
11
- # kernels-benchmark-tools = { path = "/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools", editable = true }
12
  # ///
13
  import torch
14
  from kernels import get_kernel
 
2
  # requires-python = ">=3.10"
3
  # dependencies = [
4
  # "numpy",
5
+ # "torch==2.8.0",
6
  # "kernels",
7
  # "kernels-benchmark-tools",
8
  # ]
9
  #
10
  # [tool.uv.sources]
11
+ # kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
12
  # ///
13
  import torch
14
  from kernels import get_kernel
layer_norm/impls/hf_kernels_layer_norm.html CHANGED
@@ -706,6 +706,29 @@
706
  white-space: pre-wrap;
707
  color: var(--text-primary);
708
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
709
  .cell-stderr {
710
  background: var(--bg-error);
711
  border-left: 2px solid var(--border-error);
@@ -3556,7 +3579,12 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3556
  if(output){
3557
  output.classList.remove('output-stale');
3558
  let html='';
3559
- if(data.stdout) html+='<div class="cell-stdout">'+escapeHtml(data.stdout)+'</div>';
 
 
 
 
 
3560
  console.log('UV Logs:', data);
3561
  if(data.stderr) {
3562
  // Split UV logs from regular stderr
@@ -3678,27 +3706,27 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3678
  }
3679
  }
3680
 
3681
- // Live reload functionality (robust SSE handling)
3682
- (function(){
3683
- if (!('EventSource' in window)) {
3684
- console.warn('SSE not supported in this browser');
3685
- return;
3686
- }
3687
- let source = new EventSource('/events');
3688
- let isOpen = false;
3689
- source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
3690
- source.onmessage = function(e){
3691
- const msg=(e.data||'').trim(); if(!msg) return;
3692
- console.log('SSE message:', msg);
3693
- if (msg==='reload' || msg==='incremental') { location.reload(); }
3694
- // Ignore 'loading' to avoid premature reload loops
3695
- };
3696
- source.onerror = function(e){
3697
- // Let EventSource auto-reconnect instead of forcing a reload
3698
- if (isOpen) console.warn('SSE error after open, retrying...', e);
3699
- };
3700
- window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
3701
- })();
3702
 
3703
 
3704
  document.addEventListener('DOMContentLoaded', function() {
@@ -3829,7 +3857,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3829
  <div class="system-info">
3830
  <div class="system-info-header">Generated on:</div>
3831
  <div class="system-info-content">
3832
- Linux x86_64 | Linux-5.10.244-240.965.amzn2.x86_64-x86_64-with-glibc2.35
3833
  </div>
3834
  </div>
3835
 
@@ -3838,14 +3866,14 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3838
  <h1>HF Kernels LayerNorm Implementation</h1>
3839
  <p>Based on kernels-community <code>layer-norm</code> kernel.</p>
3840
  <h2>LayerNorm Benchmark (HF Kernels)</h2>
3841
- <div class="cell cell-failed" id="cell-benchmark">
3842
  <div class="cell-header">
3843
  <span class="collapse-indicators">
3844
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3845
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3846
- <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3847
  </span> |
3848
- Cell: benchmark | 0.05s | FAILED
3849
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3850
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3851
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3856,13 +3884,13 @@ Cell: benchmark | 0.05s | FAILED
3856
  <span class="c1"># requires-python = &quot;&gt;=3.10&quot;</span>
3857
  <span class="c1"># dependencies = [</span>
3858
  <span class="c1"># &quot;numpy&quot;,</span>
3859
- <span class="c1"># &quot;torch&quot;,</span>
3860
  <span class="c1"># &quot;kernels&quot;,</span>
3861
  <span class="c1"># &quot;kernels-benchmark-tools&quot;,</span>
3862
  <span class="c1"># ]</span>
3863
  <span class="c1">#</span>
3864
  <span class="c1"># [tool.uv.sources]</span>
3865
- <span class="c1"># kernels-benchmark-tools = { path = &quot;/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools&quot;, editable = true }</span>
3866
  <span class="c1"># ///</span>
3867
  <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
3868
  <span class="kn">from</span><span class="w"> </span><span class="nn">kernels</span><span class="w"> </span><span class="kn">import</span> <span class="n">get_kernel</span>
@@ -3920,9 +3948,28 @@ Cell: benchmark | 0.05s | FAILED
3920
  </div>
3921
  </div>
3922
  <div id="output-benchmark" class="cell-output">
3923
- <div class="cell-stderr"> × Failed to resolve script requirement
3924
- ╰─▶ Distribution not found at:
3925
- file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3926
  </div>
3927
  </div>
3928
  </div>
 
706
  white-space: pre-wrap;
707
  color: var(--text-primary);
708
  }
709
+
710
+ .cell-stdout {
711
+ background: var(--bg-tertiary);
712
+ padding: 0.75rem;
713
+ border-radius: 1px;
714
+ font-family: inherit;
715
+ font-size: 0.9rem;
716
+ color: var(--text-primary);
717
+
718
+ /* key bits */
719
+ overflow: auto; /* show scrollbars when needed */
720
+ max-width: 100%; /* respects whatever layout width you give it */
721
+ }
722
+
723
+ .cell-stdout .stdout-text {
724
+ margin: 0; /* reset pre default margin */
725
+ white-space: pre; /* keep line breaks, NO wrapping */
726
+ display: inline-block; /* shrink-to-content */
727
+ min-width: max-content; /* allow very long lines to define intrinsic width */
728
+ font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace;
729
+ tab-size: 2;
730
+ }
731
+
732
  .cell-stderr {
733
  background: var(--bg-error);
734
  border-left: 2px solid var(--border-error);
 
3579
  if(output){
3580
  output.classList.remove('output-stale');
3581
  let html='';
3582
+ if (data.stdout) {
3583
+ html += '<div class="cell-stdout"><pre class="stdout-text">'
3584
+ + escapeHtml(data.stdout)
3585
+ + '</pre></div>';
3586
+ }
3587
+
3588
  console.log('UV Logs:', data);
3589
  if(data.stderr) {
3590
  // Split UV logs from regular stderr
 
3706
  }
3707
  }
3708
 
3709
+ // // Live reload functionality (robust SSE handling)
3710
+ // (function(){
3711
+ // if (!('EventSource' in window)) {
3712
+ // console.warn('SSE not supported in this browser');
3713
+ // return;
3714
+ // }
3715
+ // let source = new EventSource('/events');
3716
+ // let isOpen = false;
3717
+ // source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
3718
+ // source.onmessage = function(e){
3719
+ // const msg=(e.data||'').trim(); if(!msg) return;
3720
+ // console.log('SSE message:', msg);
3721
+ // if (msg==='reload' || msg==='incremental') { location.reload(); }
3722
+ // // Ignore 'loading' to avoid premature reload loops
3723
+ // };
3724
+ // source.onerror = function(e){
3725
+ // // Let EventSource auto-reconnect instead of forcing a reload
3726
+ // if (isOpen) console.warn('SSE error after open, retrying...', e);
3727
+ // };
3728
+ // window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
3729
+ // })();
3730
 
3731
 
3732
  document.addEventListener('DOMContentLoaded', function() {
 
3857
  <div class="system-info">
3858
  <div class="system-info-header">Generated on:</div>
3859
  <div class="system-info-content">
3860
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
3861
  </div>
3862
  </div>
3863
 
 
3866
  <h1>HF Kernels LayerNorm Implementation</h1>
3867
  <p>Based on kernels-community <code>layer-norm</code> kernel.</p>
3868
  <h2>LayerNorm Benchmark (HF Kernels)</h2>
3869
+ <div class="cell" id="cell-benchmark">
3870
  <div class="cell-header">
3871
  <span class="collapse-indicators">
3872
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3873
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3874
+ <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3875
  </span> |
3876
+ Cell: benchmark | 5.52s
3877
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3878
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3879
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3884
  <span class="c1"># requires-python = &quot;&gt;=3.10&quot;</span>
3885
  <span class="c1"># dependencies = [</span>
3886
  <span class="c1"># &quot;numpy&quot;,</span>
3887
+ <span class="c1"># &quot;torch==2.8.0&quot;,</span>
3888
  <span class="c1"># &quot;kernels&quot;,</span>
3889
  <span class="c1"># &quot;kernels-benchmark-tools&quot;,</span>
3890
  <span class="c1"># ]</span>
3891
  <span class="c1">#</span>
3892
  <span class="c1"># [tool.uv.sources]</span>
3893
+ <span class="c1"># kernels-benchmark-tools = { path = &quot;../../../../../tools&quot;, editable = true }</span>
3894
  <span class="c1"># ///</span>
3895
  <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
3896
  <span class="kn">from</span><span class="w"> </span><span class="nn">kernels</span><span class="w"> </span><span class="kn">import</span> <span class="n">get_kernel</span>
 
3948
  </div>
3949
  </div>
3950
  <div id="output-benchmark" class="cell-output">
3951
+ <div class="cell-stdout"><pre class="stdout-text">impl wl p50(ms) ok
3952
+ hf_kernels_layer_norm llama_S1024_D4096 0.04 False
3953
+ hf_kernels_layer_norm llama_S1024_D8192 0.04 False
3954
+ hf_kernels_layer_norm llama_S2048_D4096 0.04 False
3955
+ hf_kernels_layer_norm llama_S2048_D8192 0.05 False
3956
+ hf_kernels_layer_norm llama_S4096_D4096 0.05 False
3957
+ hf_kernels_layer_norm llama_S4096_D8192 0.21 False
3958
+ hf_kernels_layer_norm llama_S512_D4096 0.04 False
3959
+ hf_kernels_layer_norm llama_S512_D8192 0.04 False
3960
+ </pre></div>
3961
+ <div class="uv-install-logs" id="uv-logs-benchmark">
3962
+ <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
3963
+ <div class="uv-logs-content" style="display: none;">
3964
+ Installed 10 packages in 16ms
3965
+ </div>
3966
+ </div>
3967
+ <div class="cell-stderr">Fetching 4 files: 0%| | 0/4 [00:00&lt;?, ?it/s]
3968
+ Fetching 4 files: 50%|█████ | 2/4 [00:01&lt;00:01, 1.02it/s]
3969
+ Fetching 4 files: 100%|██████████| 4/4 [00:01&lt;00:00, 2.04it/s]</div>
3970
+ <div class="cell-artifacts">
3971
+ <h4>Artifacts:</h4>
3972
+ <a href="artifacts/benchmark/ln.jsonl" class="artifact" target="_blank">ln.jsonl</a>
3973
  </div>
3974
  </div>
3975
  </div>
layer_norm/impls/torch_layer_norm.html CHANGED
@@ -706,6 +706,29 @@
706
  white-space: pre-wrap;
707
  color: var(--text-primary);
708
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
709
  .cell-stderr {
710
  background: var(--bg-error);
711
  border-left: 2px solid var(--border-error);
@@ -3556,7 +3579,12 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3556
  if(output){
3557
  output.classList.remove('output-stale');
3558
  let html='';
3559
- if(data.stdout) html+='<div class="cell-stdout">'+escapeHtml(data.stdout)+'</div>';
 
 
 
 
 
3560
  console.log('UV Logs:', data);
3561
  if(data.stderr) {
3562
  // Split UV logs from regular stderr
@@ -3678,27 +3706,27 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3678
  }
3679
  }
3680
 
3681
- // Live reload functionality (robust SSE handling)
3682
- (function(){
3683
- if (!('EventSource' in window)) {
3684
- console.warn('SSE not supported in this browser');
3685
- return;
3686
- }
3687
- let source = new EventSource('/events');
3688
- let isOpen = false;
3689
- source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
3690
- source.onmessage = function(e){
3691
- const msg=(e.data||'').trim(); if(!msg) return;
3692
- console.log('SSE message:', msg);
3693
- if (msg==='reload' || msg==='incremental') { location.reload(); }
3694
- // Ignore 'loading' to avoid premature reload loops
3695
- };
3696
- source.onerror = function(e){
3697
- // Let EventSource auto-reconnect instead of forcing a reload
3698
- if (isOpen) console.warn('SSE error after open, retrying...', e);
3699
- };
3700
- window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
3701
- })();
3702
 
3703
 
3704
  document.addEventListener('DOMContentLoaded', function() {
@@ -3829,7 +3857,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3829
  <div class="system-info">
3830
  <div class="system-info-header">Generated on:</div>
3831
  <div class="system-info-content">
3832
- Linux x86_64 | Linux-5.10.244-240.965.amzn2.x86_64-x86_64-with-glibc2.35
3833
  </div>
3834
  </div>
3835
 
@@ -3844,7 +3872,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3844
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3845
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3846
  </span> |
3847
- Cell: nv | 0.22s
3848
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3849
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3850
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3859,7 +3887,7 @@ Cell: nv | 0.22s
3859
  </div>
3860
  </div>
3861
  <div id="output-nv" class="cell-output">
3862
- <div class="cell-stdout">Wed Oct 22 08:58:23 2025
3863
  +-----------------------------------------------------------------------------------------+
3864
  | NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
3865
  |-----------------------------------------+------------------------+----------------------+
@@ -3868,7 +3896,7 @@ Cell: nv | 0.22s
3868
  | | | MIG M. |
3869
  |=========================================+========================+======================|
3870
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3871
- | N/A 26C P8 22W / 350W | 0MiB / 46068MiB | 0% Default |
3872
  | | | N/A |
3873
  +-----------------------------------------+------------------------+----------------------+
3874
 
@@ -3880,19 +3908,19 @@ Cell: nv | 0.22s
3880
  | No running processes found |
3881
  +-----------------------------------------------------------------------------------------+
3882
 
3883
- </div>
3884
  </div>
3885
  </div>
3886
 
3887
  <h2>LayerNorm Benchmark (PyTorch)</h2>
3888
- <div class="cell cell-failed" id="cell-benchmark">
3889
  <div class="cell-header">
3890
  <span class="collapse-indicators">
3891
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3892
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3893
- <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3894
  </span> |
3895
- Cell: benchmark | 0.01s | FAILED
3896
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3897
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3898
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3903,12 +3931,12 @@ Cell: benchmark | 0.01s | FAILED
3903
  <span class="c1"># requires-python = &quot;&gt;=3.10&quot;</span>
3904
  <span class="c1"># dependencies = [</span>
3905
  <span class="c1"># &quot;numpy&quot;,</span>
3906
- <span class="c1"># &quot;torch&quot;,</span>
3907
  <span class="c1"># &quot;kernels-benchmark-tools&quot;,</span>
3908
  <span class="c1"># ]</span>
3909
  <span class="c1">#</span>
3910
  <span class="c1"># [tool.uv.sources]</span>
3911
- <span class="c1"># kernels-benchmark-tools = { path = &quot;/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools&quot;, editable = true }</span>
3912
  <span class="c1"># ///</span>
3913
  <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
3914
  <span class="kn">import</span><span class="w"> </span><span class="nn">kernels_benchmark_tools</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="nn">kbt</span>
@@ -3946,9 +3974,25 @@ Cell: benchmark | 0.01s | FAILED
3946
  </div>
3947
  </div>
3948
  <div id="output-benchmark" class="cell-output">
3949
- <div class="cell-stderr"> × Failed to resolve script requirement
3950
- ╰─▶ Distribution not found at:
3951
- file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3952
  </div>
3953
  </div>
3954
  </div>
 
706
  white-space: pre-wrap;
707
  color: var(--text-primary);
708
  }
709
+
710
+ .cell-stdout {
711
+ background: var(--bg-tertiary);
712
+ padding: 0.75rem;
713
+ border-radius: 1px;
714
+ font-family: inherit;
715
+ font-size: 0.9rem;
716
+ color: var(--text-primary);
717
+
718
+ /* key bits */
719
+ overflow: auto; /* show scrollbars when needed */
720
+ max-width: 100%; /* respects whatever layout width you give it */
721
+ }
722
+
723
+ .cell-stdout .stdout-text {
724
+ margin: 0; /* reset pre default margin */
725
+ white-space: pre; /* keep line breaks, NO wrapping */
726
+ display: inline-block; /* shrink-to-content */
727
+ min-width: max-content; /* allow very long lines to define intrinsic width */
728
+ font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace;
729
+ tab-size: 2;
730
+ }
731
+
732
  .cell-stderr {
733
  background: var(--bg-error);
734
  border-left: 2px solid var(--border-error);
 
3579
  if(output){
3580
  output.classList.remove('output-stale');
3581
  let html='';
3582
+ if (data.stdout) {
3583
+ html += '<div class="cell-stdout"><pre class="stdout-text">'
3584
+ + escapeHtml(data.stdout)
3585
+ + '</pre></div>';
3586
+ }
3587
+
3588
  console.log('UV Logs:', data);
3589
  if(data.stderr) {
3590
  // Split UV logs from regular stderr
 
3706
  }
3707
  }
3708
 
3709
+ // // Live reload functionality (robust SSE handling)
3710
+ // (function(){
3711
+ // if (!('EventSource' in window)) {
3712
+ // console.warn('SSE not supported in this browser');
3713
+ // return;
3714
+ // }
3715
+ // let source = new EventSource('/events');
3716
+ // let isOpen = false;
3717
+ // source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
3718
+ // source.onmessage = function(e){
3719
+ // const msg=(e.data||'').trim(); if(!msg) return;
3720
+ // console.log('SSE message:', msg);
3721
+ // if (msg==='reload' || msg==='incremental') { location.reload(); }
3722
+ // // Ignore 'loading' to avoid premature reload loops
3723
+ // };
3724
+ // source.onerror = function(e){
3725
+ // // Let EventSource auto-reconnect instead of forcing a reload
3726
+ // if (isOpen) console.warn('SSE error after open, retrying...', e);
3727
+ // };
3728
+ // window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
3729
+ // })();
3730
 
3731
 
3732
  document.addEventListener('DOMContentLoaded', function() {
 
3857
  <div class="system-info">
3858
  <div class="system-info-header">Generated on:</div>
3859
  <div class="system-info-content">
3860
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
3861
  </div>
3862
  </div>
3863
 
 
3872
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3873
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3874
  </span> |
3875
+ Cell: nv | 0.21s
3876
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3877
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3878
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
 
3887
  </div>
3888
  </div>
3889
  <div id="output-nv" class="cell-output">
3890
+ <div class="cell-stdout"><pre class="stdout-text">Thu Oct 23 17:20:58 2025
3891
  +-----------------------------------------------------------------------------------------+
3892
  | NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
3893
  |-----------------------------------------+------------------------+----------------------+
 
3896
  | | | MIG M. |
3897
  |=========================================+========================+======================|
3898
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3899
+ | N/A 35C P0 70W / 350W | 0MiB / 46068MiB | 26% Default |
3900
  | | | N/A |
3901
  +-----------------------------------------+------------------------+----------------------+
3902
 
 
3908
  | No running processes found |
3909
  +-----------------------------------------------------------------------------------------+
3910
 
3911
+ </pre></div>
3912
  </div>
3913
  </div>
3914
 
3915
  <h2>LayerNorm Benchmark (PyTorch)</h2>
3916
+ <div class="cell" id="cell-benchmark">
3917
  <div class="cell-header">
3918
  <span class="collapse-indicators">
3919
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3920
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3921
+ <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3922
  </span> |
3923
+ Cell: benchmark | 4.50s
3924
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3925
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3926
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3931
  <span class="c1"># requires-python = &quot;&gt;=3.10&quot;</span>
3932
  <span class="c1"># dependencies = [</span>
3933
  <span class="c1"># &quot;numpy&quot;,</span>
3934
+ <span class="c1"># &quot;torch==2.8.0&quot;,</span>
3935
  <span class="c1"># &quot;kernels-benchmark-tools&quot;,</span>
3936
  <span class="c1"># ]</span>
3937
  <span class="c1">#</span>
3938
  <span class="c1"># [tool.uv.sources]</span>
3939
+ <span class="c1"># kernels-benchmark-tools = { path = &quot;../../../../../tools&quot;, editable = true }</span>
3940
  <span class="c1"># ///</span>
3941
  <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
3942
  <span class="kn">import</span><span class="w"> </span><span class="nn">kernels_benchmark_tools</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="nn">kbt</span>
 
3974
  </div>
3975
  </div>
3976
  <div id="output-benchmark" class="cell-output">
3977
+ <div class="cell-stdout"><pre class="stdout-text">impl wl p50(ms) ok
3978
+ torch_layer_norm llama_S1024_D4096 0.03 False
3979
+ torch_layer_norm llama_S1024_D8192 0.03 False
3980
+ torch_layer_norm llama_S2048_D4096 0.03 False
3981
+ torch_layer_norm llama_S2048_D8192 0.05 False
3982
+ torch_layer_norm llama_S4096_D4096 0.04 False
3983
+ torch_layer_norm llama_S4096_D8192 0.20 False
3984
+ torch_layer_norm llama_S512_D4096 0.03 False
3985
+ torch_layer_norm llama_S512_D8192 0.03 False
3986
+ </pre></div>
3987
+ <div class="uv-install-logs" id="uv-logs-benchmark">
3988
+ <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
3989
+ <div class="uv-logs-content" style="display: none;">
3990
+ Installed 37 packages in 245ms
3991
+ </div>
3992
+ </div>
3993
+ <div class="cell-artifacts">
3994
+ <h4>Artifacts:</h4>
3995
+ <a href="artifacts/benchmark/ln.jsonl" class="artifact" target="_blank">ln.jsonl</a>
3996
  </div>
3997
  </div>
3998
  </div>
layer_norm/results/artifacts/combine/latency.svg ADDED

Git LFS Details

  • SHA256: dcfdf13c3e578bdaaa538562f5d5eb70e73719a5e2194d8058bdda8e34157d54
  • Pointer size: 128 Bytes
  • Size of remote file: 949 Bytes
layer_norm/results/cells/combine.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # /// script
2
+ # requires-python = ">=3.10"
3
+ # dependencies = ["torch", "kernels-benchmark-tools", "matplotlib"]
4
+ # [tool.uv.sources]
5
+ # kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
6
+ # ///
7
+ from kernels_benchmark_tools.core.visuals import generate_combined_results
8
+
9
+ cache_env_map = {
10
+ "Torch LayerNorm": "UVNOTE_FILE_TORCH_LAYER_NORM_BENCHMARK",
11
+ "HF Kernels LayerNorm": "UVNOTE_FILE_HF_KERNELS_LAYER_NORM_BENCHMARK",
12
+ }
13
+
14
+ generate_combined_results(
15
+ cache_env_map=cache_env_map,
16
+ output_filename="ln.jsonl",
17
+ svg_filename="latency.svg",
18
+ figure_id="layernorm"
19
+ )
layer_norm/results/combined_results.html ADDED
The diff for this file is too large to render. See raw diff