diff --git "a/rotary/impls/torch_rotary.html" "b/rotary/impls/torch_rotary.html" --- "a/rotary/impls/torch_rotary.html" +++ "b/rotary/impls/torch_rotary.html" @@ -809,6 +809,14 @@ .artifact-preview svg { background: transparent; } + /* Invert SVG images in dark mode */ + :root[data-theme="dark"] .artifact-preview img[src$=".svg"] { + filter: invert(0.9) hue-rotate(180deg); + } + /* Keep SVG images readable in monocolor mode */ + :root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] { + filter: none; + } /* CSV table styling */ .artifact-csv { margin-top: 1rem; @@ -3871,7 +3879,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: ▼ output ▶ uv-logs | -Cell: nv | 0.20s +Cell: nv | 0.23s | Raw @@ -3887,7 +3895,7 @@ Cell: nv | 0.20s
-
Wed Oct 29 14:26:51 2025       
+
Wed Oct 29 15:50:24 2025       
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+
@@ -3896,7 +3904,7 @@ Cell: nv | 0.20s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   32C    P0             76W /  350W |       0MiB /  46068MiB |     11%      Default |
+| N/A   29C    P0             88W /  350W |       0MiB /  46068MiB |      0%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -3918,9 +3926,9 @@ Cell: nv | 0.20s
 
 ▼ code 
 ▼ output
- ▶ uv-logs
+ ▶ uv-logs
  | 
-Cell: benchmark | 3.84s
+Cell: benchmark | 7.50s
  | 
 
 Raw
@@ -3999,27 +4007,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S128_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.124ms      1261.56%       1.124ms       1.124ms             1  
-                                            torch_eager        14.73%     412.767us        99.72%       2.794ms       2.794ms       0.000us         0.00%      90.337us      90.337us             1  
-                                              aten::mul         6.25%     175.043us        11.07%     310.105us      12.921us      46.912us        52.64%      46.912us       1.955us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      46.912us        52.64%      46.912us       1.955us            24  
-                                            aten::copy_         4.12%     115.463us        61.76%       1.730ms      96.132us      28.993us        32.53%      30.210us       1.678us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.368us        25.10%      22.368us       1.864us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.215us        14.83%      13.215us       1.101us            12  
-                                            aten::clone         1.31%      36.692us        59.66%       1.671ms     278.565us       0.000us         0.00%       7.842us       1.307us             6  
-                                              aten::sub         1.68%      47.063us         2.72%      76.213us      12.702us       6.655us         7.47%       6.655us       1.109us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.625us         7.43%       6.625us       1.104us             6  
-                                              aten::add         1.39%      39.044us         2.34%      65.583us      10.930us       6.560us         7.36%       6.560us       1.093us             6  
-                                Activity Buffer Request        52.45%       1.470ms        52.45%       1.470ms       1.470ms       1.217us         1.37%       1.217us       1.217us             1  
-                                    aten::empty_strided         1.99%      55.621us         1.99%      55.621us       9.270us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         2.66%      74.431us         2.66%      74.431us      12.405us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.98%      83.492us         3.80%     106.494us       4.437us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.82%      23.002us         0.82%      23.002us       0.958us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         9.34%     261.675us         9.34%     261.675us       5.452us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.28%       7.890us         0.28%       7.890us       7.890us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.033ms      1157.58%       1.033ms       1.033ms             1  
+                                            torch_eager        14.26%     386.998us        99.70%       2.705ms       2.705ms       0.000us         0.00%      90.431us      90.431us             1  
+                                              aten::mul         6.08%     164.867us        10.45%     283.577us      11.816us      46.976us        52.65%      46.976us       1.957us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      46.976us        52.65%      46.976us       1.957us            24  
+                                            aten::copy_         3.96%     107.533us        62.14%       1.686ms      93.665us      28.959us        32.46%      30.175us       1.676us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.303us        25.00%      22.303us       1.859us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.280us        14.89%      13.280us       1.107us            12  
+                                            aten::clone         1.58%      42.971us        61.19%       1.660ms     276.703us       0.000us         0.00%       7.872us       1.312us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.656us         7.46%       6.656us       1.109us             6  
+                                              aten::sub         1.73%      46.871us         2.69%      72.911us      12.152us       6.656us         7.46%       6.656us       1.109us             6  
+                                              aten::add         1.35%      36.531us         2.16%      58.672us       9.779us       6.624us         7.42%       6.624us       1.104us             6  
+                                Activity Buffer Request        53.14%       1.442ms        53.14%       1.442ms       1.442ms       1.216us         1.36%       1.216us       1.216us             1  
+                                    aten::empty_strided         2.28%      61.772us         2.28%      61.772us      10.295us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         2.73%      74.144us         2.73%      74.144us      12.357us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.20%      86.920us         4.13%     112.081us       4.670us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.93%      25.161us         0.93%      25.161us       1.048us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         8.45%     229.371us         8.45%     229.371us       4.779us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.30%       8.270us         0.30%       8.270us       8.270us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.802ms
-Self CUDA time total: 89.120us
+Self CPU time total: 2.713ms
+Self CUDA time total: 89.215us
 
 
 
@@ -4029,27 +4037,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S128_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     968.092us      1071.28%     968.092us     968.092us             1  
-                                            torch_eager        12.50%     317.076us        99.79%       2.532ms       2.532ms       0.000us         0.00%      91.488us      91.488us             1  
-                                              aten::mul         6.07%     153.959us        10.35%     262.528us      10.939us      47.648us        52.73%      47.648us       1.985us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      47.648us        52.73%      47.648us       1.985us            24  
-                                            aten::copy_         4.16%     105.603us        65.14%       1.653ms      91.828us      29.344us        32.47%      30.464us       1.692us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.592us        25.00%      22.592us       1.883us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.376us        14.80%      13.376us       1.115us            12  
-                                            aten::clone         1.12%      28.391us        62.74%       1.592ms     265.351us       0.000us         0.00%       7.872us       1.312us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.752us         7.47%       6.752us       1.125us             6  
-                                              aten::sub         1.55%      39.261us         2.49%      63.132us      10.522us       6.688us         7.40%       6.688us       1.115us             6  
-                                              aten::add         1.47%      37.180us         2.35%      59.741us       9.957us       6.688us         7.40%       6.688us       1.115us             6  
-                                Activity Buffer Request        56.17%       1.425ms        56.17%       1.425ms       1.425ms       1.120us         1.24%       1.120us       1.120us             1  
-                                    aten::empty_strided         2.04%      51.662us         2.04%      51.662us       8.610us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         2.12%      53.792us         2.12%      53.792us       8.965us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         3.04%      77.153us         3.82%      96.932us       4.039us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.78%      19.779us         0.78%      19.779us       0.824us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.79%     223.101us         8.79%     223.101us       4.648us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.21%       5.210us         0.21%       5.210us       5.210us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     920.346us      1019.88%     920.346us     920.346us             1  
+                                            torch_eager        11.67%     287.669us        99.75%       2.459ms       2.459ms       0.000us         0.00%      91.392us      91.392us             1  
+                                              aten::mul         5.97%     147.150us        10.47%     258.131us      10.755us      47.681us        52.84%      47.681us       1.987us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      47.681us        52.84%      47.681us       1.987us            24  
+                                            aten::copy_         4.01%      98.743us        66.94%       1.650ms      91.665us      29.184us        32.34%      30.335us       1.685us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.433us        24.86%      22.433us       1.869us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.376us        14.82%      13.376us       1.115us            12  
+                                            aten::clone         0.96%      23.772us        64.13%       1.581ms     263.446us       0.000us         0.00%       7.902us       1.317us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.751us         7.48%       6.751us       1.125us             6  
+                                              aten::sub         1.51%      37.314us         2.51%      61.954us      10.326us       6.720us         7.45%       6.720us       1.120us             6  
+                                              aten::add         1.33%      32.821us         2.21%      54.451us       9.075us       6.656us         7.38%       6.656us       1.109us             6  
+                                Activity Buffer Request        58.20%       1.434ms        58.20%       1.434ms       1.434ms       1.151us         1.28%       1.151us       1.151us             1  
+                                    aten::empty_strided         1.33%      32.830us         1.33%      32.830us       5.472us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         2.21%      54.420us         2.21%      54.420us       9.070us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.84%      69.900us         3.65%      89.853us       3.744us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.81%      19.953us         0.81%      19.953us       0.831us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         8.92%     219.731us         8.92%     219.731us       4.578us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.25%       6.050us         0.25%       6.050us       6.050us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.538ms
-Self CUDA time total: 90.368us
+Self CPU time total: 2.465ms
+Self CUDA time total: 90.241us
 
 
 
@@ -4059,27 +4067,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S128_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.007ms      1071.77%       1.007ms       1.007ms             1  
-                                            torch_eager        12.81%     333.813us        99.77%       2.600ms       2.600ms       0.000us         0.00%      95.234us      95.234us             1  
-                                              aten::mul         6.17%     160.752us        10.75%     280.063us      11.669us      48.706us        51.86%      48.706us       2.029us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      48.706us        51.86%      48.706us       2.029us            24  
-                                            aten::copy_         4.30%     112.081us        64.85%       1.690ms      93.891us      30.753us        32.74%      32.065us       1.781us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      23.009us        24.50%      23.009us       1.917us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.463us        15.40%      14.463us       1.205us            12  
-                                            aten::clone         1.08%      28.070us        62.18%       1.621ms     270.093us       0.000us         0.00%       9.056us       1.509us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.744us         8.25%       7.744us       1.291us             6  
-                                              aten::sub         1.50%      39.201us         2.50%      65.063us      10.844us       7.263us         7.73%       7.263us       1.211us             6  
-                                              aten::add         1.40%      36.592us         2.30%      59.882us       9.980us       7.200us         7.67%       7.200us       1.200us             6  
-                                Activity Buffer Request        55.61%       1.449ms        55.61%       1.449ms       1.449ms       1.312us         1.40%       1.312us       1.312us             1  
-                                    aten::empty_strided         1.87%      48.773us         1.87%      48.773us       8.129us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         2.21%      57.593us         2.21%      57.593us       9.599us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.85%      74.230us         3.62%      94.450us       3.935us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.78%      20.220us         0.78%      20.220us       0.842us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         9.19%     239.464us         9.19%     239.464us       4.989us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.23%       5.970us         0.23%       5.970us       5.970us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     909.568us       966.47%     909.568us     909.568us             1  
+                                            torch_eager        11.23%     276.876us        99.79%       2.460ms       2.460ms       0.000us         0.00%      95.424us      95.424us             1  
+                                              aten::mul         6.27%     154.461us        10.66%     262.794us      10.950us      48.800us        51.85%      48.800us       2.033us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      48.800us        51.85%      48.800us       2.033us            24  
+                                            aten::copy_         4.02%      99.094us        67.67%       1.668ms      92.677us      30.912us        32.85%      32.224us       1.790us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      23.008us        24.45%      23.008us       1.917us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.400us        15.30%      14.400us       1.200us            12  
+                                            aten::clone         0.93%      22.950us        64.64%       1.593ms     265.583us       0.000us         0.00%       9.216us       1.536us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.904us         8.40%       7.904us       1.317us             6  
+                                              aten::sub         1.56%      38.564us         2.52%      62.034us      10.339us       7.200us         7.65%       7.200us       1.200us             6  
+                                              aten::add         1.24%      30.660us         2.12%      52.250us       8.708us       7.200us         7.65%       7.200us       1.200us             6  
+                                Activity Buffer Request        58.87%       1.451ms        58.87%       1.451ms       1.451ms       1.312us         1.39%       1.312us       1.312us             1  
+                                    aten::empty_strided         1.24%      30.531us         1.24%      30.531us       5.089us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         2.20%      54.240us         2.20%      54.240us       9.040us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.65%      65.401us         3.42%      84.323us       3.513us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.77%      18.922us         0.77%      18.922us       0.788us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         8.80%     216.993us         8.80%     216.993us       4.521us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.21%       5.190us         0.21%       5.190us       5.190us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.606ms
-Self CUDA time total: 93.922us
+Self CPU time total: 2.465ms
+Self CUDA time total: 94.112us
 
 
 
@@ -4089,27 +4097,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S128_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     976.889us       967.02%     976.889us     976.889us             1  
-                                            torch_eager        12.01%     329.416us        99.82%       2.739ms       2.739ms       0.000us         0.00%     102.333us     102.333us             1  
-                                              aten::mul         5.67%     155.545us         9.73%     266.927us      11.122us      52.800us        52.27%      52.800us       2.200us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      52.800us        52.27%      52.800us       2.200us            24  
-                                            aten::copy_         3.82%     104.765us        68.18%       1.871ms     103.922us      32.349us        32.02%      33.661us       1.870us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.574us        24.33%      24.574us       2.048us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      15.872us        15.71%      15.872us       1.323us            12  
-                                            aten::clone         1.07%      29.290us        65.23%       1.790ms     298.277us       0.000us         0.00%       9.087us       1.515us             6  
-                                              aten::sub         1.39%      38.150us         2.28%      62.431us      10.405us       7.936us         7.86%       7.936us       1.323us             6  
-                                              aten::add         1.24%      34.113us         2.07%      56.743us       9.457us       7.936us         7.86%       7.936us       1.323us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.775us         7.70%       7.775us       1.296us             6  
-                                Activity Buffer Request        52.33%       1.436ms        52.33%       1.436ms       1.436ms       1.312us         1.30%       1.312us       1.312us             1  
-                                    aten::empty_strided         1.16%      31.821us         1.16%      31.821us       5.304us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         9.42%     258.335us         9.42%     258.335us      43.056us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.63%      72.071us         3.33%      91.411us       3.809us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.70%      19.340us         0.70%      19.340us       0.806us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.39%     230.176us         8.39%     230.176us       4.795us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.18%       5.010us         0.18%       5.010us       5.010us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     892.572us       880.74%     892.572us     892.572us             1  
+                                            torch_eager        11.35%     283.366us        99.78%       2.492ms       2.492ms       0.000us         0.00%     102.687us     102.687us             1  
+                                              aten::mul         5.93%     148.202us        10.19%     254.513us      10.605us      52.956us        52.25%      52.956us       2.207us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      52.956us        52.25%      52.956us       2.207us            24  
+                                            aten::copy_         3.94%      98.395us        68.27%       1.705ms      94.725us      32.482us        32.05%      33.826us       1.879us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.641us        24.31%      24.641us       2.053us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      15.905us        15.69%      15.905us       1.325us            12  
+                                            aten::clone         0.86%      21.380us        65.50%       1.636ms     272.651us       0.000us         0.00%       9.185us       1.531us             6  
+                                              aten::add         1.24%      31.000us         2.12%      53.041us       8.840us       8.032us         7.93%       8.032us       1.339us             6  
+                                              aten::sub         1.40%      35.052us         2.32%      58.022us       9.670us       7.873us         7.77%       7.873us       1.312us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.841us         7.74%       7.841us       1.307us             6  
+                                Activity Buffer Request        52.43%       1.309ms        52.43%       1.309ms       1.309ms       1.344us         1.33%       1.344us       1.344us             1  
+                                    aten::empty_strided         1.32%      33.071us         1.32%      33.071us       5.512us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         9.52%     237.764us         9.52%     237.764us      39.627us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.60%      64.825us         3.35%      83.624us       3.484us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.75%      18.799us         0.75%      18.799us       0.783us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         8.44%     210.793us         8.44%     210.793us       4.392us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.22%       5.611us         0.22%       5.611us       5.611us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.744ms
-Self CUDA time total: 101.021us
+Self CPU time total: 2.498ms
+Self CUDA time total: 101.343us
 
 
 
@@ -4119,27 +4127,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S512_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     972.954us      1035.95%     972.954us     972.954us             1  
-                                            torch_eager        11.82%     323.628us        99.83%       2.734ms       2.734ms       0.000us         0.00%      95.231us      95.231us             1  
-                                              aten::mul         5.48%     150.092us         9.71%     265.906us      11.079us      48.958us        52.13%      48.958us       2.040us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      48.958us        52.13%      48.958us       2.040us            24  
-                                            aten::copy_         4.01%     109.805us        68.55%       1.878ms     104.307us      30.784us        32.78%      32.096us       1.783us            18  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     907.478us       966.25%     907.478us     907.478us             1  
+                                            torch_eager        11.02%     305.318us        99.81%       2.765ms       2.765ms       0.000us         0.00%      95.230us      95.230us             1  
+                                              aten::mul         5.24%     145.172us         9.20%     254.787us      10.616us      49.023us        52.20%      49.023us       2.043us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      49.023us        52.20%      49.023us       2.043us            24  
+                                            aten::copy_         3.74%     103.536us        70.23%       1.945ms     108.067us      30.719us        32.71%      32.031us       1.779us            18  
 void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.912us        24.40%      22.912us       1.909us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.177us        15.09%      14.177us       1.181us            12  
-                                            aten::clone         0.98%      26.740us        65.50%       1.794ms     299.012us       0.000us         0.00%       9.184us       1.531us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.872us         8.38%       7.872us       1.312us             6  
-                                              aten::sub         1.35%      37.100us         2.22%      60.781us      10.130us       7.106us         7.57%       7.106us       1.184us             6  
-                                              aten::add         1.26%      34.471us         2.07%      56.641us       9.440us       7.071us         7.53%       7.071us       1.178us             6  
-                                Activity Buffer Request        53.28%       1.459ms        53.28%       1.459ms       1.459ms       1.312us         1.40%       1.312us       1.312us             1  
-                                    aten::empty_strided         1.12%      30.591us         1.12%      30.591us       5.098us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         8.84%     242.034us         8.84%     242.034us      40.339us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.64%      72.284us         3.37%      92.363us       3.848us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.73%      20.079us         0.73%      20.079us       0.837us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.33%     228.067us         8.33%     228.067us       4.751us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.17%       4.701us         0.17%       4.701us       4.701us       0.000us         0.00%       0.000us       0.000us             1  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.176us        15.09%      14.176us       1.181us            12  
+                                            aten::clone         1.09%      30.110us        67.87%       1.880ms     313.329us       0.000us         0.00%       9.119us       1.520us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.807us         8.31%       7.807us       1.301us             6  
+                                              aten::sub         1.24%      34.480us         2.10%      58.270us       9.712us       7.104us         7.56%       7.104us       1.184us             6  
+                                              aten::add         1.09%      30.091us         1.87%      51.880us       8.647us       7.072us         7.53%       7.072us       1.179us             6  
+                                Activity Buffer Request        52.12%       1.444ms        52.12%       1.444ms       1.444ms       1.312us         1.40%       1.312us       1.312us             1  
+                                    aten::empty_strided         1.13%      31.430us         1.13%      31.430us       5.238us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        12.15%     336.439us        12.15%     336.439us      56.073us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.48%      68.768us         3.17%      87.719us       3.655us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.68%      18.951us         0.68%      18.951us       0.790us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         7.82%     216.674us         7.82%     216.674us       4.514us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.19%       5.210us         0.19%       5.210us       5.210us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.739ms
-Self CUDA time total: 93.919us
+Self CPU time total: 2.770ms
+Self CUDA time total: 93.918us
 
 
 
@@ -4149,27 +4157,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S512_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     940.506us       929.78%     940.506us     940.506us             1  
-                                            torch_eager        10.47%     280.203us        99.80%       2.672ms       2.672ms       0.000us         0.00%     102.466us     102.466us             1  
-                                              aten::mul         5.68%     151.942us         9.93%     265.874us      11.078us      52.767us        52.17%      52.767us       2.199us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      52.767us        52.17%      52.767us       2.199us            24  
-                                            aten::copy_         3.99%     106.699us        69.68%       1.866ms     103.641us      32.384us        32.01%      33.696us       1.872us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.672us        24.39%      24.672us       2.056us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      16.003us        15.82%      16.003us       1.334us            12  
-                                            aten::clone         0.80%      21.540us        66.42%       1.778ms     296.379us       0.000us         0.00%       9.024us       1.504us             6  
-                                              aten::sub         1.42%      38.052us         2.40%      64.133us      10.689us       8.002us         7.91%       8.002us       1.334us             6  
-                                              aten::add         1.23%      32.860us         2.10%      56.182us       9.364us       8.001us         7.91%       8.001us       1.333us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.712us         7.62%       7.712us       1.285us             6  
-                                Activity Buffer Request        54.45%       1.458ms        54.45%       1.458ms       1.458ms       1.312us         1.30%       1.312us       1.312us             1  
-                                    aten::empty_strided         1.14%      30.450us         1.14%      30.450us       5.075us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         8.74%     234.006us         8.74%     234.006us      39.001us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.58%      69.109us         3.28%      87.850us       3.660us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.70%      18.741us         0.70%      18.741us       0.781us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.61%     230.527us         8.61%     230.527us       4.803us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.20%       5.400us         0.20%       5.400us       5.400us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     917.786us       906.49%     917.786us     917.786us             1  
+                                            torch_eager        10.59%     290.695us        99.81%       2.741ms       2.741ms       0.000us         0.00%     102.558us     102.558us             1  
+                                              aten::mul         5.39%     148.136us         9.30%     255.477us      10.645us      52.735us        52.09%      52.735us       2.197us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      52.735us        52.09%      52.735us       2.197us            24  
+                                            aten::copy_         4.15%     114.085us        70.69%       1.941ms     107.839us      32.512us        32.11%      33.824us       1.879us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.704us        24.40%      24.704us       2.059us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      15.999us        15.80%      15.999us       1.333us            12  
+                                            aten::clone         0.78%      21.500us        67.65%       1.858ms     309.627us       0.000us         0.00%       9.120us       1.520us             6  
+                                              aten::sub         1.39%      38.270us         2.26%      62.070us      10.345us       8.063us         7.96%       8.063us       1.344us             6  
+                                              aten::add         1.13%      31.111us         1.93%      52.881us       8.813us       7.936us         7.84%       7.936us       1.323us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.808us         7.71%       7.808us       1.301us             6  
+                                Activity Buffer Request        52.71%       1.447ms        52.71%       1.447ms       1.447ms       1.312us         1.30%       1.312us       1.312us             1  
+                                    aten::empty_strided         1.19%      32.762us         1.19%      32.762us       5.460us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        11.56%     317.516us        11.56%     317.516us      52.919us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.38%      65.270us         3.07%      84.260us       3.511us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.69%      18.990us         0.69%      18.990us       0.791us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         7.83%     214.935us         7.83%     214.935us       4.478us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.19%       5.200us         0.19%       5.200us       5.200us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.677ms
-Self CUDA time total: 101.154us
+Self CPU time total: 2.746ms
+Self CUDA time total: 101.246us
 
 
 
@@ -4179,27 +4187,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S512_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.015ms       844.44%       1.015ms       1.015ms             1  
-                                            torch_eager        10.99%     299.529us        99.80%       2.720ms       2.720ms       0.000us         0.00%     122.045us     122.045us             1  
-                                              aten::mul         5.97%     162.734us        10.28%     280.227us      11.676us      61.856us        51.45%      61.856us       2.577us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      61.856us        51.45%      61.856us       2.577us            24  
-                                            aten::copy_         4.97%     135.364us        68.63%       1.870ms     103.912us      39.199us        32.61%      41.023us       2.279us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.704us        23.88%      28.704us       2.392us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.166us        15.94%      19.166us       1.597us            12  
-                                            aten::clone         0.84%      22.992us        64.39%       1.755ms     292.512us       0.000us         0.00%      12.319us       2.053us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.495us         8.73%      10.495us       1.749us             6  
-                                              aten::add         1.19%      32.530us         2.08%      56.691us       9.448us       9.598us         7.98%       9.598us       1.600us             6  
-                                              aten::sub         1.40%      38.111us         2.30%      62.811us      10.468us       9.568us         7.96%       9.568us       1.595us             6  
-                                Activity Buffer Request        52.53%       1.432ms        52.53%       1.432ms       1.432ms       1.824us         1.52%       1.824us       1.824us             1  
-                                    aten::empty_strided         1.18%      32.290us         1.18%      32.290us       5.382us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         8.53%     232.585us         8.53%     232.585us      38.764us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.71%      73.938us         3.49%      95.000us       3.958us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.77%      21.062us         0.77%      21.062us       0.878us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.70%     237.086us         8.70%     237.086us       4.939us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.20%       5.570us         0.20%       5.570us       5.570us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     896.601us       744.17%     896.601us     896.601us             1  
+                                            torch_eager        10.66%     286.835us        99.81%       2.687ms       2.687ms       0.000us         0.00%     122.275us     122.275us             1  
+                                              aten::mul         5.47%     147.118us         9.41%     253.291us      10.554us      61.985us        51.45%      61.985us       2.583us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      61.985us        51.45%      61.985us       2.583us            24  
+                                            aten::copy_         3.72%     100.260us        70.38%       1.894ms     105.246us      39.265us        32.59%      41.057us       2.281us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.834us        23.93%      28.834us       2.403us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.233us        15.96%      19.233us       1.603us            12  
+                                            aten::clone         0.83%      22.211us        67.89%       1.827ms     304.542us       0.000us         0.00%      12.223us       2.037us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.431us         8.66%      10.431us       1.738us             6  
+                                              aten::add         1.14%      30.799us         1.94%      52.140us       8.690us       9.632us         7.99%       9.632us       1.605us             6  
+                                              aten::sub         1.37%      36.770us         2.23%      59.970us       9.995us       9.601us         7.97%       9.601us       1.600us             6  
+                                Activity Buffer Request        53.18%       1.431ms        53.18%       1.431ms       1.431ms       1.792us         1.49%       1.792us       1.792us             1  
+                                    aten::empty_strided         1.21%      32.491us         1.21%      32.491us       5.415us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        11.26%     303.147us        11.26%     303.147us      50.525us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.49%      66.932us         3.17%      85.280us       3.553us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.68%      18.348us         0.68%      18.348us       0.765us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         7.81%     210.347us         7.81%     210.347us       4.382us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.19%       5.020us         0.19%       5.020us       5.020us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.726ms
-Self CUDA time total: 120.221us
+Self CPU time total: 2.692ms
+Self CUDA time total: 120.483us
 
 
 
@@ -4209,27 +4217,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S512_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     951.101us       552.87%     951.101us     951.101us             1  
-                                            torch_eager        11.67%     313.772us        99.81%       2.683ms       2.683ms       0.000us         0.00%     174.878us     174.878us             1  
-                                              aten::mul         5.73%     154.081us         9.89%     265.836us      11.076us      89.599us        52.08%      89.599us       3.733us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      89.599us        52.08%      89.599us       3.733us            24  
-                                            aten::copy_         3.89%     104.453us        68.40%       1.838ms     102.128us      57.664us        33.52%      60.512us       3.362us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.832us        23.74%      40.832us       3.403us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.767us        14.40%      24.767us       2.064us            12  
-                                            aten::clone         1.01%      27.120us        65.39%       1.758ms     292.937us       0.000us         0.00%      19.680us       3.280us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.832us         9.78%      16.832us       2.805us             6  
-                                              aten::add         1.27%      34.231us         2.14%      57.531us       9.588us      12.416us         7.22%      12.416us       2.069us             6  
-                                              aten::sub         1.34%      36.001us         2.22%      59.581us       9.930us      12.351us         7.18%      12.351us       2.059us             6  
-                                Activity Buffer Request        53.45%       1.437ms        53.45%       1.437ms       1.437ms       2.848us         1.66%       2.848us       2.848us             1  
-                                    aten::empty_strided         1.13%      30.290us         1.13%      30.290us       5.048us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         8.55%     229.865us         8.55%     229.865us      38.311us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.63%      70.721us         3.36%      90.322us       3.763us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.73%      19.601us         0.73%      19.601us       0.817us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.41%     225.976us         8.41%     225.976us       4.708us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.19%       5.001us         0.19%       5.001us       5.001us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     885.202us       514.56%     885.202us     885.202us             1  
+                                            torch_eager        18.81%     279.303us        99.64%       1.480ms       1.480ms       0.000us         0.00%     174.944us     174.944us             1  
+                                              aten::mul         9.70%     144.115us        16.98%     252.116us      10.505us      89.439us        51.99%      89.439us       3.727us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      89.439us        51.99%      89.439us       3.727us            24  
+                                            aten::copy_         6.85%     101.723us        47.28%     702.206us      39.011us      57.632us        33.50%      60.544us       3.364us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.608us        23.60%      40.608us       3.384us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.961us        14.51%      24.961us       2.080us            12  
+                                            aten::clone         1.41%      20.892us        42.46%     630.635us     105.106us       0.000us         0.00%      19.936us       3.323us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.024us         9.90%      17.024us       2.837us             6  
+                                              aten::add         2.07%      30.702us         3.51%      52.142us       8.690us      12.545us         7.29%      12.545us       2.091us             6  
+                                              aten::sub         2.41%      35.732us         4.00%      59.442us       9.907us      12.416us         7.22%      12.416us       2.069us             6  
+                                Activity Buffer Request        17.15%     254.675us        17.15%     254.675us     254.675us       2.912us         1.69%       2.912us       2.912us             1  
+                                    aten::empty_strided         2.07%      30.780us         2.07%      30.780us       5.130us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        19.36%     287.456us        19.36%     287.456us      47.909us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.32%      64.164us         5.58%      82.803us       3.450us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.26%      18.639us         1.26%      18.639us       0.777us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        14.24%     211.503us        14.24%     211.503us       4.406us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.36%       5.410us         0.36%       5.410us       5.410us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.688ms
-Self CUDA time total: 172.030us
+Self CPU time total: 1.485ms
+Self CUDA time total: 172.032us
 
 
 
@@ -4239,27 +4247,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S2048_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     927.996us       768.63%     927.996us     927.996us             1  
-                                            torch_eager        20.13%     284.369us        99.65%       1.408ms       1.408ms       0.000us         0.00%     122.557us     122.557us             1  
-                                              aten::mul        10.77%     152.163us        18.72%     264.405us      11.017us      62.048us        51.39%      62.048us       2.585us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      62.048us        51.39%      62.048us       2.585us            24  
-                                            aten::copy_         7.56%     106.823us        43.43%     613.475us      34.082us      39.390us        32.63%      41.213us       2.290us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.864us        23.91%      28.864us       2.405us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.296us        15.98%      19.296us       1.608us            12  
-                                            aten::clone         1.39%      19.620us        37.04%     523.281us      87.213us       0.000us         0.00%      12.349us       2.058us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.526us         8.72%      10.526us       1.754us             6  
-                                              aten::add         2.28%      32.232us         3.86%      54.523us       9.087us       9.696us         8.03%       9.696us       1.616us             6  
-                                              aten::sub         2.48%      35.082us         4.10%      57.982us       9.664us       9.600us         7.95%       9.600us       1.600us             6  
-                                Activity Buffer Request        14.96%     211.375us        14.96%     211.375us     211.375us       1.823us         1.51%       1.823us       1.823us             1  
-                                    aten::empty_strided         2.07%      29.290us         2.07%      29.290us       4.882us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        16.27%     229.815us        16.27%     229.815us      38.302us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.68%      66.168us         5.95%      84.051us       3.502us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.27%      17.883us         1.27%      17.883us       0.745us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        15.78%     222.895us        15.78%     222.895us       4.644us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.35%       4.970us         0.35%       4.970us       4.970us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     907.735us       751.64%     907.735us     907.735us             1  
+                                            torch_eager        18.35%     272.536us        99.65%       1.480ms       1.480ms       0.000us         0.00%     122.527us     122.527us             1  
+                                              aten::mul         9.89%     146.883us        17.48%     259.553us      10.815us      62.078us        51.40%      62.078us       2.587us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      62.078us        51.40%      62.078us       2.587us            24  
+                                            aten::copy_         6.65%      98.730us        45.99%     682.885us      37.938us      39.328us        32.57%      41.088us       2.283us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.832us        23.87%      28.832us       2.403us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.361us        16.03%      19.361us       1.613us            12  
+                                            aten::clone         2.58%      38.249us        42.54%     631.763us     105.294us       0.000us         0.00%      12.256us       2.043us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.496us         8.69%      10.496us       1.749us             6  
+                                              aten::add         2.13%      31.663us         3.60%      53.483us       8.914us       9.728us         8.06%       9.728us       1.621us             6  
+                                              aten::sub         2.35%      34.954us         3.91%      58.043us       9.674us       9.633us         7.98%       9.633us       1.605us             6  
+                                Activity Buffer Request        16.88%     250.706us        16.88%     250.706us     250.706us       1.760us         1.46%       1.760us       1.760us             1  
+                                    aten::empty_strided         2.15%      31.912us         2.15%      31.912us       5.319us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        18.48%     274.437us        18.48%     274.437us      45.739us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.31%      63.964us         5.59%      83.053us       3.461us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.29%      19.089us         1.29%      19.089us       0.795us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        14.59%     216.591us        14.59%     216.591us       4.512us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.35%       5.220us         0.35%       5.220us       5.220us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.413ms
-Self CUDA time total: 120.734us
+Self CPU time total: 1.485ms
+Self CUDA time total: 120.767us
 
 
 
@@ -4269,27 +4277,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S2048_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     941.367us       547.21%     941.367us     941.367us             1  
-                                            torch_eager        19.36%     280.543us        99.66%       1.444ms       1.444ms       0.000us         0.00%     174.877us     174.877us             1  
-                                              aten::mul        10.67%     154.592us        18.48%     267.677us      11.153us      89.535us        52.05%      89.535us       3.731us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      89.535us        52.05%      89.535us       3.731us            24  
-                                            aten::copy_         7.38%     106.934us        44.27%     641.329us      35.629us      57.694us        33.54%      60.542us       3.363us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.701us        23.66%      40.701us       3.392us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.800us        14.42%      24.800us       2.067us            12  
-                                            aten::clone         1.44%      20.830us        37.97%     550.103us      91.684us       0.000us         0.00%      19.841us       3.307us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.993us         9.88%      16.993us       2.832us             6  
-                                              aten::add         2.36%      34.121us         3.90%      56.522us       9.420us      12.448us         7.24%      12.448us       2.075us             6  
-                                              aten::sub         2.56%      37.161us         4.27%      61.881us      10.313us      12.352us         7.18%      12.352us       2.059us             6  
-                                Activity Buffer Request        16.20%     234.686us        16.20%     234.686us     234.686us       2.848us         1.66%       2.848us       2.848us             1  
-                                    aten::empty_strided         2.02%      29.270us         2.02%      29.270us       4.878us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        15.95%     231.027us        15.95%     231.027us      38.505us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.63%      67.091us         5.92%      85.764us       3.573us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.29%      18.673us         1.29%      18.673us       0.778us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        15.80%     228.888us        15.80%     228.888us       4.768us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.34%       4.980us         0.34%       4.980us       4.980us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     894.399us       519.81%     894.399us     894.399us             1  
+                                            torch_eager        10.51%     278.801us        99.79%       2.648ms       2.648ms       0.000us         0.00%     174.911us     174.911us             1  
+                                              aten::mul         5.47%     145.104us         9.49%     251.734us      10.489us      89.535us        52.04%      89.535us       3.731us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      89.535us        52.04%      89.535us       3.731us            24  
+                                            aten::copy_         3.73%      98.901us        70.34%       1.866ms     103.682us      57.696us        33.53%      60.544us       3.364us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.704us        23.66%      40.704us       3.392us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.832us        14.43%      24.832us       2.069us            12  
+                                            aten::clone         0.84%      22.190us        67.69%       1.796ms     299.337us       0.000us         0.00%      19.840us       3.307us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.992us         9.88%      16.992us       2.832us             6  
+                                              aten::sub         1.44%      38.162us         2.33%      61.942us      10.324us      12.448us         7.23%      12.448us       2.075us             6  
+                                              aten::add         1.15%      30.549us         1.97%      52.171us       8.695us      12.384us         7.20%      12.384us       2.064us             6  
+                                Activity Buffer Request        54.02%       1.433ms        54.02%       1.433ms       1.433ms       2.848us         1.66%       2.848us       2.848us             1  
+                                    aten::empty_strided         1.13%      30.052us         1.13%      30.052us       5.009us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        10.37%     275.065us        10.37%     275.065us      45.844us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.49%      65.991us         3.19%      84.601us       3.525us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.70%      18.610us         0.70%      18.610us       0.775us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         7.95%     211.023us         7.95%     211.023us       4.396us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.21%       5.640us         0.21%       5.640us       5.640us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.449ms
-Self CUDA time total: 172.029us
+Self CPU time total: 2.653ms
+Self CUDA time total: 172.063us
 
 
 
@@ -4299,27 +4307,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S2048_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     950.141us       334.64%     950.141us     950.141us             1  
-                                            torch_eager        11.47%     310.562us        99.82%       2.702ms       2.702ms       0.000us         0.00%     302.012us     302.012us             1  
-                                              aten::mul         5.57%     150.802us         9.64%     260.955us      10.873us     133.822us        47.13%     133.822us       5.576us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     133.822us        47.13%     133.822us       5.576us            24  
-                                            aten::copy_         3.88%     105.155us        69.00%       1.868ms     103.782us     109.151us        38.44%     127.231us       7.068us            18  
-                                            aten::clone         0.99%      26.749us        66.03%       1.788ms     297.926us       0.000us         0.00%      69.886us      11.648us             6  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.345us        20.20%      57.345us       4.779us            12  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      51.806us        18.25%      51.806us       8.634us             6  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      40.959us        14.43%      40.959us       3.413us            12  
-                                              aten::sub         1.29%      34.831us         2.15%      58.172us       9.695us      20.607us         7.26%      20.607us       3.435us             6  
-                                              aten::add         1.26%      34.242us         2.11%      57.104us       9.517us      20.352us         7.17%      20.352us       3.392us             6  
-                                Activity Buffer Request        54.34%       1.471ms        54.34%       1.471ms       1.471ms      18.080us         6.37%      18.080us      18.080us             1  
-                                    aten::empty_strided         1.13%      30.492us         1.13%      30.492us       5.082us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         8.33%     225.535us         8.33%     225.535us      37.589us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.63%      71.143us         3.33%      90.164us       3.757us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.70%      19.021us         0.70%      19.021us       0.793us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.22%     222.598us         8.22%     222.598us       4.637us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.18%       4.920us         0.18%       4.920us       4.920us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     888.595us       313.84%     888.595us     888.595us             1  
+                                            torch_eager        18.64%     271.692us        99.64%       1.452ms       1.452ms       0.000us         0.00%     301.536us     301.536us             1  
+                                              aten::mul         9.98%     145.418us        17.29%     252.060us      10.503us     132.896us        46.94%     132.896us       5.537us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     132.896us        46.94%     132.896us       5.537us            24  
+                                            aten::copy_         6.89%     100.362us        46.38%     676.084us      37.560us     109.376us        38.63%     127.776us       7.099us            18  
+                                            aten::clone         1.48%      21.511us        41.22%     600.853us     100.142us       0.000us         0.00%      70.560us      11.760us             6  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.216us        20.21%      57.216us       4.768us            12  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      52.160us        18.42%      52.160us       8.693us             6  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      40.864us        14.43%      40.864us       3.405us            12  
+                                              aten::sub         2.41%      35.143us         4.02%      58.572us       9.762us      20.512us         7.24%      20.512us       3.419us             6  
+                                              aten::add         2.12%      30.932us         3.62%      52.783us       8.797us      20.352us         7.19%      20.352us       3.392us             6  
+                                Activity Buffer Request        16.97%     247.406us        16.97%     247.406us     247.406us      18.400us         6.50%      18.400us      18.400us             1  
+                                    aten::empty_strided         2.15%      31.370us         2.15%      31.370us       5.228us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        18.35%     267.496us        18.35%     267.496us      44.583us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.85%      70.742us         6.06%      88.302us       3.679us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.20%      17.560us         1.20%      17.560us       0.732us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        14.59%     212.742us        14.59%     212.742us       4.432us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.36%       5.280us         0.36%       5.280us       5.280us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.707ms
-Self CUDA time total: 283.932us
+Self CPU time total: 1.458ms
+Self CUDA time total: 283.136us
 
 
 
@@ -4329,27 +4337,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S2048_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     966.098us       169.87%     966.098us     966.098us             1  
-                                            torch_eager        20.40%     290.715us        99.64%       1.420ms       1.420ms       0.000us         0.00%     592.377us     592.377us             1  
-                                            aten::copy_         7.41%     105.615us        41.73%     594.574us      33.032us     275.293us        48.40%     298.941us      16.608us            18  
-                                              aten::mul        10.90%     155.244us        18.92%     269.648us      11.235us     227.071us        39.93%     227.071us       9.461us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     227.071us        39.93%     227.071us       9.461us            24  
-                                            aten::clone         1.44%      20.483us        35.30%     502.923us      83.821us       0.000us         0.00%     207.134us      34.522us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     183.486us        32.26%     183.486us      30.581us             6  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      91.807us        16.14%      91.807us       7.651us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      66.365us        11.67%      66.365us       5.530us            12  
-                                              aten::sub         2.66%      37.929us         4.43%      63.131us      10.522us      33.790us         5.94%      33.790us       5.632us             6  
-                                              aten::add         2.47%      35.251us         4.15%      59.172us       9.862us      32.575us         5.73%      32.575us       5.429us             6  
-                                Activity Buffer Request        13.81%     196.814us        13.81%     196.814us     196.814us      23.648us         4.16%      23.648us      23.648us             1  
-                                    aten::empty_strided         2.02%      28.790us         2.02%      28.790us       4.798us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        15.63%     222.685us        15.63%     222.685us      37.114us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         5.20%      74.092us         6.55%      93.282us       3.887us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.35%      19.190us         1.35%      19.190us       0.800us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        16.35%     232.987us        16.35%     232.987us       4.854us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.36%       5.080us         0.36%       5.080us       5.080us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     944.856us       167.33%     944.856us     944.856us             1  
+                                            torch_eager        19.10%     286.874us        99.66%       1.497ms       1.497ms       0.000us         0.00%     588.218us     588.218us             1  
+                                            aten::copy_         6.48%      97.352us        44.49%     668.224us      37.124us     273.885us        48.50%     297.437us      16.524us            18  
+                                              aten::mul        11.54%     173.280us        19.20%     288.361us      12.015us     224.990us        39.84%     224.990us       9.375us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     224.990us        39.84%     224.990us       9.375us            24  
+                                            aten::clone         1.34%      20.121us        39.51%     593.393us      98.899us       0.000us         0.00%     206.910us      34.485us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     183.358us        32.47%     183.358us      30.560us             6  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      90.527us        16.03%      90.527us       7.544us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      65.791us        11.65%      65.791us       5.483us            12  
+                                              aten::sub         2.45%      36.872us         4.07%      61.073us      10.179us      33.407us         5.92%      33.407us       5.568us             6  
+                                              aten::add         2.13%      32.018us         3.64%      54.631us       9.105us      32.384us         5.74%      32.384us       5.397us             6  
+                                Activity Buffer Request        16.63%     249.816us        16.63%     249.816us     249.816us      23.552us         4.17%      23.552us      23.552us             1  
+                                    aten::empty_strided         2.02%      30.350us         2.02%      30.350us       5.058us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        17.28%     259.545us        17.28%     259.545us      43.258us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.52%      67.913us         5.81%      87.211us       3.634us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.28%      19.298us         1.28%      19.298us       0.804us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        14.87%     223.406us        14.87%     223.406us       4.654us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.34%       5.141us         0.34%       5.141us       5.141us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.425ms
-Self CUDA time total: 568.729us
+Self CPU time total: 1.502ms
+Self CUDA time total: 564.666us
 
 
 
@@ -4359,27 +4367,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S128_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     975.032us      1053.20%     975.032us     975.032us             1  
-                                            torch_eager        19.78%     289.798us        99.66%       1.460ms       1.460ms       0.000us         0.00%      93.698us      93.698us             1  
-                                              aten::mul        11.08%     162.260us        19.21%     281.475us      11.728us      49.665us        53.65%      49.665us       2.069us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      49.665us        53.65%      49.665us       2.069us            24  
-                                            aten::copy_         7.16%     104.830us        42.02%     615.673us      34.204us      29.441us        31.80%      30.561us       1.698us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.657us        24.47%      22.657us       1.888us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.472us        14.55%      13.472us       1.123us            12  
-                                            aten::clone         1.39%      20.311us        36.25%     531.032us      88.505us       0.000us         0.00%       7.904us       1.317us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.784us         7.33%       6.784us       1.131us             6  
-                                              aten::add         2.30%      33.730us         3.98%      58.302us       9.717us       6.752us         7.29%       6.752us       1.125us             6  
-                                              aten::sub         2.57%      37.640us         4.45%      65.262us      10.877us       6.720us         7.26%       6.720us       1.120us             6  
-                                Activity Buffer Request        14.75%     216.135us        14.75%     216.135us     216.135us       1.120us         1.21%       1.120us       1.120us             1  
-                                    aten::empty_strided         2.59%      37.931us         2.59%      37.931us       6.322us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        15.29%     223.986us        15.29%     223.986us      37.331us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.89%      71.623us         6.23%      91.274us       3.803us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.34%      19.651us         1.34%      19.651us       0.819us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        16.53%     242.131us        16.53%     242.131us       5.044us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.34%       5.040us         0.34%       5.040us       5.040us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     916.448us       990.96%     916.448us     916.448us             1  
+                                            torch_eager        10.63%     281.892us        99.80%       2.647ms       2.647ms       0.000us         0.00%      93.601us      93.601us             1  
+                                              aten::mul         5.58%     148.028us         9.67%     256.571us      10.690us      49.634us        53.67%      49.634us       2.068us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      49.634us        53.67%      49.634us       2.068us            24  
+                                            aten::copy_         3.99%     105.971us        69.88%       1.854ms     102.991us      29.439us        31.83%      30.559us       1.698us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.655us        24.50%      22.655us       1.888us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.408us        14.50%      13.408us       1.117us            12  
+                                            aten::clone         0.82%      21.802us        66.79%       1.772ms     295.325us       0.000us         0.00%       7.904us       1.317us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.784us         7.34%       6.784us       1.131us             6  
+                                              aten::sub         1.36%      36.061us         2.24%      59.441us       9.907us       6.720us         7.27%       6.720us       1.120us             6  
+                                              aten::add         1.25%      33.260us         2.10%      55.590us       9.265us       6.688us         7.23%       6.688us       1.115us             6  
+                                Activity Buffer Request        54.00%       1.433ms        54.00%       1.433ms       1.433ms       1.120us         1.21%       1.120us       1.120us             1  
+                                    aten::empty_strided         1.13%      29.861us         1.13%      29.861us       4.977us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         9.52%     252.488us         9.52%     252.488us      42.081us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.59%      68.801us         3.33%      88.471us       3.686us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.74%      19.670us         0.74%      19.670us       0.820us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         8.18%     216.965us         8.18%     216.965us       4.520us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.20%       5.410us         0.20%       5.410us       5.410us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.465ms
-Self CUDA time total: 92.578us
+Self CPU time total: 2.653ms
+Self CUDA time total: 92.481us
 
 
 
@@ -4389,27 +4397,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S128_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     963.643us      1001.81%     963.643us     963.643us             1  
-                                            torch_eager        11.60%     311.071us        99.82%       2.676ms       2.676ms       0.000us         0.00%      97.534us      97.534us             1  
-                                              aten::mul         5.66%     151.593us        10.00%     268.127us      11.172us      51.103us        53.13%      51.103us       2.129us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      51.103us        53.13%      51.103us       2.129us            24  
-                                            aten::copy_         3.93%     105.441us        68.13%       1.826ms     101.459us      30.911us        32.14%      32.255us       1.792us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      23.007us        23.92%      23.007us       1.917us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.176us        14.74%      14.176us       1.181us            12  
-                                            aten::clone         1.04%      27.830us        65.21%       1.748ms     291.325us       0.000us         0.00%       9.248us       1.541us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.904us         8.22%       7.904us       1.317us             6  
-                                              aten::sub         1.38%      37.040us         2.30%      61.581us      10.264us       7.103us         7.38%       7.103us       1.184us             6  
-                                              aten::add         1.19%      32.000us         2.05%      54.860us       9.143us       7.073us         7.35%       7.073us       1.179us             6  
-                                Activity Buffer Request        53.57%       1.436ms        53.57%       1.436ms       1.436ms       1.344us         1.40%       1.344us       1.344us             1  
-                                    aten::empty_strided         1.19%      31.921us         1.19%      31.921us       5.320us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         8.14%     218.236us         8.14%     218.236us      36.373us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.76%      74.059us         3.52%      94.290us       3.929us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.75%      20.231us         0.75%      20.231us       0.843us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.60%     230.408us         8.60%     230.408us       4.800us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.18%       4.700us         0.18%       4.700us       4.700us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     888.809us       923.35%     888.809us     888.809us             1  
+                                            torch_eager        19.05%     273.129us        99.67%       1.429ms       1.429ms       0.000us         0.00%      97.571us      97.571us             1  
+                                              aten::mul        10.09%     144.695us        17.61%     252.506us      10.521us      51.232us        53.22%      51.232us       2.135us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      51.232us        53.22%      51.232us       2.135us            24  
+                                            aten::copy_         6.72%      96.301us        45.37%     650.385us      36.132us      30.786us        31.98%      32.098us       1.783us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.944us        23.84%      22.944us       1.912us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.241us        14.79%      14.241us       1.187us            12  
+                                            aten::clone         1.39%      19.911us        40.43%     579.513us      96.586us       0.000us         0.00%       9.154us       1.526us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.842us         8.15%       7.842us       1.307us             6  
+                                              aten::add         2.26%      32.360us         3.79%      54.320us       9.053us       7.136us         7.41%       7.136us       1.189us             6  
+                                              aten::sub         2.55%      36.551us         4.17%      59.791us       9.965us       7.105us         7.38%       7.105us       1.184us             6  
+                                Activity Buffer Request        16.56%     237.415us        16.56%     237.415us     237.415us       1.312us         1.36%       1.312us       1.312us             1  
+                                    aten::empty_strided         2.18%      31.230us         2.18%      31.230us       5.205us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        17.96%     257.447us        17.96%     257.447us      42.908us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.71%      67.539us         6.11%      87.581us       3.649us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.40%      20.042us         1.40%      20.042us       0.835us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        14.80%     212.233us        14.80%     212.233us       4.422us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.33%       4.690us         0.33%       4.690us       4.690us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.681ms
-Self CUDA time total: 96.190us
+Self CPU time total: 1.434ms
+Self CUDA time total: 96.259us
 
 
 
@@ -4419,27 +4427,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S128_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     984.120us       950.08%     984.120us     984.120us             1  
-                                            torch_eager        21.32%     307.609us        99.66%       1.438ms       1.438ms       0.000us         0.00%     104.863us     104.863us             1  
-                                              aten::mul        11.11%     160.241us        19.03%     274.535us      11.439us      55.232us        53.32%      55.232us       2.301us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      55.232us        53.32%      55.232us       2.301us            24  
-                                            aten::copy_         7.56%     109.063us        40.34%     581.983us      32.332us      32.383us        31.26%      33.663us       1.870us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.639us        23.79%      24.639us       2.053us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      15.968us        15.42%      15.968us       1.331us            12  
-                                            aten::clone         1.50%      21.672us        34.18%     493.044us      82.174us       0.000us         0.00%       9.024us       1.504us             6  
-                                              aten::add         2.60%      37.520us         4.33%      62.511us      10.418us       8.031us         7.75%       8.031us       1.339us             6  
-                                              aten::sub         2.72%      39.231us         4.56%      65.841us      10.973us       7.937us         7.66%       7.937us       1.323us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.744us         7.48%       7.744us       1.291us             6  
-                                Activity Buffer Request        13.05%     188.244us        13.05%     188.244us     188.244us       1.280us         1.24%       1.280us       1.280us             1  
-                                    aten::empty_strided         2.28%      32.882us         2.28%      32.882us       5.480us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        14.94%     215.555us        14.94%     215.555us      35.926us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.93%      71.162us         6.28%      90.612us       3.776us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.35%      19.450us         1.35%      19.450us       0.810us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        16.29%     235.016us        16.29%     235.016us       4.896us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.34%       4.880us         0.34%       4.880us       4.880us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     903.536us       870.95%     903.536us     903.536us             1  
+                                            torch_eager        18.87%     271.956us        99.65%       1.436ms       1.436ms       0.000us         0.00%     105.053us     105.053us             1  
+                                              aten::mul        10.20%     146.935us        17.83%     256.897us      10.704us      55.262us        53.27%      55.262us       2.303us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      55.262us        53.27%      55.262us       2.303us            24  
+                                            aten::copy_         6.83%      98.437us        45.05%     649.198us      36.067us      32.478us        31.31%      33.790us       1.877us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.670us        23.78%      24.670us       2.056us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      16.001us        15.42%      16.001us       1.333us            12  
+                                            aten::clone         1.50%      21.580us        40.06%     577.333us      96.222us       0.000us         0.00%       9.120us       1.520us             6  
+                                              aten::sub         2.49%      35.841us         4.72%      67.992us      11.332us       8.001us         7.71%       8.001us       1.333us             6  
+                                              aten::add         2.31%      33.350us         3.86%      55.670us       9.278us       8.000us         7.71%       8.000us       1.333us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.808us         7.53%       7.808us       1.301us             6  
+                                Activity Buffer Request        16.46%     237.265us        16.46%     237.265us     237.265us       1.312us         1.26%       1.312us       1.312us             1  
+                                    aten::empty_strided         2.16%      31.090us         2.16%      31.090us       5.182us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        17.50%     252.196us        17.50%     252.196us      42.033us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.40%      63.461us         5.67%      81.650us       3.402us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.26%      18.189us         1.26%      18.189us       0.758us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        15.66%     225.733us        15.66%     225.733us       4.703us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.35%       5.060us         0.35%       5.060us       5.060us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.443ms
-Self CUDA time total: 103.583us
+Self CPU time total: 1.441ms
+Self CUDA time total: 103.741us
 
 
 
@@ -4449,27 +4457,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S128_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     935.122us       757.84%     935.122us     935.122us             1  
-                                            torch_eager        19.99%     283.519us        99.60%       1.412ms       1.412ms       0.000us         0.00%     125.153us     125.153us             1  
-                                              aten::mul        10.97%     155.634us        18.77%     266.135us      11.089us      65.024us        52.70%      65.024us       2.709us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      65.024us        52.70%      65.024us       2.709us            24  
-                                            aten::copy_         7.53%     106.809us        43.10%     611.203us      33.956us      39.201us        31.77%      40.961us       2.276us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.769us        23.31%      28.769us       2.397us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.168us        15.53%      19.168us       1.597us            12  
-                                            aten::clone         1.50%      21.262us        37.00%     524.722us      87.454us       0.000us         0.00%      12.192us       2.032us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.432us         8.45%      10.432us       1.739us             6  
-                                              aten::add         2.41%      34.151us         3.94%      55.922us       9.320us       9.664us         7.83%       9.664us       1.611us             6  
-                                              aten::sub         2.49%      35.371us         4.21%      59.711us       9.952us       9.504us         7.70%       9.504us       1.584us             6  
-                                Activity Buffer Request        14.55%     206.375us        14.55%     206.375us     206.375us       1.760us         1.43%       1.760us       1.760us             1  
-                                    aten::empty_strided         2.12%      30.049us         2.12%      30.049us       5.008us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        16.20%     229.735us        16.20%     229.735us      38.289us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.63%      65.693us         5.97%      84.623us       3.526us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.33%      18.930us         1.33%      18.930us       0.789us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        15.86%     224.896us        15.86%     224.896us       4.685us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.40%       5.729us         0.40%       5.729us       5.729us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     903.548us       729.80%     903.548us     903.548us             1  
+                                            torch_eager        10.56%     280.674us        99.81%       2.652ms       2.652ms       0.000us         0.00%     125.567us     125.567us             1  
+                                              aten::mul         5.49%     145.805us         9.46%     251.467us      10.478us      65.184us        52.65%      65.184us       2.716us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      65.184us        52.65%      65.184us       2.716us            24  
+                                            aten::copy_         3.75%      99.563us        70.08%       1.862ms     103.468us      39.422us        31.84%      41.182us       2.288us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.928us        23.37%      28.928us       2.411us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.201us        15.51%      19.201us       1.600us            12  
+                                            aten::clone         0.92%      24.379us        67.48%       1.793ms     298.872us       0.000us         0.00%      12.254us       2.042us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.494us         8.48%      10.494us       1.749us             6  
+                                              aten::add         1.15%      30.622us         1.96%      52.162us       8.694us       9.633us         7.78%       9.633us       1.606us             6  
+                                              aten::sub         1.45%      38.422us         2.36%      62.661us      10.443us       9.568us         7.73%       9.568us       1.595us             6  
+                                Activity Buffer Request        54.94%       1.460ms        54.94%       1.460ms       1.460ms       1.760us         1.42%       1.760us       1.760us             1  
+                                    aten::empty_strided         1.16%      30.801us         1.16%      30.801us       5.133us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         9.14%     242.866us         9.14%     242.866us      40.478us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.56%      67.990us         3.30%      87.783us       3.658us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.74%      19.793us         0.74%      19.793us       0.825us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         7.96%     211.432us         7.96%     211.432us       4.405us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.19%       5.160us         0.19%       5.160us       5.160us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.418ms
-Self CUDA time total: 123.393us
+Self CPU time total: 2.658ms
+Self CUDA time total: 123.807us
 
 
 
@@ -4479,27 +4487,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S512_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     964.101us       931.86%     964.101us     964.101us             1  
-                                            torch_eager        11.58%     311.269us        99.80%       2.682ms       2.682ms       0.000us         0.00%     104.772us     104.772us             1  
-                                              aten::mul         5.74%     154.165us         9.94%     267.067us      11.128us      55.236us        53.39%      55.236us       2.301us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      55.236us        53.39%      55.236us       2.301us            24  
-                                            aten::copy_         4.07%     109.351us        68.30%       1.836ms     101.989us      32.287us        31.21%      33.599us       1.867us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.511us        23.69%      24.511us       2.043us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      15.937us        15.40%      15.937us       1.328us            12  
-                                            aten::clone         1.02%      27.532us        65.06%       1.749ms     291.482us       0.000us         0.00%       9.088us       1.515us             6  
-                                              aten::add         1.31%      35.310us         2.20%      59.141us       9.857us       7.969us         7.70%       7.969us       1.328us             6  
-                                              aten::sub         1.38%      37.131us         2.33%      62.602us      10.434us       7.968us         7.70%       7.968us       1.328us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.776us         7.52%       7.776us       1.296us             6  
-                                Activity Buffer Request        53.54%       1.439ms        53.54%       1.439ms       1.439ms       1.312us         1.27%       1.312us       1.312us             1  
-                                    aten::empty_strided         1.12%      30.190us         1.12%      30.190us       5.032us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         8.09%     217.335us         8.09%     217.335us      36.223us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.62%      70.291us         3.31%      88.901us       3.704us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.69%      18.610us         0.69%      18.610us       0.775us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.64%     232.137us         8.64%     232.137us       4.836us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.20%       5.481us         0.20%       5.481us       5.481us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     889.436us       855.74%     889.436us     889.436us             1  
+                                            torch_eager        19.42%     274.045us        99.59%       1.406ms       1.406ms       0.000us         0.00%     105.282us     105.282us             1  
+                                              aten::mul        10.41%     146.921us        18.18%     256.563us      10.690us      55.486us        53.38%      55.486us       2.312us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      55.486us        53.38%      55.486us       2.312us            24  
+                                            aten::copy_         6.82%      96.302us        44.56%     628.895us      34.939us      32.513us        31.28%      33.857us       1.881us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.705us        23.77%      24.705us       2.059us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      15.939us        15.34%      15.939us       1.328us            12  
+                                            aten::clone         1.41%      19.928us        39.46%     556.871us      92.812us       0.000us         0.00%       9.152us       1.525us             6  
+                                              aten::sub         2.56%      36.082us         4.16%      58.744us       9.791us       7.970us         7.67%       7.970us       1.328us             6  
+                                              aten::add         2.23%      31.511us         3.85%      54.282us       9.047us       7.969us         7.67%       7.969us       1.328us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.808us         7.51%       7.808us       1.301us             6  
+                                Activity Buffer Request        15.99%     225.676us        15.99%     225.676us     225.676us       1.344us         1.29%       1.344us       1.344us             1  
+                                    aten::empty_strided         2.17%      30.631us         2.17%      30.631us       5.105us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        17.52%     247.335us        17.52%     247.335us      41.223us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.52%      63.850us         5.84%      82.475us       3.436us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.32%      18.625us         1.32%      18.625us       0.776us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        15.21%     214.657us        15.21%     214.657us       4.472us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.41%       5.810us         0.41%       5.810us       5.810us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.688ms
-Self CUDA time total: 103.460us
+Self CPU time total: 1.411ms
+Self CUDA time total: 103.938us
 
 
 
@@ -4509,27 +4517,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S512_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     964.072us       780.68%     964.072us     964.072us             1  
-                                            torch_eager        11.45%     316.268us        99.81%       2.758ms       2.758ms       0.000us         0.00%     125.283us     125.283us             1  
-                                              aten::mul         5.46%     150.776us         9.46%     261.336us      10.889us      65.090us        52.71%      65.090us       2.712us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      65.090us        52.71%      65.090us       2.712us            24  
-                                            aten::copy_         3.85%     106.511us        68.83%       1.902ms     105.647us      39.266us        31.80%      41.058us       2.281us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.802us        23.32%      28.802us       2.400us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.135us        15.50%      19.135us       1.595us            12  
-                                            aten::clone         1.09%      30.231us        66.11%       1.827ms     304.441us       0.000us         0.00%      12.256us       2.043us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.464us         8.47%      10.464us       1.744us             6  
-                                              aten::add         1.22%      33.650us         2.08%      57.431us       9.572us       9.599us         7.77%       9.599us       1.600us             6  
-                                              aten::sub         1.35%      37.292us         2.48%      68.652us      11.442us       9.536us         7.72%       9.536us       1.589us             6  
-                                Activity Buffer Request        54.53%       1.507ms        54.53%       1.507ms       1.507ms       1.792us         1.45%       1.792us       1.792us             1  
-                                    aten::empty_strided         1.19%      32.821us         1.19%      32.821us       5.470us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         8.01%     221.424us         8.01%     221.424us      36.904us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.55%      70.592us         3.23%      89.363us       3.723us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.68%      18.771us         0.68%      18.771us       0.782us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.42%     232.664us         8.42%     232.664us       4.847us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.19%       5.190us         0.19%       5.190us       5.190us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     888.135us       717.15%     888.135us     888.135us             1  
+                                            torch_eager        18.91%     268.465us        99.65%       1.415ms       1.415ms       0.000us         0.00%     125.666us     125.666us             1  
+                                              aten::mul        10.15%     144.114us        17.70%     251.265us      10.469us      65.346us        52.77%      65.346us       2.723us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      65.346us        52.77%      65.346us       2.723us            24  
+                                            aten::copy_         6.90%      97.992us        45.41%     644.725us      35.818us      39.328us        31.76%      41.152us       2.286us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.800us        23.26%      28.800us       2.400us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.168us        15.48%      19.168us       1.597us            12  
+                                            aten::clone         1.46%      20.690us        40.33%     572.532us      95.422us       0.000us         0.00%      12.352us       2.059us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.528us         8.50%      10.528us       1.755us             6  
+                                              aten::add         2.19%      31.029us         3.69%      52.390us       8.732us       9.600us         7.75%       9.600us       1.600us             6  
+                                              aten::sub         2.50%      35.469us         4.13%      58.580us       9.763us       9.568us         7.73%       9.568us       1.595us             6  
+                                Activity Buffer Request        15.69%     222.765us        15.69%     222.765us     222.765us       1.824us         1.47%       1.824us       1.824us             1  
+                                    aten::empty_strided         2.29%      32.500us         2.29%      32.500us       5.417us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        18.50%     262.716us        18.50%     262.716us      43.786us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.70%      66.710us         6.07%      86.108us       3.588us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.37%      19.398us         1.37%      19.398us       0.808us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        14.99%     212.875us        14.99%     212.875us       4.435us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.35%       5.010us         0.35%       5.010us       5.010us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.763ms
-Self CUDA time total: 123.491us
+Self CPU time total: 1.420ms
+Self CUDA time total: 123.842us
 
 
 
@@ -4539,27 +4547,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S512_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     934.855us       527.63%     934.855us     934.855us             1  
-                                            torch_eager        19.51%     283.728us        99.66%       1.450ms       1.450ms       0.000us         0.00%     180.061us     180.061us             1  
-                                              aten::mul        10.43%     151.748us        18.10%     263.338us      10.972us      95.007us        53.62%      95.007us       3.959us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      95.007us        53.62%      95.007us       3.959us            24  
-                                            aten::copy_         7.11%     103.461us        44.35%     645.065us      35.837us      57.664us        32.55%      60.544us       3.364us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.608us        22.92%      40.608us       3.384us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.510us        13.83%      24.510us       2.042us            12  
-                                            aten::clone         1.46%      21.280us        38.39%     558.424us      93.071us       0.000us         0.00%      19.936us       3.323us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.056us         9.63%      17.056us       2.843us             6  
-                                              aten::add         2.36%      34.271us         3.99%      58.001us       9.667us      12.287us         6.93%      12.287us       2.048us             6  
-                                              aten::sub         2.55%      37.161us         4.24%      61.641us      10.274us      12.223us         6.90%      12.223us       2.037us             6  
-                                Activity Buffer Request        17.53%     255.006us        17.53%     255.006us     255.006us       2.880us         1.63%       2.880us       2.880us             1  
-                                    aten::empty_strided         2.02%      29.311us         2.02%      29.311us       4.885us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        15.21%     221.267us        15.21%     221.267us      36.878us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.73%      68.750us         6.01%      87.372us       3.641us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.28%      18.622us         1.28%      18.622us       0.776us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        15.48%     225.131us        15.48%     225.131us       4.690us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.34%       4.880us         0.34%       4.880us       4.880us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     910.045us       513.35%     910.045us     910.045us             1  
+                                            torch_eager         9.66%     280.213us        99.83%       2.894ms       2.894ms       0.000us         0.00%     180.188us     180.188us             1  
+                                              aten::mul         5.18%     150.102us         9.00%     260.863us      10.869us      94.655us        53.39%      94.655us       3.944us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      94.655us        53.39%      94.655us       3.944us            24  
+                                            aten::copy_         3.40%      98.673us        72.45%       2.101ms     116.706us      57.885us        32.65%      60.797us       3.378us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.799us        23.01%      40.799us       3.400us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.736us        13.95%      24.736us       2.061us            12  
+                                            aten::clone         0.79%      22.860us        70.00%       2.030ms     338.262us       0.000us         0.00%      19.998us       3.333us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.086us         9.64%      17.086us       2.848us             6  
+                                              aten::add         1.13%      32.880us         1.89%      54.761us       9.127us      12.416us         7.00%      12.416us       2.069us             6  
+                                              aten::sub         1.18%      34.239us         1.98%      57.551us       9.592us      12.320us         6.95%      12.320us       2.053us             6  
+                                Activity Buffer Request        58.76%       1.704ms        58.76%       1.704ms       1.704ms       2.912us         1.64%       2.912us       2.912us             1  
+                                    aten::empty_strided         1.11%      32.150us         1.11%      32.150us       5.358us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         8.21%     238.144us         8.21%     238.144us      39.691us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.29%      66.481us         2.94%      85.213us       3.551us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.65%      18.732us         0.65%      18.732us       0.781us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         7.46%     216.224us         7.46%     216.224us       4.505us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.17%       5.070us         0.17%       5.070us       5.070us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.455ms
-Self CUDA time total: 177.181us
+Self CPU time total: 2.899ms
+Self CUDA time total: 177.276us
 
 
 
@@ -4569,27 +4577,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S512_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     936.902us       314.34%     936.902us     936.902us             1  
-                                            torch_eager        19.95%     279.505us        99.63%       1.396ms       1.396ms       0.000us         0.00%     315.267us     315.267us             1  
-                                              aten::mul        10.85%     152.079us        18.94%     265.395us      11.058us     146.176us        49.04%     146.176us       6.091us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     146.176us        49.04%     146.176us       6.091us            24  
-                                            aten::copy_         7.66%     107.385us        42.60%     596.937us      33.163us     110.978us        37.23%     128.194us       7.122us            18  
-                                            aten::clone         1.45%      20.319us        36.31%     508.783us      84.797us       0.000us         0.00%      70.625us      11.771us             6  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.569us        19.32%      57.569us       4.797us            12  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      53.409us        17.92%      53.409us       8.902us             6  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      40.897us        13.72%      40.897us       3.408us            12  
-                                              aten::sub         2.61%      36.531us         4.38%      61.402us      10.234us      20.449us         6.86%      20.449us       3.408us             6  
-                                              aten::add         2.39%      33.533us         3.98%      55.753us       9.292us      20.448us         6.86%      20.448us       3.408us             6  
-                                Activity Buffer Request        14.75%     206.705us        14.75%     206.705us     206.705us      17.216us         5.78%      17.216us      17.216us             1  
-                                    aten::empty_strided         2.13%      29.842us         2.13%      29.842us       4.974us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        15.44%     216.385us        15.44%     216.385us      36.064us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.91%      68.874us         6.21%      87.042us       3.627us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.30%      18.168us         1.30%      18.168us       0.757us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        16.19%     226.869us        16.19%     226.869us       4.726us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.37%       5.161us         0.37%       5.161us       5.161us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     919.612us       310.44%     919.612us     919.612us             1  
+                                            torch_eager        10.49%     286.464us        99.82%       2.726ms       2.726ms       0.000us         0.00%     313.057us     313.057us             1  
+                                              aten::mul         5.34%     145.716us         9.29%     253.789us      10.575us     145.182us        49.01%     145.182us       6.049us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     145.182us        49.01%     145.182us       6.049us            24  
+                                            aten::copy_         3.69%     100.696us        70.60%       1.928ms     107.115us     109.985us        37.13%     126.817us       7.045us            18  
+                                            aten::clone         0.88%      23.951us        68.02%       1.858ms     309.597us       0.000us         0.00%      69.474us      11.579us             6  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.343us        19.36%      57.343us       4.779us            12  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      52.642us        17.77%      52.642us       8.774us             6  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      41.058us        13.86%      41.058us       3.421us            12  
+                                              aten::sub         1.33%      36.191us         2.18%      59.621us       9.937us      20.609us         6.96%      20.609us       3.435us             6  
+                                              aten::add         1.14%      31.230us         1.95%      53.190us       8.865us      20.449us         6.90%      20.449us       3.408us             6  
+                                Activity Buffer Request        56.07%       1.531ms        56.07%       1.531ms       1.531ms      16.832us         5.68%      16.832us      16.832us             1  
+                                    aten::empty_strided         1.17%      32.070us         1.17%      32.070us       5.345us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         8.59%     234.696us         8.59%     234.696us      39.116us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.53%      69.062us         3.26%      88.922us       3.705us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.73%      19.860us         0.73%      19.860us       0.827us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         7.86%     214.752us         7.86%     214.752us       4.474us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.18%       4.930us         0.18%       4.930us       4.930us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.401ms
-Self CUDA time total: 298.051us
+Self CPU time total: 2.731ms
+Self CUDA time total: 296.225us
 
 
 
@@ -4599,27 +4607,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S2048_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     953.069us       538.57%     953.069us     953.069us             1  
-                                            torch_eager        19.36%     280.983us        99.62%       1.446ms       1.446ms       0.000us         0.00%     179.812us     179.812us             1  
-                                              aten::mul        10.74%     155.876us        18.65%     270.688us      11.279us      94.916us        53.64%      94.916us       3.955us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      94.916us        53.64%      94.916us       3.955us            24  
-                                            aten::copy_         7.70%     111.823us        43.62%     633.117us      35.173us      57.568us        32.53%      60.416us       3.356us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.544us        22.91%      40.544us       3.379us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.480us        13.83%      24.480us       2.040us            12  
-                                            aten::clone         1.50%      21.731us        37.58%     545.384us      90.897us       0.000us         0.00%      19.872us       3.312us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.024us         9.62%      17.024us       2.837us             6  
-                                              aten::add         2.38%      34.509us         4.05%      58.781us       9.797us      12.256us         6.93%      12.256us       2.043us             6  
-                                              aten::sub         2.51%      36.442us         4.13%      59.923us       9.987us      12.224us         6.91%      12.224us       2.037us             6  
-                                Activity Buffer Request        15.40%     223.485us        15.40%     223.485us     223.485us       2.848us         1.61%       2.848us       2.848us             1  
-                                    aten::empty_strided         2.13%      30.930us         2.13%      30.930us       5.155us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        15.79%     229.197us        15.79%     229.197us      38.200us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.88%      70.882us         6.18%      89.652us       3.735us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.29%      18.770us         1.29%      18.770us       0.782us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        15.93%     231.177us        15.93%     231.177us       4.816us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.38%       5.510us         0.38%       5.510us       5.510us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     889.394us       501.79%     889.394us     889.394us             1  
+                                            torch_eager        17.97%     266.975us        99.65%       1.481ms       1.481ms       0.000us         0.00%     180.092us     180.092us             1  
+                                              aten::mul         9.80%     145.611us        16.96%     251.937us      10.497us      94.974us        53.58%      94.974us       3.957us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      94.974us        53.58%      94.974us       3.957us            24  
+                                            aten::copy_         6.75%     100.282us        47.98%     712.837us      39.602us      57.694us        32.55%      60.542us       3.363us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.736us        22.98%      40.736us       3.395us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.576us        13.87%      24.576us       2.048us            12  
+                                            aten::clone         1.38%      20.549us        43.06%     639.725us     106.621us       0.000us         0.00%      19.806us       3.301us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.958us         9.57%      16.958us       2.826us             6  
+                                              aten::sub         2.49%      37.040us         4.14%      61.531us      10.255us      12.289us         6.93%      12.289us       2.048us             6  
+                                              aten::add         2.11%      31.282us         3.59%      53.402us       8.900us      12.287us         6.93%      12.287us       2.048us             6  
+                                Activity Buffer Request        19.87%     295.257us        19.87%     295.257us     295.257us       2.848us         1.61%       2.848us       2.848us             1  
+                                    aten::empty_strided         2.04%      30.372us         2.04%      30.372us       5.062us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        17.34%     257.637us        17.34%     257.637us      42.940us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.31%      64.000us         5.58%      82.951us       3.456us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.28%      18.951us         1.28%      18.951us       0.790us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        14.31%     212.598us        14.31%     212.598us       4.429us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.35%       5.130us         0.35%       5.130us       5.130us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.451ms
-Self CUDA time total: 176.964us
+Self CPU time total: 1.486ms
+Self CUDA time total: 177.244us
 
 
 
@@ -4629,27 +4637,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S2048_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     992.756us       332.77%     992.756us     992.756us             1  
-                                            torch_eager        20.12%     289.006us        99.66%       1.432ms       1.432ms       0.000us         0.00%     316.222us     316.222us             1  
-                                              aten::mul        11.31%     162.528us        19.47%     279.759us      11.657us     146.880us        49.23%     146.880us       6.120us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     146.880us        49.23%     146.880us       6.120us            24  
-                                            aten::copy_         7.73%     111.012us        41.48%     595.895us      33.105us     110.942us        37.19%     128.830us       7.157us            18  
-                                            aten::clone         1.55%      22.310us        35.21%     505.793us      84.299us       0.000us         0.00%      71.424us      11.904us             6  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.406us        19.24%      57.406us       4.784us            12  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      53.536us        17.94%      53.536us       8.923us             6  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      40.512us        13.58%      40.512us       3.376us            12  
-                                              aten::add         2.53%      36.289us         4.25%      61.011us      10.169us      20.352us         6.82%      20.352us       3.392us             6  
-                                              aten::sub         2.59%      37.162us         4.41%      63.291us      10.549us      20.160us         6.76%      20.160us       3.360us             6  
-                                Activity Buffer Request        13.10%     188.164us        13.10%     188.164us     188.164us      17.888us         6.00%      17.888us      17.888us             1  
-                                    aten::empty_strided         2.24%      32.121us         2.24%      32.121us       5.354us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        15.74%     226.067us        15.74%     226.067us      37.678us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.81%      69.111us         6.15%      88.363us       3.682us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.34%      19.252us         1.34%      19.252us       0.802us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        16.62%     238.734us        16.62%     238.734us       4.974us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.34%       4.940us         0.34%       4.940us       4.940us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     909.016us       306.24%     909.016us     909.016us             1  
+                                            torch_eager        19.05%     269.264us        99.66%       1.409ms       1.409ms       0.000us         0.00%     314.684us     314.684us             1  
+                                              aten::mul        10.56%     149.323us        19.02%     268.875us      11.203us     145.440us        49.00%     145.440us       6.060us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     145.440us        49.00%     145.440us       6.060us            24  
+                                            aten::copy_         6.96%      98.305us        44.09%     623.125us      34.618us     110.751us        37.31%     128.606us       7.145us            18  
+                                            aten::clone         1.45%      20.520us        38.80%     548.422us      91.404us       0.000us         0.00%      71.453us      11.909us             6  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.153us        19.25%      57.153us       4.763us            12  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      53.598us        18.06%      53.598us       8.933us             6  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      40.638us        13.69%      40.638us       3.387us            12  
+                                              aten::add         2.27%      32.070us         3.85%      54.390us       9.065us      20.352us         6.86%      20.352us       3.392us             6  
+                                              aten::sub         2.35%      33.277us         4.05%      57.282us       9.547us      20.286us         6.83%      20.286us       3.381us             6  
+                                Activity Buffer Request        15.96%     225.655us        15.96%     225.655us     225.655us      17.855us         6.02%      17.855us      17.855us             1  
+                                    aten::empty_strided         2.15%      30.350us         2.15%      30.350us       5.058us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        16.79%     237.294us        16.79%     237.294us      39.549us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.69%      66.249us         6.00%      84.797us       3.533us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.31%      18.548us         1.31%      18.548us       0.773us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        16.11%     227.748us        16.11%     227.748us       4.745us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.34%       4.840us         0.34%       4.840us       4.840us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.437ms
-Self CUDA time total: 298.334us
+Self CPU time total: 1.413ms
+Self CUDA time total: 296.829us
 
 
 
@@ -4659,27 +4667,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S2048_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     957.657us       163.29%     957.657us     957.657us             1  
-                                            torch_eager        20.09%     288.813us        99.63%       1.432ms       1.432ms       0.000us         0.00%     610.425us     610.425us             1  
-                                            aten::copy_         7.31%     105.011us        42.63%     612.724us      34.040us     268.572us        45.79%     292.508us      16.250us            18  
-                                              aten::mul        10.71%     153.870us        18.84%     270.776us      11.282us     252.607us        43.07%     252.607us      10.525us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     252.607us        43.07%     252.607us      10.525us            24  
-                                            aten::clone         1.42%      20.480us        36.58%     525.692us      87.615us       0.000us         0.00%     201.566us      33.594us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     177.630us        30.29%     177.630us      29.605us             6  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      90.942us        15.51%      90.942us       7.578us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      65.310us        11.14%      65.310us       5.443us            12  
-                                              aten::sub         2.69%      38.720us         4.45%      63.991us      10.665us      32.991us         5.63%      32.991us       5.499us             6  
-                                              aten::add         2.37%      34.041us         3.93%      56.461us       9.410us      32.319us         5.51%      32.319us       5.387us             6  
-                                Activity Buffer Request        15.99%     229.866us        15.99%     229.866us     229.866us      23.936us         4.08%      23.936us      23.936us             1  
-                                    aten::empty_strided         2.02%      29.010us         2.02%      29.010us       4.835us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        14.72%     211.585us        14.72%     211.585us      35.264us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.83%      69.478us         6.24%      89.671us       3.736us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.40%      20.193us         1.40%      20.193us       0.841us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        16.06%     230.859us        16.06%     230.859us       4.810us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.37%       5.320us         0.37%       5.320us       5.320us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     916.757us       157.09%     916.757us     916.757us             1  
+                                            torch_eager        19.46%     274.242us        99.65%       1.404ms       1.404ms       0.000us         0.00%     607.350us     607.350us             1  
+                                            aten::copy_         7.01%      98.793us        43.42%     611.905us      33.995us     268.603us        46.03%     292.379us      16.243us            18  
+                                              aten::mul        10.57%     148.926us        18.84%     265.480us      11.062us     249.086us        42.68%     249.086us      10.379us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     249.086us        42.68%     249.086us      10.379us            24  
+                                            aten::clone         1.44%      20.340us        38.12%     537.253us      89.542us       0.000us         0.00%     202.173us      33.696us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     178.397us        30.57%     178.397us      29.733us             6  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      90.206us        15.46%      90.206us       7.517us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      65.885us        11.29%      65.885us       5.490us            12  
+                                              aten::sub         2.63%      37.022us         4.37%      61.602us      10.267us      33.151us         5.68%      33.151us       5.525us             6  
+                                              aten::add         2.33%      32.810us         3.92%      55.180us       9.197us      32.734us         5.61%      32.734us       5.456us             6  
+                                Activity Buffer Request        15.58%     219.605us        15.58%     219.605us     219.605us      23.776us         4.07%      23.776us      23.776us             1  
+                                    aten::empty_strided         2.10%      29.631us         2.10%      29.631us       4.938us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        16.49%     232.396us        16.49%     232.396us      38.733us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.73%      66.612us         6.10%      85.953us       3.581us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.37%      19.341us         1.37%      19.341us       0.806us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        15.94%     224.615us        15.94%     224.615us       4.679us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.35%       4.910us         0.35%       4.910us       4.910us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.437ms
-Self CUDA time total: 586.489us
+Self CPU time total: 1.409ms
+Self CUDA time total: 583.574us
 
 
 
@@ -4689,55 +4697,61 @@ PROFILE TRACE: torch_eager | cuda_B2_S2048_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         9.43%     329.378us        77.87%       2.720ms       2.720ms       0.000us         0.00%       1.842ms       1.842ms             1  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.815ms       102.19%       1.815ms       1.815ms             1  
-                                            aten::copy_         3.09%     107.951us        52.68%       1.840ms     102.235us     794.051us        44.71%     860.068us      47.782us            18  
-                                              aten::mul         4.59%     160.365us         8.02%     279.997us      11.667us     834.368us        46.99%     834.368us      34.765us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     834.368us        46.99%     834.368us      34.765us            24  
-                                            aten::clone         0.80%      28.034us        50.14%       1.751ms     291.882us       0.000us         0.00%     627.394us     104.566us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     561.377us        31.61%     561.377us      93.563us             6  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     232.674us        13.10%     232.674us      19.389us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     147.392us         8.30%     147.392us      12.283us            12  
-                                              aten::sub         1.14%      39.970us         1.89%      66.170us      11.028us      89.952us         5.07%      89.952us      14.992us             6  
-                                Activity Buffer Request        41.31%       1.443ms        41.31%       1.443ms       1.443ms      66.017us         3.72%      66.017us      66.017us             1  
-                                              aten::add         0.95%      33.281us         1.61%      56.271us       9.379us      57.440us         3.23%      57.440us       9.573us             6  
-                                    aten::empty_strided         0.85%      29.670us         0.85%      29.670us       4.945us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         6.22%     217.146us         6.22%     217.146us      36.191us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.01%      70.292us         2.58%      90.182us       3.758us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.57%      19.890us         0.57%      19.890us       0.829us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         6.90%     240.975us         6.90%     240.975us       5.020us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize        22.13%     773.090us        22.13%     773.090us     773.090us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager        12.10%     272.127us        61.47%       1.382ms       1.382ms       0.000us         0.00%       1.837ms       1.837ms             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.810ms       102.21%       1.810ms       1.810ms             1  
+                                            aten::copy_         4.74%     106.692us        27.02%     607.756us      33.764us     794.110us        44.84%     859.966us      47.776us            18  
+                                              aten::mul         6.35%     142.895us        11.18%     251.386us      10.474us     829.085us        46.82%     829.085us      34.545us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     829.085us        46.82%     829.085us      34.545us            24  
+                                            aten::clone         0.94%      21.099us        23.42%     526.743us      87.790us       0.000us         0.00%     627.678us     104.613us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     561.822us        31.73%     561.822us      93.637us             6  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     232.288us        13.12%     232.288us      19.357us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     147.650us         8.34%     147.650us      12.304us            12  
+                                              aten::sub         1.58%      35.541us         2.61%      58.661us       9.777us      89.538us         5.06%      89.538us      14.923us             6  
+                                Activity Buffer Request         9.29%     208.845us         9.29%     208.845us     208.845us      65.856us         3.72%      65.856us      65.856us             1  
+                                              aten::add         1.43%      32.251us         2.42%      54.461us       9.077us      58.112us         3.28%      58.112us       9.685us             6  
+                                    aten::empty_strided         1.39%      31.342us         1.39%      31.342us       5.224us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        10.27%     230.957us        10.27%     230.957us      38.493us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.99%      67.270us         3.80%      85.550us       3.565us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.81%      18.280us         0.81%      18.280us       0.762us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         9.56%     215.083us         9.56%     215.083us       4.481us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize        38.53%     866.589us        38.53%     866.589us     866.589us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 3.493ms
-Self CUDA time total: 1.776ms
+Self CPU time total: 2.249ms
+Self CUDA time total: 1.771ms
 
 
 impl                     wl                  p50(ms)  ok
-torch_eager              cuda_B1_S128_H32_D128_R64     0.22  True
+torch_eager              cuda_B1_S128_H32_D128_R64     0.21  True
 torch_eager              cuda_B1_S128_H32_D64_R32     0.22  True
-torch_eager              cuda_B1_S128_H8_D128_R64     0.23  True
-torch_eager              cuda_B1_S128_H8_D64_R32     0.18  True
-torch_eager              cuda_B1_S2048_H32_D128_R64     0.23  True
-torch_eager              cuda_B1_S2048_H32_D64_R32     0.22  True
-torch_eager              cuda_B1_S2048_H8_D128_R64     0.22  True
-torch_eager              cuda_B1_S2048_H8_D64_R32     0.22  True
-torch_eager              cuda_B1_S512_H32_D128_R64     0.22  True
-torch_eager              cuda_B1_S512_H32_D64_R32     0.22  True
-torch_eager              cuda_B1_S512_H8_D128_R64     0.22  True
+torch_eager              cuda_B1_S128_H8_D128_R64     0.22  True
+torch_eager              cuda_B1_S128_H8_D64_R32     0.17  True
+torch_eager              cuda_B1_S2048_H32_D128_R64     0.22  True
+torch_eager              cuda_B1_S2048_H32_D64_R32     0.21  True
+torch_eager              cuda_B1_S2048_H8_D128_R64     0.21  True
+torch_eager              cuda_B1_S2048_H8_D64_R32     0.21  True
+torch_eager              cuda_B1_S512_H32_D128_R64     0.21  True
+torch_eager              cuda_B1_S512_H32_D64_R32     0.21  True
+torch_eager              cuda_B1_S512_H8_D128_R64     0.21  True
 torch_eager              cuda_B1_S512_H8_D64_R32     0.22  True
-torch_eager              cuda_B2_S128_H32_D128_R64     0.22  True
-torch_eager              cuda_B2_S128_H32_D64_R32     0.22  True
-torch_eager              cuda_B2_S128_H8_D128_R64     0.22  True
-torch_eager              cuda_B2_S128_H8_D64_R32     0.22  True
+torch_eager              cuda_B2_S128_H32_D128_R64     0.21  True
+torch_eager              cuda_B2_S128_H32_D64_R32     0.21  True
+torch_eager              cuda_B2_S128_H8_D128_R64     0.21  True
+torch_eager              cuda_B2_S128_H8_D64_R32     0.21  True
 torch_eager              cuda_B2_S2048_H32_D128_R64     0.64  True
 torch_eager              cuda_B2_S2048_H32_D64_R32     0.23  True
 torch_eager              cuda_B2_S2048_H8_D128_R64     0.22  True
-torch_eager              cuda_B2_S2048_H8_D64_R32     0.22  True
-torch_eager              cuda_B2_S512_H32_D128_R64     0.22  True
-torch_eager              cuda_B2_S512_H32_D64_R32     0.22  True
+torch_eager              cuda_B2_S2048_H8_D64_R32     0.21  True
+torch_eager              cuda_B2_S512_H32_D128_R64     0.21  True
+torch_eager              cuda_B2_S512_H32_D64_R32     0.21  True
 torch_eager              cuda_B2_S512_H8_D128_R64     0.21  True
-torch_eager              cuda_B2_S512_H8_D64_R32     0.22  True
+torch_eager              cuda_B2_S512_H8_D64_R32     0.21  True
 
+
+
▶ UV Install Logs
+ +

Artifacts:

rotary.jsonl