diff --git "a/rotary/impls/torch_rotary.html" "b/rotary/impls/torch_rotary.html" --- "a/rotary/impls/torch_rotary.html" +++ "b/rotary/impls/torch_rotary.html" @@ -3904,7 +3904,7 @@ Cell: nv | 0.25s
-
Fri Dec 19 19:54:55 2025       
+
Fri Dec 19 23:00:45 2025       
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 580.105.08             Driver Version: 580.105.08     CUDA Version: 13.0     |
 +-----------------------------------------+------------------------+----------------------+
@@ -3913,7 +3913,7 @@ Cell: nv | 0.25s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   36C    P0             90W /  350W |       0MiB /  46068MiB |     17%      Default |
+| N/A   40C    P0             85W /  350W |       0MiB /  46068MiB |     24%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -3937,7 +3937,7 @@ Cell: nv | 0.25s
 ▼ output
  ▶ uv-logs
  | 
-Cell: benchmark | 7.93s
+Cell: benchmark | 7.89s
  | 
 
 Raw
@@ -4016,27 +4016,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S128_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.121ms      1256.65%       1.121ms       1.121ms             1  
-                                            torch_eager        12.91%     396.632us        99.49%       3.057ms       3.057ms       0.000us         0.00%      90.428us      90.428us             1  
-                                              aten::mul         6.11%     187.745us        10.64%     326.977us      13.624us      47.168us        52.87%      47.168us       1.965us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      47.168us        52.87%      47.168us       1.965us            24  
-                                            aten::copy_         3.73%     114.507us        64.51%       1.982ms     110.114us      28.957us        32.46%      30.173us       1.676us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.335us        25.04%      22.335us       1.861us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.087us        14.67%      13.087us       1.091us            12  
-                                            aten::clone         1.26%      38.732us        62.92%       1.933ms     322.183us       0.000us         0.00%       7.838us       1.306us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.622us         7.42%       6.622us       1.104us             6  
-                                              aten::sub         1.53%      47.130us         2.58%      79.281us      13.214us       6.559us         7.35%       6.559us       1.093us             6  
-                                              aten::add         1.27%      38.901us         2.21%      67.791us      11.299us       6.528us         7.32%       6.528us       1.088us             6  
-                                Activity Buffer Request        55.82%       1.715ms        55.82%       1.715ms       1.715ms       1.216us         1.36%       1.216us       1.216us             1  
-                                    aten::empty_strided         2.00%      61.422us         2.00%      61.422us      10.237us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         2.49%      76.434us         2.49%      76.434us      12.739us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.64%      81.233us         3.38%     103.763us       4.323us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.73%      22.530us         0.73%      22.530us       0.939us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         9.00%     276.368us         9.00%     276.368us       5.758us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.51%      15.650us         0.51%      15.650us      15.650us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.009ms      1132.93%       1.009ms       1.009ms             1  
+                                            torch_eager        12.41%     369.428us        99.52%       2.961ms       2.961ms       0.000us         0.00%      90.371us      90.371us             1  
+                                              aten::mul         5.71%     169.883us         9.58%     285.017us      11.876us      47.039us        52.80%      47.039us       1.960us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      47.039us        52.80%      47.039us       1.960us            24  
+                                            aten::copy_         3.61%     107.364us        66.50%       1.979ms     109.941us      28.963us        32.51%      30.243us       1.680us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.243us        24.97%      22.243us       1.854us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.089us        14.69%      13.089us       1.091us            12  
+                                            aten::clone         1.40%      41.760us        65.34%       1.944ms     324.049us       0.000us         0.00%       8.000us       1.333us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.720us         7.54%       6.720us       1.120us             6  
+                                              aten::sub         1.56%      46.400us         2.42%      72.061us      12.010us       6.561us         7.36%       6.561us       1.094us             6  
+                                              aten::add         1.27%      37.811us         2.01%      59.841us       9.973us       6.528us         7.33%       6.528us       1.088us             6  
+                                Activity Buffer Request        58.48%       1.740ms        58.48%       1.740ms       1.740ms       1.280us         1.44%       1.280us       1.280us             1  
+                                    aten::empty_strided         1.77%      52.541us         1.77%      52.541us       8.757us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         2.39%      71.252us         2.39%      71.252us      11.875us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.63%      78.291us         3.42%     101.892us       4.245us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.79%      23.601us         0.79%      23.601us       0.983us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         7.49%     222.745us         7.49%     222.745us       4.641us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.48%      14.390us         0.48%      14.390us      14.390us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 3.072ms
-Self CUDA time total: 89.212us
+Self CPU time total: 2.976ms
+Self CUDA time total: 89.091us
 
 
 
@@ -4046,27 +4046,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S128_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.007ms      1113.49%       1.007ms       1.007ms             1  
-                                            torch_eager        10.54%     298.489us        99.78%       2.827ms       2.827ms       0.000us         0.00%      91.582us      91.582us             1  
-                                              aten::mul         6.23%     176.544us        10.66%     301.947us      12.581us      47.549us        52.58%      47.549us       1.981us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      47.549us        52.58%      47.549us       1.981us            24  
-                                            aten::copy_         3.68%     104.131us        68.91%       1.952ms     108.446us      29.376us        32.49%      30.529us       1.696us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.591us        24.98%      22.591us       1.883us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.504us        14.93%      13.504us       1.125us            12  
-                                            aten::clone         0.77%      21.889us        65.76%       1.863ms     310.451us       0.000us         0.00%       7.938us       1.323us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.785us         7.50%       6.785us       1.131us             6  
-                                              aten::sub         1.41%      40.059us         2.39%      67.780us      11.297us       6.784us         7.50%       6.784us       1.131us             6  
-                                              aten::add         1.28%      36.151us         2.32%      65.853us      10.975us       6.720us         7.43%       6.720us       1.120us             6  
-                                Activity Buffer Request        60.61%       1.717ms        60.61%       1.717ms       1.717ms       1.153us         1.28%       1.153us       1.153us             1  
-                                    aten::empty_strided         1.12%      31.742us         1.12%      31.742us       5.290us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         2.04%      57.701us         2.04%      57.701us       9.617us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.45%      69.273us         3.06%      86.813us       3.617us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.62%      17.540us         0.62%      17.540us       0.731us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         9.05%     256.228us         9.05%     256.228us       5.338us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.22%       6.150us         0.22%       6.150us       6.150us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     914.978us      1014.29%     914.978us     914.978us             1  
+                                            torch_eager        10.68%     295.102us        99.80%       2.758ms       2.758ms       0.000us         0.00%      91.361us      91.361us             1  
+                                              aten::mul         5.65%     156.202us         9.56%     264.265us      11.011us      47.519us        52.68%      47.519us       1.980us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      47.519us        52.68%      47.519us       1.980us            24  
+                                            aten::copy_         3.57%      98.672us        70.68%       1.953ms     108.523us      29.314us        32.50%      30.466us       1.693us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.529us        24.97%      22.529us       1.877us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.376us        14.83%      13.376us       1.115us            12  
+                                            aten::clone         0.80%      22.200us        68.13%       1.883ms     313.805us       0.000us         0.00%       7.937us       1.323us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.785us         7.52%       6.785us       1.131us             6  
+                                              aten::add         1.12%      30.913us         1.88%      51.842us       8.640us       6.688us         7.41%       6.688us       1.115us             6  
+                                              aten::sub         1.32%      36.532us         2.17%      59.963us       9.994us       6.688us         7.41%       6.688us       1.115us             6  
+                                Activity Buffer Request        63.06%       1.743ms        63.06%       1.743ms       1.743ms       1.152us         1.28%       1.152us       1.152us             1  
+                                    aten::empty_strided         1.09%      30.200us         1.09%      30.200us       5.033us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         1.91%      52.752us         1.91%      52.752us       8.792us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.31%      63.802us         2.94%      81.195us       3.383us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.63%      17.393us         0.63%      17.393us       0.725us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         7.65%     211.533us         7.65%     211.533us       4.407us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.20%       5.491us         0.20%       5.491us       5.491us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.833ms
-Self CUDA time total: 90.429us
+Self CPU time total: 2.764ms
+Self CUDA time total: 90.209us
 
 
 
@@ -4076,27 +4076,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S128_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.006ms      1072.44%       1.006ms       1.006ms             1  
-                                            torch_eager        10.52%     298.395us        99.78%       2.831ms       2.831ms       0.000us         0.00%      95.037us      95.037us             1  
-                                              aten::mul         6.11%     173.334us        10.70%     303.588us      12.649us      48.703us        51.91%      48.703us       2.029us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      48.703us        51.91%      48.703us       2.029us            24  
-                                            aten::copy_         3.65%     103.459us        69.19%       1.963ms     109.044us      30.654us        32.67%      31.870us       1.771us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.975us        24.49%      22.975us       1.915us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.464us        15.42%      14.464us       1.205us            12  
-                                            aten::clone         0.76%      21.692us        65.97%       1.871ms     311.909us       0.000us         0.00%       8.895us       1.482us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.679us         8.18%       7.679us       1.280us             6  
-                                              aten::add         1.20%      34.161us         2.12%      60.242us      10.040us       7.233us         7.71%       7.233us       1.206us             6  
-                                              aten::sub         1.38%      39.061us         2.38%      67.492us      11.249us       7.231us         7.71%       7.231us       1.205us             6  
-                                Activity Buffer Request        60.90%       1.728ms        60.90%       1.728ms       1.728ms       1.216us         1.30%       1.216us       1.216us             1  
-                                    aten::empty_strided         1.08%      30.530us         1.08%      30.530us       5.088us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         2.06%      58.531us         2.06%      58.531us       9.755us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.41%      68.445us         3.03%      85.834us       3.576us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.61%      17.389us         0.61%      17.389us       0.725us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         9.10%     258.010us         9.10%     258.010us       5.375us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.22%       6.240us         0.22%       6.240us       6.240us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     896.092us       952.15%     896.092us     896.092us             1  
+                                            torch_eager        10.34%     284.358us        99.81%       2.744ms       2.744ms       0.000us         0.00%      95.424us      95.424us             1  
+                                              aten::mul         5.45%     149.791us         9.32%     256.084us      10.670us      48.736us        51.79%      48.736us       2.031us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      48.736us        51.79%      48.736us       2.031us            24  
+                                            aten::copy_         3.52%      96.699us        71.27%       1.959ms     108.841us      30.943us        32.88%      32.255us       1.792us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      23.071us        24.51%      23.071us       1.923us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.433us        15.34%      14.433us       1.203us            12  
+                                            aten::clone         0.75%      20.741us        68.58%       1.885ms     314.190us       0.000us         0.00%       9.184us       1.531us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.872us         8.36%       7.872us       1.312us             6  
+                                              aten::sub         1.32%      36.203us         2.15%      58.992us       9.832us       7.264us         7.72%       7.264us       1.211us             6  
+                                              aten::add         1.12%      30.822us         1.89%      51.942us       8.657us       7.169us         7.62%       7.169us       1.195us             6  
+                                Activity Buffer Request        63.70%       1.751ms        63.70%       1.751ms       1.751ms       1.312us         1.39%       1.312us       1.312us             1  
+                                    aten::empty_strided         1.08%      29.590us         1.08%      29.590us       4.932us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         1.85%      50.982us         1.85%      50.982us       8.497us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.36%      64.969us         3.01%      82.862us       3.453us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.65%      17.893us         0.65%      17.893us       0.746us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         7.66%     210.524us         7.66%     210.524us       4.386us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.19%       5.270us         0.19%       5.270us       5.270us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.837ms
-Self CUDA time total: 93.821us
+Self CPU time total: 2.749ms
+Self CUDA time total: 94.112us
 
 
 
@@ -4106,27 +4106,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S128_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.008ms       991.12%       1.008ms       1.008ms             1  
-                                            torch_eager        11.39%     291.220us        99.77%       2.552ms       2.552ms       0.000us         0.00%     103.069us     103.069us             1  
-                                              aten::mul         6.75%     172.580us        11.78%     301.232us      12.551us      52.797us        51.90%      52.797us       2.200us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      52.797us        51.90%      52.797us       2.200us            24  
-                                            aten::copy_         3.99%     102.084us        65.85%       1.684ms      93.565us      32.544us        31.99%      33.888us       1.883us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.736us        24.32%      24.736us       2.061us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      16.384us        16.11%      16.384us       1.365us            12  
-                                            aten::clone         0.85%      21.760us        62.39%       1.596ms     265.957us       0.000us         0.00%       9.152us       1.525us             6  
-                                              aten::sub         1.61%      41.170us         2.75%      70.342us      11.724us       8.256us         8.12%       8.256us       1.376us             6  
-                                              aten::add         1.37%      35.121us         2.54%      64.851us      10.809us       8.128us         7.99%       8.128us       1.355us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.808us         7.68%       7.808us       1.301us             6  
-                                Activity Buffer Request        49.27%       1.260ms        49.27%       1.260ms       1.260ms       1.344us         1.32%       1.344us       1.344us             1  
-                                    aten::empty_strided         1.23%      31.560us         1.23%      31.560us       5.260us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         9.74%     249.077us         9.74%     249.077us      41.513us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.69%      68.742us         3.39%      86.703us       3.613us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.70%      17.961us         0.70%      17.961us       0.748us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        10.18%     260.466us        10.18%     260.466us       5.426us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.23%       5.870us         0.23%       5.870us       5.870us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     914.202us       899.81%     914.202us     914.202us             1  
+                                            torch_eager         9.98%     278.657us        99.81%       2.786ms       2.786ms       0.000us         0.00%     102.911us     102.911us             1  
+                                              aten::mul         5.55%     155.014us         9.50%     265.176us      11.049us      52.799us        51.97%      52.799us       2.200us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      52.799us        51.97%      52.799us       2.200us            24  
+                                            aten::copy_         3.52%      98.370us        71.20%       1.987ms     110.396us      32.288us        31.78%      33.600us       1.867us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.544us        24.16%      24.544us       2.045us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      16.512us        16.25%      16.512us       1.376us            12  
+                                            aten::clone         0.72%      20.142us        68.59%       1.914ms     319.034us       0.000us         0.00%       9.056us       1.509us             6  
+                                              aten::sub         1.33%      37.171us         2.19%      61.241us      10.207us       8.320us         8.19%       8.320us       1.387us             6  
+                                              aten::add         1.42%      39.541us         2.20%      61.471us      10.245us       8.192us         8.06%       8.192us       1.365us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.744us         7.62%       7.744us       1.291us             6  
+                                Activity Buffer Request        57.78%       1.613ms        57.78%       1.613ms       1.613ms       1.312us         1.29%       1.312us       1.312us             1  
+                                    aten::empty_strided         1.10%      30.730us         1.10%      30.730us       5.122us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         7.81%     218.005us         7.81%     218.005us      36.334us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.28%      63.724us         2.90%      81.002us       3.375us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.62%      17.278us         0.62%      17.278us       0.720us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         7.68%     214.384us         7.68%     214.384us       4.466us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.19%       5.360us         0.19%       5.360us       5.360us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.558ms
-Self CUDA time total: 101.725us
+Self CPU time total: 2.791ms
+Self CUDA time total: 101.599us
 
 
 
@@ -4136,27 +4136,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S512_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.014ms      1081.02%       1.014ms       1.014ms             1  
-                                            torch_eager        10.25%     325.467us        99.80%       3.170ms       3.170ms       0.000us         0.00%      95.040us      95.040us             1  
-                                              aten::mul         5.52%     175.344us         9.66%     306.970us      12.790us      48.800us        52.05%      48.800us       2.033us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      48.800us        52.05%      48.800us       2.033us            24  
-                                            aten::copy_         3.31%     105.233us        71.23%       2.263ms     125.701us      30.784us        32.83%      32.064us       1.781us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      23.041us        24.57%      23.041us       1.920us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.176us        15.12%      14.176us       1.181us            12  
-                                            aten::clone         0.88%      27.881us        68.64%       2.180ms     363.408us       0.000us         0.00%       9.023us       1.504us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.743us         8.26%       7.743us       1.290us             6  
-                                              aten::sub         1.21%      38.440us         2.12%      67.480us      11.247us       7.136us         7.61%       7.136us       1.189us             6  
-                                              aten::add         1.04%      32.931us         1.90%      60.251us      10.042us       7.040us         7.51%       7.040us       1.173us             6  
-                                Activity Buffer Request        54.81%       1.741ms        54.81%       1.741ms       1.741ms       1.280us         1.37%       1.280us       1.280us             1  
-                                    aten::empty_strided         1.02%      32.521us         1.02%      32.521us       5.420us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        10.79%     342.870us        10.79%     342.870us      57.145us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.15%      68.383us         2.74%      86.883us       3.620us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.58%      18.500us         0.58%      18.500us       0.771us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.23%     261.446us         8.23%     261.446us       5.447us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.20%       6.450us         0.20%       6.450us       6.450us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     915.332us       974.61%     915.332us     915.332us             1  
+                                            torch_eager        10.01%     309.953us        99.82%       3.091ms       3.091ms       0.000us         0.00%      95.198us      95.198us             1  
+                                              aten::mul         4.96%     153.567us         8.55%     264.920us      11.038us      48.833us        52.00%      48.833us       2.035us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      48.833us        52.00%      48.833us       2.035us            24  
+                                            aten::copy_         3.18%      98.342us        72.96%       2.259ms     125.517us      30.876us        32.88%      32.156us       1.786us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      23.070us        24.56%      23.070us       1.923us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.209us        15.13%      14.209us       1.184us            12  
+                                            aten::clone         0.91%      28.102us        71.00%       2.199ms     366.430us       0.000us         0.00%       9.086us       1.514us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.806us         8.31%       7.806us       1.301us             6  
+                                              aten::sub         1.22%      37.870us         1.99%      61.740us      10.290us       7.168us         7.63%       7.168us       1.195us             6  
+                                              aten::add         0.97%      30.001us         1.65%      51.171us       8.529us       7.041us         7.50%       7.041us       1.173us             6  
+                                Activity Buffer Request        56.99%       1.765ms        56.99%       1.765ms       1.765ms       1.280us         1.36%       1.280us       1.280us             1  
+                                    aten::empty_strided         1.03%      31.751us         1.03%      31.751us       5.292us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        10.91%     337.859us        10.91%     337.859us      56.310us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.16%      66.933us         2.72%      84.261us       3.511us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.56%      17.328us         0.56%      17.328us       0.722us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         6.93%     214.663us         6.93%     214.663us       4.472us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.18%       5.580us         0.18%       5.580us       5.580us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 3.177ms
-Self CUDA time total: 93.760us
+Self CPU time total: 3.097ms
+Self CUDA time total: 93.918us
 
 
 
@@ -4166,26 +4166,26 @@ PROFILE TRACE: torch_eager | cuda_B1_S512_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.001ms       983.06%       1.001ms       1.001ms             1  
-                                            torch_eager         9.84%     295.255us        99.81%       2.995ms       2.995ms       0.000us         0.00%     103.136us     103.136us             1  
-                                              aten::mul         5.72%     171.754us         9.96%     298.939us      12.456us      52.896us        51.95%      52.896us       2.204us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      52.896us        51.95%      52.896us       2.204us            24  
-                                            aten::copy_         3.46%     103.695us        71.00%       2.131ms     118.374us      32.417us        31.84%      33.729us       1.874us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.673us        24.23%      24.673us       2.056us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      16.511us        16.22%      16.511us       1.376us            12  
-                                            aten::clone         0.71%      21.410us        67.93%       2.039ms     339.780us       0.000us         0.00%       9.056us       1.509us             6  
-                                              aten::sub         1.35%      40.422us         2.36%      70.682us      11.780us       8.319us         8.17%       8.319us       1.386us             6  
-                                              aten::add         1.19%      35.599us         2.15%      64.481us      10.747us       8.192us         8.05%       8.192us       1.365us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.744us         7.61%       7.744us       1.291us             6  
-                                Activity Buffer Request        57.66%       1.731ms        57.66%       1.731ms       1.731ms       1.312us         1.29%       1.312us       1.312us             1  
-                                    aten::empty_strided         1.04%      31.291us         1.04%      31.291us       5.215us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         7.41%     222.325us         7.41%     222.325us      37.054us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.17%      65.101us         2.75%      82.681us       3.445us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.59%      17.580us         0.59%      17.580us       0.733us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.68%     260.519us         8.68%     260.519us       5.427us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.19%       5.671us         0.19%       5.671us       5.671us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     991.450us       973.69%     991.450us     991.450us             1  
+                                            torch_eager        12.00%     374.920us        99.82%       3.119ms       3.119ms       0.000us         0.00%     103.167us     103.167us             1  
+                                              aten::mul         4.76%     148.675us         8.21%     256.446us      10.685us      52.898us        51.95%      52.898us       2.204us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      52.898us        51.95%      52.898us       2.204us            24  
+                                            aten::copy_         3.10%      96.880us        71.33%       2.229ms     123.830us      32.414us        31.83%      33.757us       1.875us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.575us        24.13%      24.575us       2.048us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      16.512us        16.22%      16.512us       1.376us            12  
+                                            aten::clone         0.70%      21.890us        69.11%       2.160ms     359.941us       0.000us         0.00%       9.182us       1.530us             6  
+                                              aten::sub         1.16%      36.128us         1.91%      59.601us       9.934us       8.320us         8.17%       8.320us       1.387us             6  
+                                              aten::add         1.18%      36.821us         1.99%      62.122us      10.354us       8.192us         8.05%       8.192us       1.365us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.839us         7.70%       7.839us       1.307us             6  
+                                Activity Buffer Request        56.19%       1.756ms        56.19%       1.756ms       1.756ms       1.343us         1.32%       1.343us       1.343us             1  
+                                    aten::empty_strided         0.98%      30.741us         0.98%      30.741us       5.124us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        10.19%     318.287us        10.19%     318.287us      53.048us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.12%      66.300us         2.71%      84.551us       3.523us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.58%      18.251us         0.58%      18.251us       0.760us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         6.87%     214.557us         6.87%     214.557us       4.470us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.18%       5.580us         0.18%       5.580us       5.580us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 3.001ms
+Self CPU time total: 3.125ms
 Self CUDA time total: 101.824us
 
 
@@ -4196,27 +4196,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S512_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.015ms       840.95%       1.015ms       1.015ms             1  
-                                            torch_eager         9.74%     291.664us        99.80%       2.987ms       2.987ms       0.000us         0.00%     122.530us     122.530us             1  
-                                              aten::mul         5.84%     174.903us        10.28%     307.588us      12.816us      62.144us        51.48%      62.144us       2.589us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      62.144us        51.48%      62.144us       2.589us            24  
-                                            aten::copy_         3.43%     102.713us        70.67%       2.115ms     117.501us      39.265us        32.53%      41.090us       2.283us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.864us        23.91%      28.864us       2.405us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.296us        15.99%      19.296us       1.608us            12  
-                                            aten::clone         0.71%      21.240us        67.46%       2.019ms     336.492us       0.000us         0.00%      12.226us       2.038us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.401us         8.62%      10.401us       1.733us             6  
-                                              aten::add         1.16%      34.581us         2.15%      64.442us      10.740us       9.696us         8.03%       9.696us       1.616us             6  
-                                              aten::sub         1.36%      40.831us         2.36%      70.613us      11.769us       9.600us         7.95%       9.600us       1.600us             6  
-                                Activity Buffer Request        57.99%       1.736ms        57.99%       1.736ms       1.736ms       1.825us         1.51%       1.825us       1.825us             1  
-                                    aten::empty_strided         1.04%      31.082us         1.04%      31.082us       5.180us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         6.60%     197.545us         6.60%     197.545us      32.924us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.24%      66.920us         2.85%      85.310us       3.555us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.61%      18.390us         0.61%      18.390us       0.766us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         9.07%     271.392us         9.07%     271.392us       5.654us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.20%       6.001us         0.20%       6.001us       6.001us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     898.909us       743.56%     898.909us     898.909us             1  
+                                            torch_eager         9.91%     292.126us        99.82%       2.942ms       2.942ms       0.000us         0.00%     122.717us     122.717us             1  
+                                              aten::mul         5.16%     152.179us         8.78%     258.933us      10.789us      62.079us        51.35%      62.079us       2.587us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      62.079us        51.35%      62.079us       2.587us            24  
+                                            aten::copy_         3.24%      95.439us        72.74%       2.144ms     119.116us      39.487us        32.66%      41.311us       2.295us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.927us        23.93%      28.927us       2.411us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.327us        15.99%      19.327us       1.611us            12  
+                                            aten::clone         0.77%      22.660us        70.42%       2.076ms     345.928us       0.000us         0.00%      12.384us       2.064us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.560us         8.73%      10.560us       1.760us             6  
+                                              aten::sub         1.22%      36.000us         2.00%      58.931us       9.822us       9.664us         7.99%       9.664us       1.611us             6  
+                                              aten::add         1.08%      31.800us         1.80%      53.111us       8.852us       9.663us         7.99%       9.663us       1.611us             6  
+                                Activity Buffer Request        57.43%       1.693ms        57.43%       1.693ms       1.693ms       1.824us         1.51%       1.824us       1.824us             1  
+                                    aten::empty_strided         1.01%      29.882us         1.01%      29.882us       4.980us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        10.11%     297.916us        10.11%     297.916us      49.653us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.20%      64.806us         2.80%      82.537us       3.439us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.60%      17.731us         0.60%      17.731us       0.739us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         7.09%     208.898us         7.09%     208.898us       4.352us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.18%       5.241us         0.18%       5.241us       5.241us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.993ms
-Self CUDA time total: 120.705us
+Self CPU time total: 2.948ms
+Self CUDA time total: 120.893us
 
 
 
@@ -4226,27 +4226,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S512_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.002ms       581.09%       1.002ms       1.002ms             1  
-                                            torch_eager        20.60%     295.691us        99.58%       1.430ms       1.430ms       0.000us         0.00%     175.327us     175.327us             1  
-                                              aten::mul        11.87%     170.476us        20.75%     297.916us      12.413us      89.503us        51.89%      89.503us       3.729us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      89.503us        51.89%      89.503us       3.729us            24  
-                                            aten::copy_         7.24%     103.879us        39.87%     572.479us      31.804us      57.887us        33.56%      60.735us       3.374us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.832us        23.67%      40.832us       3.403us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      25.089us        14.55%      25.089us       2.091us            12  
-                                            aten::clone         1.51%      21.660us        33.62%     482.661us      80.443us       0.000us         0.00%      19.903us       3.317us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.055us         9.89%      17.055us       2.842us             6  
-                                              aten::sub         2.80%      40.184us         4.70%      67.454us      11.242us      12.546us         7.27%      12.546us       2.091us             6  
-                                              aten::add         2.41%      34.531us         4.25%      60.992us      10.165us      12.543us         7.27%      12.543us       2.090us             6  
-                                Activity Buffer Request        14.04%     201.545us        14.04%     201.545us     201.545us       2.848us         1.65%       2.848us       2.848us             1  
-                                    aten::empty_strided         2.07%      29.712us         2.07%      29.712us       4.952us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        13.60%     195.253us        13.60%     195.253us      32.542us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.57%      65.572us         5.83%      83.731us       3.489us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.26%      18.159us         1.26%      18.159us       0.757us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        17.62%     252.973us        17.62%     252.973us       5.270us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.42%       6.061us         0.42%       6.061us       6.061us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     904.612us       526.12%     904.612us     904.612us             1  
+                                            torch_eager        19.42%     290.431us        99.64%       1.490ms       1.490ms       0.000us         0.00%     174.789us     174.789us             1  
+                                              aten::mul        10.19%     152.404us        17.63%     263.706us      10.988us      89.218us        51.89%      89.218us       3.717us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      89.218us        51.89%      89.218us       3.717us            24  
+                                            aten::copy_         6.40%      95.669us        46.12%     689.622us      38.312us      57.665us        33.54%      60.513us       3.362us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.577us        23.60%      40.577us       3.381us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      25.058us        14.57%      25.058us       2.088us            12  
+                                            aten::clone         1.32%      19.751us        41.22%     616.413us     102.735us       0.000us         0.00%      19.936us       3.323us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.088us         9.94%      17.088us       2.848us             6  
+                                              aten::add         2.23%      33.380us         3.73%      55.850us       9.308us      12.577us         7.31%      12.577us       2.096us             6  
+                                              aten::sub         2.48%      37.041us         4.08%      60.981us      10.164us      12.481us         7.26%      12.481us       2.080us             6  
+                                Activity Buffer Request        17.03%     254.616us        17.03%     254.616us     254.616us       2.848us         1.66%       2.848us       2.848us             1  
+                                    aten::empty_strided         1.98%      29.551us         1.98%      29.551us       4.925us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        18.81%     281.226us        18.81%     281.226us      46.871us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.20%      62.788us         5.36%      80.180us       3.341us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.16%      17.392us         1.16%      17.392us       0.725us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        14.43%     215.823us        14.43%     215.823us       4.496us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.36%       5.360us         0.36%       5.360us       5.360us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.436ms
-Self CUDA time total: 172.479us
+Self CPU time total: 1.495ms
+Self CUDA time total: 171.941us
 
 
 
@@ -4256,27 +4256,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S2048_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     998.580us       827.30%     998.580us     998.580us             1  
-                                            torch_eager        20.47%     288.290us        99.58%       1.403ms       1.403ms       0.000us         0.00%     122.464us     122.464us             1  
-                                              aten::mul        12.45%     175.343us        21.53%     303.228us      12.634us      62.175us        51.51%      62.175us       2.591us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      62.175us        51.51%      62.175us       2.591us            24  
-                                            aten::copy_         7.38%     103.973us        38.88%     547.625us      30.424us      39.265us        32.53%      41.025us       2.279us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.800us        23.86%      28.800us       2.400us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.264us        15.96%      19.264us       1.605us            12  
-                                            aten::clone         1.40%      19.710us        32.30%     454.921us      75.820us       0.000us         0.00%      12.225us       2.038us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.465us         8.67%      10.465us       1.744us             6  
-                                              aten::add         2.42%      34.049us         4.36%      61.370us      10.228us       9.695us         8.03%       9.695us       1.616us             6  
-                                              aten::sub         2.76%      38.918us         4.74%      66.750us      11.125us       9.569us         7.93%       9.569us       1.595us             6  
-                                Activity Buffer Request        13.00%     183.145us        13.00%     183.145us     183.145us       1.760us         1.46%       1.760us       1.760us             1  
-                                    aten::empty_strided         2.13%      30.010us         2.13%      30.010us       5.002us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        13.33%     187.794us        13.33%     187.794us      31.299us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.79%      67.456us         6.09%      85.722us       3.572us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.30%      18.266us         1.30%      18.266us       0.761us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        18.16%     255.751us        18.16%     255.751us       5.328us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.42%       5.851us         0.42%       5.851us       5.851us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     891.715us       739.56%     891.715us     891.715us             1  
+                                            torch_eager        18.90%     276.136us        99.61%       1.455ms       1.455ms       0.000us         0.00%     122.334us     122.334us             1  
+                                              aten::mul        10.73%     156.714us        18.01%     263.146us      10.964us      61.921us        51.36%      61.921us       2.580us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      61.921us        51.36%      61.921us       2.580us            24  
+                                            aten::copy_         6.60%      96.484us        46.13%     674.035us      37.446us      39.325us        32.61%      41.085us       2.283us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.894us        23.96%      28.894us       2.408us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.328us        16.03%      19.328us       1.611us            12  
+                                            aten::clone         1.42%      20.789us        41.25%     602.732us     100.455us       0.000us         0.00%      12.191us       2.032us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.431us         8.65%      10.431us       1.738us             6  
+                                              aten::add         2.19%      32.032us         3.63%      53.102us       8.850us       9.696us         8.04%       9.696us       1.616us             6  
+                                              aten::sub         2.44%      35.611us         4.02%      58.732us       9.789us       9.632us         7.99%       9.632us       1.605us             6  
+                                Activity Buffer Request        17.44%     254.736us        17.44%     254.736us     254.736us       1.760us         1.46%       1.760us       1.760us             1  
+                                    aten::empty_strided         2.08%      30.391us         2.08%      30.391us       5.065us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        18.06%     263.885us        18.06%     263.885us      43.981us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.23%      61.792us         5.41%      79.021us       3.293us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.18%      17.229us         1.18%      17.229us       0.718us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        14.34%     209.553us        14.34%     209.553us       4.366us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.39%       5.690us         0.39%       5.690us       5.690us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.409ms
-Self CUDA time total: 120.704us
+Self CPU time total: 1.461ms
+Self CUDA time total: 120.574us
 
 
 
@@ -4286,27 +4286,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S2048_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     990.551us       575.17%     990.551us     990.551us             1  
-                                            torch_eager         9.55%     281.107us        99.81%       2.938ms       2.938ms       0.000us         0.00%     175.100us     175.100us             1  
-                                              aten::mul         5.81%     170.942us        10.23%     301.167us      12.549us      89.468us        51.95%      89.468us       3.728us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      89.468us        51.95%      89.468us       3.728us            24  
-                                            aten::copy_         3.50%     102.971us        70.92%       2.088ms     115.984us      57.728us        33.52%      60.608us       3.367us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.800us        23.69%      40.800us       3.400us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      25.024us        14.53%      25.024us       2.085us            12  
-                                            aten::clone         0.79%      23.370us        68.06%       2.004ms     333.942us       0.000us         0.00%      19.808us       3.301us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.928us         9.83%      16.928us       2.821us             6  
-                                              aten::add         1.19%      35.020us         2.08%      61.181us      10.197us      12.513us         7.27%      12.513us       2.085us             6  
-                                              aten::sub         1.27%      37.451us         2.29%      67.451us      11.242us      12.511us         7.26%      12.511us       2.085us             6  
-                                Activity Buffer Request        58.47%       1.721ms        58.47%       1.721ms       1.721ms       2.880us         1.67%       2.880us       2.880us             1  
-                                    aten::empty_strided         1.04%      30.722us         1.04%      30.722us       5.120us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         6.59%     193.904us         6.59%     193.904us      32.317us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.29%      67.323us         2.91%      85.623us       3.568us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.62%      18.300us         0.62%      18.300us       0.762us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.69%     255.869us         8.69%     255.869us       5.331us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.19%       5.631us         0.19%       5.631us       5.631us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     898.269us       521.86%     898.269us     898.269us             1  
+                                            torch_eager         9.37%     284.685us        99.81%       3.032ms       3.032ms       0.000us         0.00%     174.974us     174.974us             1  
+                                              aten::mul         4.95%     150.441us         8.39%     254.798us      10.617us      89.503us        52.00%      89.503us       3.729us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      89.503us        52.00%      89.503us       3.729us            24  
+                                            aten::copy_         3.17%      96.407us        73.91%       2.246ms     124.750us      57.409us        33.35%      60.256us       3.348us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.513us        23.54%      40.513us       3.376us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      25.215us        14.65%      25.215us       2.101us            12  
+                                            aten::clone         0.72%      21.929us        71.62%       2.176ms     362.673us       0.000us         0.00%      19.743us       3.291us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.896us         9.82%      16.896us       2.816us             6  
+                                              aten::add         1.01%      30.763us         1.71%      51.873us       8.645us      12.671us         7.36%      12.671us       2.112us             6  
+                                              aten::sub         1.24%      37.781us         2.01%      61.191us      10.198us      12.544us         7.29%      12.544us       2.091us             6  
+                                Activity Buffer Request        60.27%       1.831ms        60.27%       1.831ms       1.831ms       2.847us         1.65%       2.847us       2.847us             1  
+                                    aten::empty_strided         0.99%      30.010us         0.99%      30.010us       5.002us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         8.59%     260.978us         8.59%     260.978us      43.496us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.13%      64.731us         2.72%      82.502us       3.438us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.58%      17.771us         0.58%      17.771us       0.740us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         6.78%     205.930us         6.78%     205.930us       4.290us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.19%       5.660us         0.19%       5.660us       5.660us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.944ms
-Self CUDA time total: 172.220us
+Self CPU time total: 3.038ms
+Self CUDA time total: 172.127us
 
 
 
@@ -4316,27 +4316,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S2048_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     996.043us       348.79%     996.043us     996.043us             1  
-                                            torch_eager        20.87%     290.654us        99.57%       1.387ms       1.387ms       0.000us         0.00%     304.030us     304.030us             1  
-                                              aten::mul        12.33%     171.709us        21.56%     300.326us      12.514us     133.152us        46.63%     133.152us       5.548us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     133.152us        46.63%     133.152us       5.548us            24  
-                                            aten::copy_         7.20%     100.319us        38.33%     533.940us      29.663us     111.295us        38.97%     129.758us       7.209us            18  
-                                            aten::clone         1.43%      19.920us        31.74%     442.109us      73.685us       0.000us         0.00%      72.510us      12.085us             6  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.248us        20.05%      57.248us       4.771us            12  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      54.047us        18.93%      54.047us       9.008us             6  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      41.120us        14.40%      41.120us       3.427us            12  
-                                              aten::sub         2.81%      39.111us         4.76%      66.281us      11.047us      20.672us         7.24%      20.672us       3.445us             6  
-                                              aten::add         2.49%      34.670us         4.42%      61.560us      10.260us      20.448us         7.16%      20.448us       3.408us             6  
-                                Activity Buffer Request        12.17%     169.484us        12.17%     169.484us     169.484us      18.463us         6.47%      18.463us      18.463us             1  
-                                    aten::empty_strided         2.13%      29.640us         2.13%      29.640us       4.940us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        13.75%     191.524us        13.75%     191.524us      31.921us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.84%      67.441us         6.08%      84.703us       3.529us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.24%      17.262us         1.24%      17.262us       0.719us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        18.33%     255.290us        18.33%     255.290us       5.319us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.43%       5.940us         0.43%       5.940us       5.940us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     909.882us       318.77%     909.882us     909.882us             1  
+                                            torch_eager        19.71%     283.050us        99.60%       1.431ms       1.431ms       0.000us         0.00%     303.835us     303.835us             1  
+                                              aten::mul        10.48%     150.585us        17.83%     256.036us      10.668us     133.276us        46.69%     133.276us       5.553us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     133.276us        46.69%     133.276us       5.553us            24  
+                                            aten::copy_         6.71%      96.363us        44.37%     637.354us      35.409us     111.072us        38.91%     129.472us       7.193us            18  
+                                            aten::clone         1.41%      20.282us        39.28%     564.182us      94.030us       0.000us         0.00%      72.352us      12.059us             6  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.120us        20.01%      57.120us       4.760us            12  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      53.952us        18.90%      53.952us       8.992us             6  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      41.087us        14.39%      41.087us       3.424us            12  
+                                              aten::sub         3.01%      43.193us         4.71%      67.582us      11.264us      20.671us         7.24%      20.671us       3.445us             6  
+                                              aten::add         2.32%      33.393us         3.83%      55.073us       9.179us      20.416us         7.15%      20.416us       3.403us             6  
+                                Activity Buffer Request        16.10%     231.245us        16.10%     231.245us     231.245us      18.400us         6.45%      18.400us      18.400us             1  
+                                    aten::empty_strided         2.13%      30.620us         2.13%      30.620us       5.103us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        17.40%     249.916us        17.40%     249.916us      41.653us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.43%      63.582us         5.61%      80.624us       3.359us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.19%      17.042us         1.19%      17.042us       0.710us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        14.71%     211.350us        14.71%     211.350us       4.403us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.40%       5.701us         0.40%       5.701us       5.701us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.393ms
-Self CUDA time total: 285.567us
+Self CPU time total: 1.436ms
+Self CUDA time total: 285.435us
 
 
 
@@ -4346,27 +4346,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S2048_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.018ms       178.84%       1.018ms       1.018ms             1  
-                                            torch_eager        20.93%     294.497us        99.60%       1.402ms       1.402ms       0.000us         0.00%     592.727us     592.727us             1  
-                                            aten::copy_         7.21%     101.432us        37.46%     527.132us      29.285us     273.662us        48.09%     297.374us      16.521us            18  
-                                              aten::mul        12.77%     179.663us        21.96%     309.058us      12.877us     229.816us        40.39%     229.816us       9.576us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     229.816us        40.39%     229.816us       9.576us            24  
-                                            aten::clone         1.44%      20.252us        30.89%     434.772us      72.462us       0.000us         0.00%     206.559us      34.426us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     182.847us        32.13%     182.847us      30.474us             6  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      90.815us        15.96%      90.815us       7.568us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      65.537us        11.52%      65.537us       5.461us            12  
-                                              aten::sub         2.76%      38.901us         5.06%      71.272us      11.879us      32.865us         5.78%      32.865us       5.477us             6  
-                                              aten::add         2.62%      36.830us         4.51%      63.532us      10.589us      32.672us         5.74%      32.672us       5.445us             6  
-                                Activity Buffer Request        11.90%     167.504us        11.90%     167.504us     167.504us      23.712us         4.17%      23.712us      23.712us             1  
-                                    aten::empty_strided         2.15%      30.291us         2.15%      30.291us       5.049us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        13.10%     184.385us        13.10%     184.385us      30.731us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.77%      67.152us         6.08%      85.571us       3.565us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.31%      18.419us         1.31%      18.419us       0.767us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        18.64%     262.279us        18.64%     262.279us       5.464us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.40%       5.670us         0.40%       5.670us       5.670us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     912.081us       160.07%     912.081us     912.081us             1  
+                                            torch_eager        19.62%     281.157us        99.59%       1.427ms       1.427ms       0.000us         0.00%     593.529us     593.529us             1  
+                                            aten::copy_         6.77%      97.080us        44.43%     636.741us      35.375us     273.659us        48.03%     297.370us      16.521us            18  
+                                              aten::mul        10.89%     156.021us        18.62%     266.885us      11.120us     230.237us        40.41%     230.237us       9.593us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     230.237us        40.41%     230.237us       9.593us            24  
+                                            aten::clone         1.32%      18.940us        39.02%     559.152us      93.192us       0.000us         0.00%     206.843us      34.474us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     183.132us        32.14%     183.132us      30.522us             6  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      90.527us        15.89%      90.527us       7.544us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      65.922us        11.57%      65.922us       5.494us            12  
+                                              aten::sub         2.63%      37.723us         4.26%      61.043us      10.174us      33.089us         5.81%      33.089us       5.515us             6  
+                                              aten::add         2.15%      30.871us         3.64%      52.150us       8.692us      32.833us         5.76%      32.833us       5.472us             6  
+                                Activity Buffer Request        16.39%     234.945us        16.39%     234.945us     234.945us      23.711us         4.16%      23.711us      23.711us             1  
+                                    aten::empty_strided         2.06%      29.522us         2.06%      29.522us       4.920us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        17.05%     244.375us        17.05%     244.375us      40.729us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.41%      63.274us         5.65%      80.913us       3.371us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.23%      17.639us         1.23%      17.639us       0.735us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        15.06%     215.804us        15.06%     215.804us       4.496us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.41%       5.810us         0.41%       5.810us       5.810us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.407ms
-Self CUDA time total: 569.015us
+Self CPU time total: 1.433ms
+Self CUDA time total: 569.818us
 
 
 
@@ -4376,27 +4376,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S128_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.000ms      1080.76%       1.000ms       1.000ms             1  
-                                            torch_eager         9.93%     294.202us        99.82%       2.956ms       2.956ms       0.000us         0.00%      93.659us      93.659us             1  
-                                              aten::mul         5.79%     171.584us        10.10%     299.275us      12.470us      49.565us        53.56%      49.565us       2.065us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      49.565us        53.56%      49.565us       2.065us            24  
-                                            aten::copy_         3.59%     106.321us        70.88%       2.099ms     116.627us      29.406us        31.78%      30.526us       1.696us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.623us        24.45%      22.623us       1.885us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.568us        14.66%      13.568us       1.131us            12  
-                                            aten::clone         0.71%      20.970us        67.84%       2.009ms     334.883us       0.000us         0.00%       7.903us       1.317us             6  
-                                              aten::sub         1.35%      39.881us         2.33%      69.031us      11.505us       6.784us         7.33%       6.784us       1.131us             6  
-                                              aten::add         1.15%      34.189us         1.99%      58.861us       9.810us       6.784us         7.33%       6.784us       1.131us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.783us         7.33%       6.783us       1.130us             6  
-                                Activity Buffer Request        58.33%       1.728ms        58.33%       1.728ms       1.728ms       1.120us         1.21%       1.120us       1.120us             1  
-                                    aten::empty_strided         1.04%      30.880us         1.04%      30.880us       5.147us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         6.53%     193.506us         6.53%     193.506us      32.251us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.25%      66.634us         2.83%      83.844us       3.493us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.58%      17.210us         0.58%      17.210us       0.717us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.56%     253.394us         8.56%     253.394us       5.279us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.18%       5.350us         0.18%       5.350us       5.350us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     982.303us      1061.08%     982.303us     982.303us             1  
+                                            torch_eager         9.91%     295.071us        99.80%       2.971ms       2.971ms       0.000us         0.00%      93.696us      93.696us             1  
+                                              aten::mul         5.34%     158.961us         9.11%     271.063us      11.294us      49.631us        53.61%      49.631us       2.068us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      49.631us        53.61%      49.631us       2.068us            24  
+                                            aten::copy_         3.28%      97.679us        70.68%       2.104ms     116.897us      29.410us        31.77%      30.530us       1.696us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.593us        24.40%      22.593us       1.883us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.535us        14.62%      13.535us       1.128us            12  
+                                            aten::clone         0.75%      22.189us        68.29%       2.033ms     338.797us       0.000us         0.00%       7.937us       1.323us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.817us         7.36%       6.817us       1.136us             6  
+                                              aten::sub         1.25%      37.104us         2.05%      60.974us      10.162us       6.784us         7.33%       6.784us       1.131us             6  
+                                              aten::add         1.06%      31.689us         1.78%      52.921us       8.820us       6.751us         7.29%       6.751us       1.125us             6  
+                                Activity Buffer Request        57.10%       1.700ms        57.10%       1.700ms       1.700ms       1.120us         1.21%       1.120us       1.120us             1  
+                                    aten::empty_strided         1.03%      30.660us         1.03%      30.660us       5.110us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         8.33%     247.996us         8.33%     247.996us      41.333us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.24%      66.630us         4.50%     133.983us       5.583us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         2.26%      67.353us         2.26%      67.353us       2.806us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         7.25%     215.936us         7.25%     215.936us       4.499us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.20%       5.820us         0.20%       5.820us       5.820us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.962ms
-Self CUDA time total: 92.539us
+Self CPU time total: 2.977ms
+Self CUDA time total: 92.576us
 
 
 
@@ -4406,27 +4406,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S128_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.005ms      1046.18%       1.005ms       1.005ms             1  
-                                            torch_eager        20.70%     290.067us        99.57%       1.395ms       1.395ms       0.000us         0.00%      97.374us      97.374us             1  
-                                              aten::mul        12.93%     181.162us        22.03%     308.587us      12.858us      51.138us        53.23%      51.138us       2.131us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      51.138us        53.23%      51.138us       2.131us            24  
-                                            aten::copy_         7.20%     100.934us        37.57%     526.405us      29.245us      30.750us        32.01%      32.061us       1.781us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.975us        23.92%      22.975us       1.915us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.175us        14.76%      14.175us       1.181us            12  
-                                            aten::clone         1.53%      21.448us        31.30%     438.559us      73.093us       0.000us         0.00%       9.086us       1.514us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.775us         8.09%       7.775us       1.296us             6  
-                                              aten::sub         2.81%      39.410us         4.84%      67.761us      11.294us       7.103us         7.39%       7.103us       1.184us             6  
-                                              aten::add         2.52%      35.299us         4.51%      63.202us      10.534us       7.072us         7.36%       7.072us       1.179us             6  
-                                Activity Buffer Request        11.90%     166.735us        11.90%     166.735us     166.735us       1.311us         1.36%       1.311us       1.311us             1  
-                                    aten::empty_strided         2.22%      31.071us         2.22%      31.071us       5.178us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        13.33%     186.813us        13.33%     186.813us      31.136us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.90%      68.622us         6.17%      86.384us       3.599us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.27%      17.762us         1.27%      17.762us       0.740us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        18.24%     255.602us        18.24%     255.602us       5.325us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.43%       6.050us         0.43%       6.050us       6.050us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     884.987us       920.94%     884.987us     884.987us             1  
+                                            torch_eager        19.85%     274.487us        99.61%       1.377ms       1.377ms       0.000us         0.00%      97.408us      97.408us             1  
+                                              aten::mul        10.71%     148.094us        18.37%     254.035us      10.585us      51.231us        53.31%      51.231us       2.135us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      51.231us        53.31%      51.231us       2.135us            24  
+                                            aten::copy_         6.85%      94.742us        43.76%     605.074us      33.615us      30.720us        31.97%      32.032us       1.780us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.881us        23.81%      22.881us       1.907us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.145us        14.72%      14.145us       1.179us            12  
+                                            aten::clone         1.40%      19.328us        38.56%     533.201us      88.867us       0.000us         0.00%       9.151us       1.525us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.839us         8.16%       7.839us       1.306us             6  
+                                              aten::sub         2.67%      36.902us         4.36%      60.321us      10.054us       7.073us         7.36%       7.073us       1.179us             6  
+                                              aten::add         2.24%      30.942us         3.77%      52.122us       8.687us       7.072us         7.36%       7.072us       1.179us             6  
+                                Activity Buffer Request        15.48%     213.995us        15.48%     213.995us     213.995us       1.312us         1.37%       1.312us       1.312us             1  
+                                    aten::empty_strided         2.17%      30.062us         2.17%      30.062us       5.010us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        17.28%     238.866us        17.28%     238.866us      39.811us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.64%      64.092us         5.92%      81.901us       3.413us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.29%      17.809us         1.29%      17.809us       0.742us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        15.04%     208.011us        15.04%     208.011us       4.334us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.39%       5.351us         0.39%       5.351us       5.351us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.401ms
-Self CUDA time total: 96.063us
+Self CPU time total: 1.383ms
+Self CUDA time total: 96.096us
 
 
 
@@ -4436,27 +4436,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S128_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.008ms       966.66%       1.008ms       1.008ms             1  
-                                            torch_eager        19.87%     287.726us        99.59%       1.442ms       1.442ms       0.000us         0.00%     105.564us     105.564us             1  
-                                              aten::mul        12.48%     180.745us        21.62%     312.998us      13.042us      55.327us        53.07%      55.327us       2.305us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      55.327us        53.07%      55.327us       2.305us            24  
-                                            aten::copy_         7.05%     102.025us        39.57%     572.964us      31.831us      32.416us        31.09%      33.727us       1.874us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.737us        23.73%      24.737us       2.061us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      16.510us        15.84%      16.510us       1.376us            12  
-                                            aten::clone         1.40%      20.271us        33.26%     481.521us      80.254us       0.000us         0.00%       8.990us       1.498us             6  
-                                              aten::sub         2.64%      38.220us         4.70%      68.062us      11.344us       8.351us         8.01%       8.351us       1.392us             6  
-                                              aten::add         2.49%      36.083us         4.34%      62.773us      10.462us       8.159us         7.83%       8.159us       1.360us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.679us         7.37%       7.679us       1.280us             6  
-                                Activity Buffer Request        13.36%     193.394us        13.36%     193.394us     193.394us       1.311us         1.26%       1.311us       1.311us             1  
-                                    aten::empty_strided         2.11%      30.520us         2.11%      30.520us       5.087us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        14.10%     204.105us        14.10%     204.105us      34.017us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.72%      68.322us         5.99%      86.681us       3.612us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.27%      18.359us         1.27%      18.359us       0.765us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        18.11%     262.225us        18.11%     262.225us       5.463us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.41%       5.880us         0.41%       5.880us       5.880us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     884.827us       849.48%     884.827us     884.827us             1  
+                                            torch_eager        19.57%     271.768us        99.62%       1.383ms       1.383ms       0.000us         0.00%     105.505us     105.505us             1  
+                                              aten::mul        10.84%     150.541us        18.55%     257.594us      10.733us      55.295us        53.09%      55.295us       2.304us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      55.295us        53.09%      55.295us       2.304us            24  
+                                            aten::copy_         7.04%      97.750us        44.26%     614.622us      34.146us      32.417us        31.12%      33.761us       1.876us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.609us        23.63%      24.609us       2.051us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      16.449us        15.79%      16.449us       1.371us            12  
+                                            aten::clone         1.38%      19.214us        38.80%     538.794us      89.799us       0.000us         0.00%       9.152us       1.525us             6  
+                                              aten::sub         2.56%      35.499us         4.18%      58.050us       9.675us       8.289us         7.96%       8.289us       1.381us             6  
+                                              aten::add         2.19%      30.379us         3.71%      51.460us       8.577us       8.160us         7.83%       8.160us       1.360us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.808us         7.50%       7.808us       1.301us             6  
+                                Activity Buffer Request        16.03%     222.545us        16.03%     222.545us     222.545us       1.344us         1.29%       1.344us       1.344us             1  
+                                    aten::empty_strided         2.12%      29.421us         2.12%      29.421us       4.904us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        16.95%     235.404us        16.95%     235.404us      39.234us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.56%      63.330us         5.86%      81.351us       3.390us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.30%      18.021us         1.30%      18.021us       0.751us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        15.09%     209.608us        15.09%     209.608us       4.367us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.38%       5.250us         0.38%       5.250us       5.250us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.448ms
-Self CUDA time total: 104.253us
+Self CPU time total: 1.389ms
+Self CUDA time total: 104.161us
 
 
 
@@ -4466,27 +4466,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S128_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.006ms       813.90%       1.006ms       1.006ms             1  
-                                            torch_eager         9.94%     295.168us        99.80%       2.964ms       2.964ms       0.000us         0.00%     125.309us     125.309us             1  
-                                              aten::mul         5.76%     171.113us        10.09%     299.797us      12.492us      65.183us        52.76%      65.183us       2.716us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      65.183us        52.76%      65.183us       2.716us            24  
-                                            aten::copy_         3.36%      99.933us        70.61%       2.097ms     116.499us      39.102us        31.65%      40.862us       2.270us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.702us        23.23%      28.702us       2.392us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.264us        15.59%      19.264us       1.605us            12  
-                                            aten::clone         0.72%      21.400us        67.60%       2.008ms     334.638us       0.000us         0.00%      12.160us       2.027us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.400us         8.42%      10.400us       1.733us             6  
-                                              aten::sub         1.36%      40.342us         2.40%      71.402us      11.900us       9.664us         7.82%       9.664us       1.611us             6  
-                                              aten::add         1.19%      35.292us         2.11%      62.642us      10.440us       9.600us         7.77%       9.600us       1.600us             6  
-                                Activity Buffer Request        58.22%       1.729ms        58.22%       1.729ms       1.729ms       1.760us         1.42%       1.760us       1.760us             1  
-                                    aten::empty_strided         1.03%      30.470us         1.03%      30.470us       5.078us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         6.55%     194.565us         6.55%     194.565us      32.427us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.28%      67.682us         2.90%      86.242us       3.593us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.62%      18.560us         0.62%      18.560us       0.773us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.77%     260.395us         8.77%     260.395us       5.425us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.20%       5.880us         0.20%       5.880us       5.880us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     919.709us       742.30%     919.709us     919.709us             1  
+                                            torch_eager         9.60%     287.984us        99.81%       2.995ms       2.995ms       0.000us         0.00%     125.756us     125.756us             1  
+                                              aten::mul         5.08%     152.565us         8.72%     261.687us      10.904us      65.119us        52.56%      65.119us       2.713us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      65.119us        52.56%      65.119us       2.713us            24  
+                                            aten::copy_         3.44%     103.340us        73.23%       2.197ms     122.068us      39.390us        31.79%      41.246us       2.291us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.863us        23.30%      28.863us       2.405us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.391us        15.65%      19.391us       1.616us            12  
+                                            aten::clone         0.71%      21.441us        70.60%       2.118ms     353.061us       0.000us         0.00%      12.383us       2.064us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.527us         8.50%      10.527us       1.754us             6  
+                                              aten::add         1.05%      31.401us         1.75%      52.512us       8.752us       9.728us         7.85%       9.728us       1.621us             6  
+                                              aten::sub         1.23%      37.000us         2.00%      60.082us      10.014us       9.663us         7.80%       9.663us       1.611us             6  
+                                Activity Buffer Request        59.85%       1.796ms        59.85%       1.796ms       1.796ms       1.856us         1.50%       1.856us       1.856us             1  
+                                    aten::empty_strided         1.02%      30.521us         1.02%      30.521us       5.087us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         7.96%     238.696us         7.96%     238.696us      39.783us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.18%      65.422us         2.78%      83.321us       3.472us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.60%      17.899us         0.60%      17.899us       0.746us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         7.09%     212.797us         7.09%     212.797us       4.433us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.19%       5.701us         0.19%       5.701us       5.701us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.970ms
-Self CUDA time total: 123.549us
+Self CPU time total: 3.000ms
+Self CUDA time total: 123.900us
 
 
 
@@ -4496,27 +4496,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S512_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     983.089us       943.25%     983.089us     983.089us             1  
-                                            torch_eager        19.98%     288.033us        99.62%       1.436ms       1.436ms       0.000us         0.00%     105.536us     105.536us             1  
-                                              aten::mul        11.74%     169.216us        20.40%     294.119us      12.255us      55.424us        53.18%      55.424us       2.309us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      55.424us        53.18%      55.424us       2.309us            24  
-                                            aten::copy_         7.16%     103.226us        41.05%     591.856us      32.881us      32.352us        31.04%      33.664us       1.870us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.640us        23.64%      24.640us       2.053us            12  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     877.937us       842.38%     877.937us     877.937us             1  
+                                            torch_eager        19.95%     270.846us        99.61%       1.353ms       1.353ms       0.000us         0.00%     105.532us     105.532us             1  
+                                              aten::mul        11.07%     150.304us        18.76%     254.767us      10.615us      55.358us        53.12%      55.358us       2.307us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      55.358us        53.12%      55.358us       2.307us            24  
+                                            aten::copy_         7.05%      95.673us        43.37%     588.992us      32.722us      32.415us        31.10%      33.726us       1.874us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.672us        23.67%      24.672us       2.056us            12  
 void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      16.448us        15.78%      16.448us       1.371us            12  
-                                            aten::clone         1.52%      21.981us        35.14%     506.672us      84.445us       0.000us         0.00%       9.024us       1.504us             6  
-                                              aten::sub         2.73%      39.361us         4.47%      64.512us      10.752us       8.289us         7.95%       8.289us       1.381us             6  
-                                              aten::add         2.40%      34.591us         4.34%      62.531us      10.422us       8.159us         7.83%       8.159us       1.360us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.712us         7.40%       7.712us       1.285us             6  
-                                Activity Buffer Request        15.86%     228.735us        15.86%     228.735us     228.735us       1.312us         1.26%       1.312us       1.312us             1  
-                                    aten::empty_strided         2.19%      31.580us         2.19%      31.580us       5.263us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        13.29%     191.574us        13.29%     191.574us      31.929us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.47%      64.452us         5.67%      81.763us       3.407us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.20%      17.311us         1.20%      17.311us       0.721us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        17.08%     246.315us        17.08%     246.315us       5.132us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.38%       5.431us         0.38%       5.431us       5.431us       0.000us         0.00%       0.000us       0.000us             1  
+                                            aten::clone         1.39%      18.922us        38.11%     517.523us      86.254us       0.000us         0.00%       9.054us       1.509us             6  
+                                              aten::sub         2.65%      35.970us         4.36%      59.161us       9.860us       8.288us         7.95%       8.288us       1.381us             6  
+                                              aten::add         2.30%      31.190us         3.85%      52.221us       8.703us       8.160us         7.83%       8.160us       1.360us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.743us         7.43%       7.743us       1.290us             6  
+                                Activity Buffer Request        14.68%     199.344us        14.68%     199.344us     199.344us       1.311us         1.26%       1.311us       1.311us             1  
+                                    aten::empty_strided         2.13%      28.940us         2.13%      28.940us       4.823us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        17.49%     237.454us        17.49%     237.454us      39.576us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.56%      61.882us         5.80%      78.761us       3.282us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.24%      16.879us         1.24%      16.879us       0.703us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        15.11%     205.206us        15.11%     205.206us       4.275us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.39%       5.300us         0.39%       5.300us       5.300us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.442ms
-Self CUDA time total: 104.224us
+Self CPU time total: 1.358ms
+Self CUDA time total: 104.221us
 
 
 
@@ -4526,27 +4526,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S512_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.004ms       809.35%       1.004ms       1.004ms             1  
-                                            torch_eager        20.77%     293.259us        99.59%       1.406ms       1.406ms       0.000us         0.00%     125.785us     125.785us             1  
-                                              aten::mul        11.90%     168.115us        21.00%     296.496us      12.354us      65.470us        52.80%      65.470us       2.728us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      65.470us        52.80%      65.470us       2.728us            24  
-                                            aten::copy_         7.29%     102.893us        38.63%     545.534us      30.307us      39.326us        31.72%      41.117us       2.284us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.831us        23.25%      28.831us       2.403us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.198us        15.48%      19.198us       1.600us            12  
-                                            aten::clone         1.46%      20.620us        31.97%     451.491us      75.249us       0.000us         0.00%      12.286us       2.048us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.495us         8.46%      10.495us       1.749us             6  
-                                              aten::add         2.69%      38.013us         4.51%      63.752us      10.625us       9.599us         7.74%       9.599us       1.600us             6  
-                                              aten::sub         3.27%      46.223us         5.18%      73.182us      12.197us       9.599us         7.74%       9.599us       1.600us             6  
-                                Activity Buffer Request        12.70%     179.315us        12.70%     179.315us     179.315us       1.791us         1.44%       1.791us       1.791us             1  
-                                    aten::empty_strided         2.12%      29.941us         2.12%      29.941us       4.990us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        13.35%     188.535us        13.35%     188.535us      31.423us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.72%      66.700us         5.92%      83.651us       3.485us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.20%      16.951us         1.20%      16.951us       0.706us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        18.12%     255.870us        18.12%     255.870us       5.331us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.41%       5.720us         0.41%       5.720us       5.720us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     922.549us       744.77%     922.549us     922.549us             1  
+                                            torch_eager        21.60%     303.651us        99.57%       1.400ms       1.400ms       0.000us         0.00%     125.727us     125.727us             1  
+                                              aten::mul        10.95%     153.925us        18.62%     261.836us      10.910us      65.152us        52.60%      65.152us       2.715us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      65.152us        52.60%      65.152us       2.715us            24  
+                                            aten::copy_         6.95%      97.683us        42.26%     594.153us      33.008us      39.455us        31.85%      41.311us       2.295us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.863us        23.30%      28.863us       2.405us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.264us        15.55%      19.264us       1.605us            12  
+                                            aten::clone         1.38%      19.461us        36.95%     519.462us      86.577us       0.000us         0.00%      12.448us       2.075us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.592us         8.55%      10.592us       1.765us             6  
+                                              aten::add         2.18%      30.620us         3.65%      51.250us       8.542us       9.664us         7.80%       9.664us       1.611us             6  
+                                              aten::sub         2.57%      36.162us         4.29%      60.263us      10.044us       9.600us         7.75%       9.600us       1.600us             6  
+                                Activity Buffer Request        14.58%     204.944us        14.58%     204.944us     204.944us       1.856us         1.50%       1.856us       1.856us             1  
+                                    aten::empty_strided         2.07%      29.171us         2.07%      29.171us       4.862us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        16.64%     233.894us        16.64%     233.894us      38.982us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.45%      62.564us         5.70%      80.065us       3.336us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.24%      17.501us         1.24%      17.501us       0.729us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        14.96%     210.274us        14.96%     210.274us       4.381us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.43%       5.990us         0.43%       5.990us       5.990us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.412ms
-Self CUDA time total: 123.994us
+Self CPU time total: 1.406ms
+Self CUDA time total: 123.871us
 
 
 
@@ -4556,27 +4556,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S512_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.022ms       574.36%       1.022ms       1.022ms             1  
-                                            torch_eager         9.88%     300.789us        99.79%       3.040ms       3.040ms       0.000us         0.00%     180.741us     180.741us             1  
-                                              aten::mul         5.87%     178.699us        10.24%     311.746us      12.989us      95.331us        53.59%      95.331us       3.972us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      95.331us        53.59%      95.331us       3.972us            24  
-                                            aten::copy_         3.42%     104.263us        70.83%       2.158ms     119.861us      57.731us        32.45%      60.579us       3.366us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.803us        22.94%      40.803us       3.400us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.831us        13.96%      24.831us       2.069us            12  
-                                            aten::clone         0.72%      21.809us        67.85%       2.067ms     344.453us       0.000us         0.00%      19.776us       3.296us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.928us         9.52%      16.928us       2.821us             6  
-                                              aten::add         1.17%      35.582us         2.06%      62.693us      10.449us      12.480us         7.02%      12.480us       2.080us             6  
-                                              aten::sub         1.34%      40.961us         2.23%      67.853us      11.309us      12.351us         6.94%      12.351us       2.059us             6  
-                                Activity Buffer Request        58.86%       1.793ms        58.86%       1.793ms       1.793ms       2.848us         1.60%       2.848us       2.848us             1  
-                                    aten::empty_strided         1.00%      30.461us         1.00%      30.461us       5.077us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         6.14%     187.135us         6.14%     187.135us      31.189us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.22%      67.750us         2.85%      86.690us       3.612us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.62%      18.940us         0.62%      18.940us       0.789us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.55%     260.511us         8.55%     260.511us       5.427us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.21%       6.321us         0.21%       6.321us       6.321us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     897.539us       506.00%     897.539us     897.539us             1  
+                                            torch_eager         9.74%     279.839us        99.80%       2.869ms       2.869ms       0.000us         0.00%     180.226us     180.226us             1  
+                                              aten::mul         5.31%     152.543us         9.01%     258.914us      10.788us      94.688us        53.38%      94.688us       3.945us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      94.688us        53.38%      94.688us       3.945us            24  
+                                            aten::copy_         3.38%      97.055us        72.52%       2.084ms     115.804us      57.633us        32.49%      60.481us       3.360us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.641us        22.91%      40.641us       3.387us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      25.057us        14.13%      25.057us       2.088us            12  
+                                            aten::clone         0.77%      22.210us        70.07%       2.014ms     335.666us       0.000us         0.00%      19.840us       3.307us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.992us         9.58%      16.992us       2.832us             6  
+                                              aten::add         1.09%      31.359us         1.84%      52.920us       8.820us      12.577us         7.09%      12.577us       2.096us             6  
+                                              aten::sub         1.28%      36.869us         2.08%      59.850us       9.975us      12.480us         7.04%      12.480us       2.080us             6  
+                                Activity Buffer Request        59.04%       1.697ms        59.04%       1.697ms       1.697ms       2.848us         1.61%       2.848us       2.848us             1  
+                                    aten::empty_strided         1.03%      29.731us         1.03%      29.731us       4.955us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         8.09%     232.664us         8.09%     232.664us      38.777us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.21%      63.552us         2.81%      80.641us       3.360us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.59%      17.089us         0.59%      17.089us       0.712us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         7.26%     208.724us         7.26%     208.724us       4.348us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.20%       5.620us         0.20%       5.620us       5.620us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 3.046ms
-Self CUDA time total: 177.893us
+Self CPU time total: 2.874ms
+Self CUDA time total: 177.378us
 
 
 
@@ -4586,27 +4586,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S512_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.032ms       346.83%       1.032ms       1.032ms             1  
-                                            torch_eager        10.94%     330.454us        99.81%       3.014ms       3.014ms       0.000us         0.00%     315.361us     315.361us             1  
-                                              aten::mul         5.75%     173.643us        10.12%     305.657us      12.736us     145.696us        48.95%     145.696us       6.071us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     145.696us        48.95%     145.696us       6.071us            24  
-                                            aten::copy_         3.49%     105.289us        69.61%       2.102ms     116.764us     110.818us        37.23%     128.546us       7.141us            18  
-                                            aten::clone         0.91%      27.529us        66.74%       2.015ms     335.845us       0.000us         0.00%      71.233us      11.872us             6  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.313us        19.26%      57.313us       4.776us            12  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      53.505us        17.98%      53.505us       8.918us             6  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      41.119us        13.82%      41.119us       3.427us            12  
-                                              aten::sub         1.33%      40.040us         2.29%      68.991us      11.499us      20.607us         6.92%      20.607us       3.434us             6  
-                                              aten::add         1.18%      35.771us         2.04%      61.541us      10.257us      20.512us         6.89%      20.512us       3.419us             6  
-                                Activity Buffer Request        56.83%       1.716ms        56.83%       1.716ms       1.716ms      17.728us         5.96%      17.728us      17.728us             1  
-                                    aten::empty_strided         1.04%      31.352us         1.04%      31.352us       5.225us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         6.87%     207.436us         6.87%     207.436us      34.573us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.27%      68.592us         2.86%      86.271us       3.595us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.59%      17.679us         0.59%      17.679us       0.737us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.61%     259.847us         8.61%     259.847us       5.413us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.19%       5.690us         0.19%       5.690us       5.690us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     898.810us       301.67%     898.810us     898.810us             1  
+                                            torch_eager         9.55%     282.366us        99.82%       2.951ms       2.951ms       0.000us         0.00%     315.933us     315.933us             1  
+                                              aten::mul         5.12%     151.293us         8.70%     257.314us      10.721us     145.309us        48.77%     145.309us       6.055us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     145.309us        48.77%     145.309us       6.055us            24  
+                                            aten::copy_         3.22%      95.316us        73.31%       2.167ms     120.400us     111.488us        37.42%     129.472us       7.193us            18  
+                                            aten::clone         0.72%      21.328us        70.93%       2.097ms     349.496us       0.000us         0.00%      72.192us      12.032us             6  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.280us        19.22%      57.280us       4.773us            12  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      54.208us        18.19%      54.208us       9.035us             6  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      41.152us        13.81%      41.152us       3.429us            12  
+                                              aten::sub         1.21%      35.903us         1.98%      58.432us       9.739us      20.607us         6.92%      20.607us       3.434us             6  
+                                              aten::add         1.05%      31.001us         1.75%      51.651us       8.608us      20.545us         6.90%      20.545us       3.424us             6  
+                                Activity Buffer Request        60.43%       1.786ms        60.43%       1.786ms       1.786ms      17.984us         6.04%      17.984us      17.984us             1  
+                                    aten::empty_strided         1.00%      29.601us         1.00%      29.601us       4.934us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         7.72%     228.244us         7.72%     228.244us      38.041us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.20%      64.933us         2.81%      83.072us       3.461us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.61%      18.139us         0.61%      18.139us       0.756us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         6.98%     206.421us         6.98%     206.421us       4.300us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.18%       5.270us         0.18%       5.270us       5.270us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 3.019ms
-Self CUDA time total: 297.633us
+Self CPU time total: 2.956ms
+Self CUDA time total: 297.949us
 
 
 
@@ -4616,27 +4616,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S2048_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     997.224us       560.81%     997.224us     997.224us             1  
-                                            torch_eager        20.41%     294.666us        99.60%       1.438ms       1.438ms       0.000us         0.00%     180.699us     180.699us             1  
-                                              aten::mul        11.81%     170.426us        20.82%     300.510us      12.521us      95.263us        53.57%      95.263us       3.969us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      95.263us        53.57%      95.263us       3.969us            24  
-                                            aten::copy_         6.98%     100.734us        40.22%     580.546us      32.253us      57.725us        32.46%      60.604us       3.367us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.895us        23.00%      40.895us       3.408us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.832us        13.96%      24.832us       2.069us            12  
-                                            aten::clone         1.48%      21.402us        34.03%     491.202us      81.867us       0.000us         0.00%      19.709us       3.285us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.830us         9.46%      16.830us       2.805us             6  
-                                              aten::sub         2.72%      39.260us         4.81%      69.410us      11.568us      12.416us         6.98%      12.416us       2.069us             6  
-                                              aten::add         2.46%      35.491us         4.17%      60.132us      10.022us      12.416us         6.98%      12.416us       2.069us             6  
-                                Activity Buffer Request        14.76%     213.066us        14.76%     213.066us     213.066us       2.879us         1.62%       2.879us       2.879us             1  
-                                    aten::empty_strided         2.03%      29.329us         2.03%      29.329us       4.888us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        13.55%     195.534us        13.55%     195.534us      32.589us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.52%      65.271us         5.66%      81.651us       3.402us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.13%      16.380us         1.13%      16.380us       0.683us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        17.74%     256.087us        17.74%     256.087us       5.335us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.40%       5.801us         0.40%       5.801us       5.801us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     892.604us       503.24%     892.604us     892.604us             1  
+                                            torch_eager        19.40%     275.276us        99.62%       1.413ms       1.413ms       0.000us         0.00%     180.252us     180.252us             1  
+                                              aten::mul        10.76%     152.603us        18.14%     257.417us      10.726us      94.815us        53.46%      94.815us       3.951us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      94.815us        53.46%      94.815us       3.951us            24  
+                                            aten::copy_         6.67%      94.615us        44.72%     634.467us      35.248us      57.598us        32.47%      60.478us       3.360us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.510us        22.84%      40.510us       3.376us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.959us        14.07%      24.959us       2.080us            12  
+                                            aten::clone         1.36%      19.279us        39.62%     562.091us      93.682us       0.000us         0.00%      19.968us       3.328us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.088us         9.63%      17.088us       2.848us             6  
+                                              aten::add         2.18%      30.992us         4.25%      60.342us      10.057us      12.607us         7.11%      12.607us       2.101us             6  
+                                              aten::sub         2.49%      35.261us         4.05%      57.481us       9.580us      12.352us         6.96%      12.352us       2.059us             6  
+                                Activity Buffer Request        18.10%     256.816us        18.10%     256.816us     256.816us       2.880us         1.62%       2.880us       2.880us             1  
+                                    aten::empty_strided         2.07%      29.380us         2.07%      29.380us       4.897us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        15.91%     225.774us        15.91%     225.774us      37.629us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.38%      62.100us         5.62%      79.779us       3.324us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.25%      17.679us         1.25%      17.679us       0.737us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        15.06%     213.646us        15.06%     213.646us       4.451us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.38%       5.461us         0.38%       5.461us       5.461us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.443ms
-Self CUDA time total: 177.820us
+Self CPU time total: 1.419ms
+Self CUDA time total: 177.372us
 
 
 
@@ -4646,27 +4646,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S2048_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.006ms       337.57%       1.006ms       1.006ms             1  
-                                            torch_eager        19.59%     286.301us        99.64%       1.456ms       1.456ms       0.000us         0.00%     316.065us     316.065us             1  
-                                              aten::mul        11.95%     174.684us        20.79%     303.846us      12.660us     145.439us        48.82%     145.439us       6.060us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     145.439us        48.82%     145.439us       6.060us            24  
-                                            aten::copy_         7.17%     104.791us        40.98%     598.785us      33.266us     111.649us        37.48%     129.826us       7.213us            18  
-                                            aten::clone         1.35%      19.700us        34.46%     503.492us      83.915us       0.000us         0.00%      72.450us      12.075us             6  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.376us        19.26%      57.376us       4.781us            12  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      54.273us        18.22%      54.273us       9.045us             6  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      40.800us        13.70%      40.800us       3.400us            12  
-                                              aten::sub         2.64%      38.590us         4.63%      67.691us      11.282us      20.448us         6.86%      20.448us       3.408us             6  
-                                              aten::add         2.53%      37.009us         4.42%      64.522us      10.754us      20.352us         6.83%      20.352us       3.392us             6  
-                                Activity Buffer Request        15.77%     230.486us        15.77%     230.486us     230.486us      18.177us         6.10%      18.177us      18.177us             1  
-                                    aten::empty_strided         2.02%      29.450us         2.02%      29.450us       4.908us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        12.98%     189.676us        12.98%     189.676us      31.613us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.60%      67.290us         5.86%      85.691us       3.570us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.26%      18.401us         1.26%      18.401us       0.767us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        17.77%     259.608us        17.77%     259.608us       5.409us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.36%       5.300us         0.36%       5.300us       5.300us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     993.373us       332.44%     993.373us     993.373us             1  
+                                            torch_eager        19.67%     287.571us        99.61%       1.456ms       1.456ms       0.000us         0.00%     317.118us     317.118us             1  
+                                              aten::mul        10.73%     156.833us        18.24%     266.665us      11.111us     145.536us        48.70%     145.536us       6.064us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     145.536us        48.70%     145.536us       6.064us            24  
+                                            aten::copy_         6.97%     101.849us        44.88%     655.991us      36.444us     112.414us        37.62%     130.718us       7.262us            18  
+                                            aten::clone         1.31%      19.190us        34.67%     506.790us      84.465us       0.000us         0.00%      73.472us      12.245us             6  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.246us        19.16%      57.246us       4.770us            12  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      55.168us        18.46%      55.168us       9.195us             6  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      40.864us        13.68%      40.864us       3.405us            12  
+                                              aten::sub         2.57%      37.523us         4.19%      61.312us      10.219us      20.447us         6.84%      20.447us       3.408us             6  
+                                              aten::add         2.11%      30.781us         3.53%      51.651us       8.608us      20.417us         6.83%      20.417us       3.403us             6  
+                                Activity Buffer Request        13.85%     202.395us        13.85%     202.395us     202.395us      18.304us         6.13%      18.304us      18.304us             1  
+                                    aten::empty_strided         2.07%      30.250us         2.07%      30.250us       5.042us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        15.36%     224.584us        15.36%     224.584us      37.431us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.45%      65.071us         5.71%      83.402us       3.475us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.25%      18.331us         1.25%      18.331us       0.764us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        19.27%     281.654us        19.27%     281.654us       5.868us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.39%       5.741us         0.39%       5.741us       5.741us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.461ms
-Self CUDA time total: 297.888us
+Self CPU time total: 1.462ms
+Self CUDA time total: 298.814us
 
 
 
@@ -4676,27 +4676,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S2048_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.028ms       173.98%       1.028ms       1.028ms             1  
-                                            torch_eager        20.54%     307.941us        99.60%       1.493ms       1.493ms       0.000us         0.00%     614.720us     614.720us             1  
-                                            aten::copy_         6.96%     104.343us        39.80%     596.653us      33.147us     277.825us        47.01%     301.537us      16.752us            18  
-                                              aten::mul        11.79%     176.823us        20.94%     313.949us      13.081us     247.103us        41.81%     247.103us      10.296us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     247.103us        41.81%     247.103us      10.296us            24  
-                                            aten::clone         1.37%      20.599us        33.38%     500.402us      83.400us       0.000us         0.00%     210.816us      35.136us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     187.104us        31.66%     187.104us      31.184us             6  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      90.721us        15.35%      90.721us       7.560us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      66.080us        11.18%      66.080us       5.507us            12  
-                                              aten::add         2.50%      37.532us         4.42%      66.263us      11.044us      33.056us         5.59%      33.056us       5.509us             6  
-                                              aten::sub         2.78%      41.622us         4.80%      72.031us      12.005us      33.024us         5.59%      33.024us       5.504us             6  
-                                Activity Buffer Request        15.35%     230.085us        15.35%     230.085us     230.085us      23.712us         4.01%      23.712us      23.712us             1  
-                                    aten::empty_strided         1.95%      29.191us         1.95%      29.191us       4.865us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        12.52%     187.674us        12.52%     187.674us      31.279us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.56%      68.431us         5.78%      86.660us       3.611us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.22%      18.229us         1.22%      18.229us       0.760us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        18.06%     270.817us        18.06%     270.817us       5.642us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.40%       6.020us         0.40%       6.020us       6.020us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     897.247us       151.96%     897.247us     897.247us             1  
+                                            torch_eager        20.14%     274.634us        99.62%       1.358ms       1.358ms       0.000us         0.00%     614.305us     614.305us             1  
+                                            aten::copy_         6.96%      94.863us        42.61%     580.932us      32.274us     278.975us        47.25%     302.847us      16.825us            18  
+                                              aten::mul        11.17%     152.301us        19.13%     260.794us      10.866us     245.758us        41.62%     245.758us      10.240us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     245.758us        41.62%     245.758us      10.240us            24  
+                                            aten::clone         1.47%      20.049us        37.19%     507.071us      84.512us       0.000us         0.00%     212.225us      35.371us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     188.353us        31.90%     188.353us      31.392us             6  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      90.622us        15.35%      90.622us       7.552us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      65.700us        11.13%      65.700us       5.475us            12  
+                                              aten::add         2.26%      30.880us         3.85%      52.533us       8.756us      32.930us         5.58%      32.930us       5.488us             6  
+                                              aten::sub         2.73%      37.222us         4.45%      60.643us      10.107us      32.770us         5.55%      32.770us       5.462us             6  
+                                Activity Buffer Request        15.15%     206.524us        15.15%     206.524us     206.524us      23.872us         4.04%      23.872us      23.872us             1  
+                                    aten::empty_strided         2.13%      29.022us         2.13%      29.022us       4.837us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        16.16%     220.325us        16.16%     220.325us      36.721us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.58%      62.481us         5.83%      79.533us       3.314us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.25%      17.052us         1.25%      17.052us       0.711us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        15.61%     212.787us        15.61%     212.787us       4.433us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.38%       5.230us         0.38%       5.230us       5.230us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.499ms
-Self CUDA time total: 591.008us
+Self CPU time total: 1.363ms
+Self CUDA time total: 590.433us
 
 
 
@@ -4706,59 +4706,59 @@ PROFILE TRACE: torch_eager | cuda_B2_S2048_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager        13.10%     290.757us        64.42%       1.430ms       1.430ms       0.000us         0.00%       1.858ms       1.858ms             1  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.829ms       102.08%       1.829ms       1.829ms             1  
-                                            aten::copy_         4.50%      99.996us        26.00%     577.207us      32.067us     806.683us        45.02%     872.922us      48.496us            18  
-                                              aten::mul         7.46%     165.620us        13.35%     296.315us      12.346us     837.369us        46.73%     837.369us      34.890us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     837.369us        46.73%     837.369us      34.890us            24  
-                                            aten::clone         0.93%      20.729us        21.91%     486.420us      81.070us       0.000us         0.00%     620.507us     103.418us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     554.268us        30.93%     554.268us      92.378us             6  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     252.415us        14.09%     252.415us      21.035us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     147.935us         8.26%     147.935us      12.328us            12  
-                                              aten::sub         1.77%      39.361us         2.98%      66.203us      11.034us      89.536us         5.00%      89.536us      14.923us             6  
-                                Activity Buffer Request         8.80%     195.395us         8.80%     195.395us     195.395us      66.239us         3.70%      66.239us      66.239us             1  
-                                              aten::add         1.55%      34.513us         2.81%      62.423us      10.404us      58.399us         3.26%      58.399us       9.733us             6  
-                                    aten::empty_strided         1.34%      29.799us         1.34%      29.799us       4.967us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         9.39%     208.344us         9.39%     208.344us      34.724us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         3.12%      69.193us         3.90%      86.553us       3.606us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.78%      17.360us         0.78%      17.360us       0.723us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        11.66%     258.919us        11.66%     258.919us       5.394us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize        35.58%     789.949us        35.58%     789.949us     789.949us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager        12.28%     274.098us        60.28%       1.346ms       1.346ms       0.000us         0.00%       1.856ms       1.856ms             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.827ms       102.10%       1.827ms       1.827ms             1  
+                                            aten::copy_         4.28%      95.634us        25.22%     562.931us      31.274us     804.864us        44.98%     871.488us      48.416us            18  
+                                              aten::mul         6.88%     153.564us        11.78%     262.937us      10.956us     837.983us        46.83%     837.983us      34.916us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     837.983us        46.83%     837.983us      34.916us            24  
+                                            aten::clone         0.88%      19.740us        22.00%     491.160us      81.860us       0.000us         0.00%     619.552us     103.259us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     552.928us        30.90%     552.928us      92.155us             6  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     251.936us        14.08%     251.936us      20.995us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     146.496us         8.19%     146.496us      12.208us            12  
+                                              aten::sub         1.66%      37.140us         2.70%      60.281us      10.047us      88.032us         4.92%      88.032us      14.672us             6  
+                                Activity Buffer Request         8.33%     186.034us         8.33%     186.034us     186.034us      66.624us         3.72%      66.624us      66.624us             1  
+                                              aten::add         1.41%      31.471us         2.39%      53.240us       8.873us      58.464us         3.27%      58.464us       9.744us             6  
+                                    aten::empty_strided         1.36%      30.441us         1.36%      30.441us       5.074us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        10.01%     223.534us        10.01%     223.534us      37.256us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.89%      64.468us         3.67%      81.941us       3.414us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.78%      17.473us         0.78%      17.473us       0.728us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         9.50%     212.012us         9.50%     212.012us       4.417us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize        39.72%     886.609us        39.72%     886.609us     886.609us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.220ms
-Self CUDA time total: 1.792ms
+Self CPU time total: 2.232ms
+Self CUDA time total: 1.789ms
 
 
 impl                     wl                  p50(ms)  ok
-torch_eager              cuda_B1_S128_H32_D128_R64     0.27  True
-torch_eager              cuda_B1_S128_H32_D64_R32     0.25  True
-torch_eager              cuda_B1_S128_H8_D128_R64     0.25  True
-torch_eager              cuda_B1_S128_H8_D64_R32     0.19  True
-torch_eager              cuda_B1_S2048_H32_D128_R64     0.25  True
-torch_eager              cuda_B1_S2048_H32_D64_R32     0.25  True
-torch_eager              cuda_B1_S2048_H8_D128_R64     0.24  True
-torch_eager              cuda_B1_S2048_H8_D64_R32     0.25  True
-torch_eager              cuda_B1_S512_H32_D128_R64     0.24  True
-torch_eager              cuda_B1_S512_H32_D64_R32     0.24  True
-torch_eager              cuda_B1_S512_H8_D128_R64     0.24  True
-torch_eager              cuda_B1_S512_H8_D64_R32     0.25  True
-torch_eager              cuda_B2_S128_H32_D128_R64     0.24  True
-torch_eager              cuda_B2_S128_H32_D64_R32     0.25  True
-torch_eager              cuda_B2_S128_H8_D128_R64     0.25  True
-torch_eager              cuda_B2_S128_H8_D64_R32     0.24  True
-torch_eager              cuda_B2_S2048_H32_D128_R64     0.65  True
-torch_eager              cuda_B2_S2048_H32_D64_R32     0.25  True
-torch_eager              cuda_B2_S2048_H8_D128_R64     0.25  True
-torch_eager              cuda_B2_S2048_H8_D64_R32     0.24  True
-torch_eager              cuda_B2_S512_H32_D128_R64     0.25  True
-torch_eager              cuda_B2_S512_H32_D64_R32     0.24  True
-torch_eager              cuda_B2_S512_H8_D128_R64     0.24  True
-torch_eager              cuda_B2_S512_H8_D64_R32     0.24  True
+torch_eager              cuda_B1_S128_H32_D128_R64     0.22  True
+torch_eager              cuda_B1_S128_H32_D64_R32     0.22  True
+torch_eager              cuda_B1_S128_H8_D128_R64     0.22  True
+torch_eager              cuda_B1_S128_H8_D64_R32     0.17  True
+torch_eager              cuda_B1_S2048_H32_D128_R64     0.23  True
+torch_eager              cuda_B1_S2048_H32_D64_R32     0.22  True
+torch_eager              cuda_B1_S2048_H8_D128_R64     0.22  True
+torch_eager              cuda_B1_S2048_H8_D64_R32     0.22  True
+torch_eager              cuda_B1_S512_H32_D128_R64     0.22  True
+torch_eager              cuda_B1_S512_H32_D64_R32     0.21  True
+torch_eager              cuda_B1_S512_H8_D128_R64     0.21  True
+torch_eager              cuda_B1_S512_H8_D64_R32     0.22  True
+torch_eager              cuda_B2_S128_H32_D128_R64     0.21  True
+torch_eager              cuda_B2_S128_H32_D64_R32     0.21  True
+torch_eager              cuda_B2_S128_H8_D128_R64     0.22  True
+torch_eager              cuda_B2_S128_H8_D64_R32     0.21  True
+torch_eager              cuda_B2_S2048_H32_D128_R64     0.64  True
+torch_eager              cuda_B2_S2048_H32_D64_R32     0.23  True
+torch_eager              cuda_B2_S2048_H8_D128_R64     0.22  True
+torch_eager              cuda_B2_S2048_H8_D64_R32     0.22  True
+torch_eager              cuda_B2_S512_H32_D128_R64     0.22  True
+torch_eager              cuda_B2_S512_H32_D64_R32     0.22  True
+torch_eager              cuda_B2_S512_H8_D128_R64     0.21  True
+torch_eager              cuda_B2_S512_H8_D64_R32     0.21  True
 
▶ UV Install Logs