diff --git "a/rotary/impls/torch_rotary.html" "b/rotary/impls/torch_rotary.html" --- "a/rotary/impls/torch_rotary.html" +++ "b/rotary/impls/torch_rotary.html" @@ -3904,7 +3904,7 @@ Cell: nv | 0.25s
-
Fri Dec 19 18:55:27 2025       
+
Fri Dec 19 19:54:55 2025       
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 580.105.08             Driver Version: 580.105.08     CUDA Version: 13.0     |
 +-----------------------------------------+------------------------+----------------------+
@@ -3913,7 +3913,7 @@ Cell: nv | 0.25s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   28C    P0             85W /  350W |       0MiB /  46068MiB |     34%      Default |
+| N/A   36C    P0             90W /  350W |       0MiB /  46068MiB |     17%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -3935,9 +3935,9 @@ Cell: nv | 0.25s
 
 ▼ code 
 ▼ output
- ▶ uv-logs
+ ▶ uv-logs
  | 
-Cell: benchmark | 4.08s
+Cell: benchmark | 7.93s
  | 
 
 Raw
@@ -4016,27 +4016,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S128_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     975.966us      1092.36%     975.966us     975.966us             1  
-                                            torch_eager        12.28%     354.583us        99.52%       2.875ms       2.875ms       0.000us         0.00%      90.561us      90.561us             1  
-                                              aten::mul         5.61%     162.144us         9.55%     275.765us      11.490us      47.009us        52.62%      47.009us       1.959us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      47.009us        52.62%      47.009us       1.959us            24  
-                                            aten::copy_         3.85%     111.174us        67.17%       1.940ms     107.795us      29.153us        32.63%      30.369us       1.687us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.337us        25.00%      22.337us       1.861us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.183us        14.76%      13.183us       1.099us            12  
-                                            aten::clone         1.24%      35.691us        65.68%       1.897ms     316.212us       0.000us         0.00%       8.032us       1.339us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.816us         7.63%       6.816us       1.136us             6  
-                                              aten::sub         1.48%      42.821us         2.36%      68.262us      11.377us       6.592us         7.38%       6.592us       1.099us             6  
-                                              aten::add         1.19%      34.391us         1.96%      56.751us       9.459us       6.591us         7.38%       6.591us       1.098us             6  
-                                Activity Buffer Request        58.57%       1.692ms        58.57%       1.692ms       1.692ms       1.216us         1.36%       1.216us       1.216us             1  
-                                    aten::empty_strided         1.90%      54.851us         1.90%      54.851us       9.142us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         2.58%      74.440us         2.58%      74.440us      12.407us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.39%      69.079us         3.06%      88.511us       3.688us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.67%      19.432us         0.67%      19.432us       0.810us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         7.77%     224.384us         7.77%     224.384us       4.675us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.48%      13.900us         0.48%      13.900us      13.900us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.121ms      1256.65%       1.121ms       1.121ms             1  
+                                            torch_eager        12.91%     396.632us        99.49%       3.057ms       3.057ms       0.000us         0.00%      90.428us      90.428us             1  
+                                              aten::mul         6.11%     187.745us        10.64%     326.977us      13.624us      47.168us        52.87%      47.168us       1.965us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      47.168us        52.87%      47.168us       1.965us            24  
+                                            aten::copy_         3.73%     114.507us        64.51%       1.982ms     110.114us      28.957us        32.46%      30.173us       1.676us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.335us        25.04%      22.335us       1.861us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.087us        14.67%      13.087us       1.091us            12  
+                                            aten::clone         1.26%      38.732us        62.92%       1.933ms     322.183us       0.000us         0.00%       7.838us       1.306us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.622us         7.42%       6.622us       1.104us             6  
+                                              aten::sub         1.53%      47.130us         2.58%      79.281us      13.214us       6.559us         7.35%       6.559us       1.093us             6  
+                                              aten::add         1.27%      38.901us         2.21%      67.791us      11.299us       6.528us         7.32%       6.528us       1.088us             6  
+                                Activity Buffer Request        55.82%       1.715ms        55.82%       1.715ms       1.715ms       1.216us         1.36%       1.216us       1.216us             1  
+                                    aten::empty_strided         2.00%      61.422us         2.00%      61.422us      10.237us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         2.49%      76.434us         2.49%      76.434us      12.739us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.64%      81.233us         3.38%     103.763us       4.323us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.73%      22.530us         0.73%      22.530us       0.939us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         9.00%     276.368us         9.00%     276.368us       5.758us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.51%      15.650us         0.51%      15.650us      15.650us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.889ms
-Self CUDA time total: 89.345us
+Self CPU time total: 3.072ms
+Self CUDA time total: 89.212us
 
 
 
@@ -4046,27 +4046,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S128_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     912.155us      1011.53%     912.155us     912.155us             1  
-                                            torch_eager        10.76%     301.514us        99.79%       2.795ms       2.795ms       0.000us         0.00%      91.296us      91.296us             1  
-                                              aten::mul         5.37%     150.316us         9.21%     258.115us      10.755us      47.452us        52.62%      47.452us       1.977us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      47.452us        52.62%      47.452us       1.977us            24  
-                                            aten::copy_         3.63%     101.610us        71.00%       1.989ms     110.503us      29.347us        32.54%      30.467us       1.693us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.595us        25.06%      22.595us       1.883us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.377us        14.83%      13.377us       1.115us            12  
-                                            aten::clone         0.94%      26.442us        68.40%       1.916ms     319.344us       0.000us         0.00%       7.872us       1.312us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.752us         7.49%       6.752us       1.125us             6  
-                                              aten::add         1.08%      30.320us         1.84%      51.590us       8.598us       6.689us         7.42%       6.689us       1.115us             6  
-                                              aten::sub         1.25%      34.880us         2.05%      57.350us       9.558us       6.688us         7.42%       6.688us       1.115us             6  
-                                Activity Buffer Request        62.97%       1.764ms        62.97%       1.764ms       1.764ms       1.120us         1.24%       1.120us       1.120us             1  
-                                    aten::empty_strided         1.15%      32.200us         1.15%      32.200us       5.367us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         2.15%      60.121us         2.15%      60.121us      10.020us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.19%      61.232us         2.82%      79.123us       3.297us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.64%      17.891us         0.64%      17.891us       0.745us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         7.67%     214.771us         7.67%     214.771us       4.474us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.21%       5.950us         0.21%       5.950us       5.950us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.007ms      1113.49%       1.007ms       1.007ms             1  
+                                            torch_eager        10.54%     298.489us        99.78%       2.827ms       2.827ms       0.000us         0.00%      91.582us      91.582us             1  
+                                              aten::mul         6.23%     176.544us        10.66%     301.947us      12.581us      47.549us        52.58%      47.549us       1.981us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      47.549us        52.58%      47.549us       1.981us            24  
+                                            aten::copy_         3.68%     104.131us        68.91%       1.952ms     108.446us      29.376us        32.49%      30.529us       1.696us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.591us        24.98%      22.591us       1.883us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.504us        14.93%      13.504us       1.125us            12  
+                                            aten::clone         0.77%      21.889us        65.76%       1.863ms     310.451us       0.000us         0.00%       7.938us       1.323us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.785us         7.50%       6.785us       1.131us             6  
+                                              aten::sub         1.41%      40.059us         2.39%      67.780us      11.297us       6.784us         7.50%       6.784us       1.131us             6  
+                                              aten::add         1.28%      36.151us         2.32%      65.853us      10.975us       6.720us         7.43%       6.720us       1.120us             6  
+                                Activity Buffer Request        60.61%       1.717ms        60.61%       1.717ms       1.717ms       1.153us         1.28%       1.153us       1.153us             1  
+                                    aten::empty_strided         1.12%      31.742us         1.12%      31.742us       5.290us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         2.04%      57.701us         2.04%      57.701us       9.617us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.45%      69.273us         3.06%      86.813us       3.617us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.62%      17.540us         0.62%      17.540us       0.731us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         9.05%     256.228us         9.05%     256.228us       5.338us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.22%       6.150us         0.22%       6.150us       6.150us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.801ms
-Self CUDA time total: 90.176us
+Self CPU time total: 2.833ms
+Self CUDA time total: 90.429us
 
 
 
@@ -4076,27 +4076,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S128_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     913.022us       972.45%     913.022us     913.022us             1  
-                                            torch_eager        10.98%     301.066us        99.80%       2.737ms       2.737ms       0.000us         0.00%      95.200us      95.200us             1  
-                                              aten::mul         5.60%     153.463us         9.59%     262.853us      10.952us      48.737us        51.91%      48.737us       2.031us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      48.737us        51.91%      48.737us       2.031us            24  
-                                            aten::copy_         3.66%     100.372us        70.16%       1.924ms     106.884us      30.688us        32.69%      31.999us       1.778us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.912us        24.40%      22.912us       1.909us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.464us        15.41%      14.464us       1.205us            12  
-                                            aten::clone         0.95%      26.091us        67.47%       1.850ms     308.365us       0.000us         0.00%       9.087us       1.514us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.776us         8.28%       7.776us       1.296us             6  
-                                              aten::add         1.14%      31.200us         1.93%      53.020us       8.837us       7.264us         7.74%       7.264us       1.211us             6  
-                                              aten::sub         1.31%      35.942us         2.13%      58.462us       9.744us       7.200us         7.67%       7.200us       1.200us             6  
-                                Activity Buffer Request        62.12%       1.703ms        62.12%       1.703ms       1.703ms       1.311us         1.40%       1.311us       1.311us             1  
-                                    aten::empty_strided         1.13%      31.120us         1.13%      31.120us       5.187us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         2.07%      56.870us         2.07%      56.870us       9.478us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.29%      62.820us         2.92%      80.090us       3.337us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.63%      17.270us         0.63%      17.270us       0.720us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         7.91%     217.003us         7.91%     217.003us       4.521us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.20%       5.600us         0.20%       5.600us       5.600us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.006ms      1072.44%       1.006ms       1.006ms             1  
+                                            torch_eager        10.52%     298.395us        99.78%       2.831ms       2.831ms       0.000us         0.00%      95.037us      95.037us             1  
+                                              aten::mul         6.11%     173.334us        10.70%     303.588us      12.649us      48.703us        51.91%      48.703us       2.029us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      48.703us        51.91%      48.703us       2.029us            24  
+                                            aten::copy_         3.65%     103.459us        69.19%       1.963ms     109.044us      30.654us        32.67%      31.870us       1.771us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.975us        24.49%      22.975us       1.915us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.464us        15.42%      14.464us       1.205us            12  
+                                            aten::clone         0.76%      21.692us        65.97%       1.871ms     311.909us       0.000us         0.00%       8.895us       1.482us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.679us         8.18%       7.679us       1.280us             6  
+                                              aten::add         1.20%      34.161us         2.12%      60.242us      10.040us       7.233us         7.71%       7.233us       1.206us             6  
+                                              aten::sub         1.38%      39.061us         2.38%      67.492us      11.249us       7.231us         7.71%       7.231us       1.205us             6  
+                                Activity Buffer Request        60.90%       1.728ms        60.90%       1.728ms       1.728ms       1.216us         1.30%       1.216us       1.216us             1  
+                                    aten::empty_strided         1.08%      30.530us         1.08%      30.530us       5.088us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         2.06%      58.531us         2.06%      58.531us       9.755us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.41%      68.445us         3.03%      85.834us       3.576us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.61%      17.389us         0.61%      17.389us       0.725us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         9.10%     258.010us         9.10%     258.010us       5.375us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.22%       6.240us         0.22%       6.240us       6.240us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.742ms
-Self CUDA time total: 93.889us
+Self CPU time total: 2.837ms
+Self CUDA time total: 93.821us
 
 
 
@@ -4106,27 +4106,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S128_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     939.645us       923.40%     939.645us     939.645us             1  
-                                            torch_eager        10.25%     301.966us        99.82%       2.940ms       2.940ms       0.000us         0.00%     103.039us     103.039us             1  
-                                              aten::mul         5.17%     152.369us         8.89%     261.782us      10.908us      52.831us        51.92%      52.831us       2.201us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      52.831us        51.92%      52.831us       2.201us            24  
-                                            aten::copy_         3.48%     102.582us        71.53%       2.107ms     117.036us      32.481us        31.92%      33.761us       1.876us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.737us        24.31%      24.737us       2.061us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      16.447us        16.16%      16.447us       1.371us            12  
-                                            aten::clone         0.93%      27.431us        68.93%       2.030ms     338.362us       0.000us         0.00%       9.024us       1.504us             6  
-                                              aten::sub         1.21%      35.711us         2.01%      59.102us       9.850us       8.319us         8.18%       8.319us       1.387us             6  
-                                              aten::add         1.06%      31.350us         1.79%      52.800us       8.800us       8.128us         7.99%       8.128us       1.355us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.744us         7.61%       7.744us       1.291us             6  
-                                Activity Buffer Request        57.63%       1.697ms        57.63%       1.697ms       1.697ms       1.280us         1.26%       1.280us       1.280us             1  
-                                    aten::empty_strided         1.03%      30.229us         1.03%      30.229us       5.038us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         8.25%     243.123us         8.25%     243.123us      40.520us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.79%      82.098us         3.39%      99.942us       4.164us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.61%      17.844us         0.61%      17.844us       0.743us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         7.40%     217.957us         7.40%     217.957us       4.541us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.18%       5.350us         0.18%       5.350us       5.350us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.008ms       991.12%       1.008ms       1.008ms             1  
+                                            torch_eager        11.39%     291.220us        99.77%       2.552ms       2.552ms       0.000us         0.00%     103.069us     103.069us             1  
+                                              aten::mul         6.75%     172.580us        11.78%     301.232us      12.551us      52.797us        51.90%      52.797us       2.200us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      52.797us        51.90%      52.797us       2.200us            24  
+                                            aten::copy_         3.99%     102.084us        65.85%       1.684ms      93.565us      32.544us        31.99%      33.888us       1.883us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.736us        24.32%      24.736us       2.061us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      16.384us        16.11%      16.384us       1.365us            12  
+                                            aten::clone         0.85%      21.760us        62.39%       1.596ms     265.957us       0.000us         0.00%       9.152us       1.525us             6  
+                                              aten::sub         1.61%      41.170us         2.75%      70.342us      11.724us       8.256us         8.12%       8.256us       1.376us             6  
+                                              aten::add         1.37%      35.121us         2.54%      64.851us      10.809us       8.128us         7.99%       8.128us       1.355us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.808us         7.68%       7.808us       1.301us             6  
+                                Activity Buffer Request        49.27%       1.260ms        49.27%       1.260ms       1.260ms       1.344us         1.32%       1.344us       1.344us             1  
+                                    aten::empty_strided         1.23%      31.560us         1.23%      31.560us       5.260us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         9.74%     249.077us         9.74%     249.077us      41.513us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.69%      68.742us         3.39%      86.703us       3.613us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.70%      17.961us         0.70%      17.961us       0.748us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        10.18%     260.466us        10.18%     260.466us       5.426us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.23%       5.870us         0.23%       5.870us       5.870us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.945ms
-Self CUDA time total: 101.759us
+Self CPU time total: 2.558ms
+Self CUDA time total: 101.725us
 
 
 
@@ -4136,27 +4136,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S512_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     916.066us       979.04%     916.066us     916.066us             1  
-                                            torch_eager        10.22%     297.753us        99.80%       2.907ms       2.907ms       0.000us         0.00%      94.848us      94.848us             1  
-                                              aten::mul         5.27%     153.390us         8.99%     261.760us      10.907us      48.609us        51.95%      48.609us       2.025us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      48.609us        51.95%      48.609us       2.025us            24  
-                                            aten::copy_         3.49%     101.793us        71.96%       2.096ms     116.462us      30.720us        32.83%      32.000us       1.778us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.976us        24.56%      22.976us       1.915us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.239us        15.22%      14.239us       1.187us            12  
-                                            aten::clone         1.04%      30.259us        69.44%       2.023ms     337.157us       0.000us         0.00%       9.024us       1.504us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.744us         8.28%       7.744us       1.291us             6  
-                                              aten::sub         1.21%      35.379us         2.01%      58.671us       9.778us       7.135us         7.63%       7.135us       1.189us             6  
-                                              aten::add         1.06%      30.800us         1.78%      51.980us       8.663us       7.104us         7.59%       7.104us       1.184us             6  
-                                Activity Buffer Request        59.25%       1.726ms        59.25%       1.726ms       1.726ms       1.280us         1.37%       1.280us       1.280us             1  
-                                    aten::empty_strided         1.07%      31.081us         1.07%      31.081us       5.180us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         6.98%     203.214us         6.98%     203.214us      33.869us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.14%      62.478us         2.73%      79.587us       3.316us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.59%      17.109us         0.59%      17.109us       0.713us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         7.49%     218.173us         7.49%     218.173us       4.545us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.20%       5.800us         0.20%       5.800us       5.800us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.014ms      1081.02%       1.014ms       1.014ms             1  
+                                            torch_eager        10.25%     325.467us        99.80%       3.170ms       3.170ms       0.000us         0.00%      95.040us      95.040us             1  
+                                              aten::mul         5.52%     175.344us         9.66%     306.970us      12.790us      48.800us        52.05%      48.800us       2.033us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      48.800us        52.05%      48.800us       2.033us            24  
+                                            aten::copy_         3.31%     105.233us        71.23%       2.263ms     125.701us      30.784us        32.83%      32.064us       1.781us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      23.041us        24.57%      23.041us       1.920us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.176us        15.12%      14.176us       1.181us            12  
+                                            aten::clone         0.88%      27.881us        68.64%       2.180ms     363.408us       0.000us         0.00%       9.023us       1.504us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.743us         8.26%       7.743us       1.290us             6  
+                                              aten::sub         1.21%      38.440us         2.12%      67.480us      11.247us       7.136us         7.61%       7.136us       1.189us             6  
+                                              aten::add         1.04%      32.931us         1.90%      60.251us      10.042us       7.040us         7.51%       7.040us       1.173us             6  
+                                Activity Buffer Request        54.81%       1.741ms        54.81%       1.741ms       1.741ms       1.280us         1.37%       1.280us       1.280us             1  
+                                    aten::empty_strided         1.02%      32.521us         1.02%      32.521us       5.420us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        10.79%     342.870us        10.79%     342.870us      57.145us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.15%      68.383us         2.74%      86.883us       3.620us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.58%      18.500us         0.58%      18.500us       0.771us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         8.23%     261.446us         8.23%     261.446us       5.447us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.20%       6.450us         0.20%       6.450us       6.450us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.913ms
-Self CUDA time total: 93.568us
+Self CPU time total: 3.177ms
+Self CUDA time total: 93.760us
 
 
 
@@ -4166,27 +4166,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S512_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     889.506us       873.60%     889.506us     889.506us             1  
-                                            torch_eager         9.47%     263.734us        99.81%       2.780ms       2.780ms       0.000us         0.00%     103.133us     103.133us             1  
-                                              aten::mul         5.38%     149.780us         9.23%     257.043us      10.710us      52.958us        52.01%      52.958us       2.207us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      52.958us        52.01%      52.958us       2.207us            24  
-                                            aten::copy_         3.60%     100.145us        72.47%       2.019ms     112.141us      32.385us        31.81%      33.697us       1.872us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.609us        24.17%      24.609us       2.051us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      16.478us        16.18%      16.478us       1.373us            12  
-                                            aten::clone         0.76%      21.289us        69.67%       1.941ms     323.462us       0.000us         0.00%       9.088us       1.515us             6  
-                                              aten::sub         1.26%      35.149us         2.08%      57.971us       9.662us       8.318us         8.17%       8.318us       1.386us             6  
-                                              aten::add         1.12%      31.181us         1.89%      52.690us       8.782us       8.160us         8.01%       8.160us       1.360us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.776us         7.64%       7.776us       1.296us             6  
-                                Activity Buffer Request        59.87%       1.668ms        59.87%       1.668ms       1.668ms       1.312us         1.29%       1.312us       1.312us             1  
-                                    aten::empty_strided         1.08%      30.102us         1.08%      30.102us       5.017us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         6.81%     189.822us         6.81%     189.822us      31.637us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.23%      62.254us         2.83%      78.841us       3.285us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.60%      16.587us         0.60%      16.587us       0.691us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         7.62%     212.384us         7.62%     212.384us       4.425us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.19%       5.281us         0.19%       5.281us       5.281us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.001ms       983.06%       1.001ms       1.001ms             1  
+                                            torch_eager         9.84%     295.255us        99.81%       2.995ms       2.995ms       0.000us         0.00%     103.136us     103.136us             1  
+                                              aten::mul         5.72%     171.754us         9.96%     298.939us      12.456us      52.896us        51.95%      52.896us       2.204us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      52.896us        51.95%      52.896us       2.204us            24  
+                                            aten::copy_         3.46%     103.695us        71.00%       2.131ms     118.374us      32.417us        31.84%      33.729us       1.874us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.673us        24.23%      24.673us       2.056us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      16.511us        16.22%      16.511us       1.376us            12  
+                                            aten::clone         0.71%      21.410us        67.93%       2.039ms     339.780us       0.000us         0.00%       9.056us       1.509us             6  
+                                              aten::sub         1.35%      40.422us         2.36%      70.682us      11.780us       8.319us         8.17%       8.319us       1.386us             6  
+                                              aten::add         1.19%      35.599us         2.15%      64.481us      10.747us       8.192us         8.05%       8.192us       1.365us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.744us         7.61%       7.744us       1.291us             6  
+                                Activity Buffer Request        57.66%       1.731ms        57.66%       1.731ms       1.731ms       1.312us         1.29%       1.312us       1.312us             1  
+                                    aten::empty_strided         1.04%      31.291us         1.04%      31.291us       5.215us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         7.41%     222.325us         7.41%     222.325us      37.054us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.17%      65.101us         2.75%      82.681us       3.445us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.59%      17.580us         0.59%      17.580us       0.733us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         8.68%     260.519us         8.68%     260.519us       5.427us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.19%       5.671us         0.19%       5.671us       5.671us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.785ms
-Self CUDA time total: 101.821us
+Self CPU time total: 3.001ms
+Self CUDA time total: 101.824us
 
 
 
@@ -4196,27 +4196,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S512_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     917.279us       760.79%     917.279us     917.279us             1  
-                                            torch_eager         9.49%     270.296us        99.79%       2.843ms       2.843ms       0.000us         0.00%     122.393us     122.393us             1  
-                                              aten::mul         5.35%     152.294us         9.23%     262.963us      10.957us      62.014us        51.43%      62.014us       2.584us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      62.014us        51.43%      62.014us       2.584us            24  
-                                            aten::copy_         3.56%     101.468us        72.26%       2.058ms     114.357us      39.420us        32.69%      41.243us       2.291us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.894us        23.96%      28.894us       2.408us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.136us        15.87%      19.136us       1.595us            12  
-                                            aten::clone         0.72%      20.551us        69.70%       1.986ms     330.917us       0.000us         0.00%      12.349us       2.058us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.526us         8.73%      10.526us       1.754us             6  
-                                              aten::sub         1.26%      35.841us         2.08%      59.281us       9.880us       9.600us         7.96%       9.600us       1.600us             6  
-                                              aten::add         1.13%      32.290us         1.93%      54.990us       9.165us       9.536us         7.91%       9.536us       1.589us             6  
-                                Activity Buffer Request        59.91%       1.707ms        59.91%       1.707ms       1.707ms       1.823us         1.51%       1.823us       1.823us             1  
-                                    aten::empty_strided         1.30%      36.920us         1.30%      36.920us       6.153us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         6.60%     187.893us         6.60%     187.893us      31.316us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.17%      61.750us         2.78%      79.243us       3.302us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.61%      17.493us         0.61%      17.493us       0.729us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         7.70%     219.342us         7.70%     219.342us       4.570us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.21%       5.890us         0.21%       5.890us       5.890us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.015ms       840.95%       1.015ms       1.015ms             1  
+                                            torch_eager         9.74%     291.664us        99.80%       2.987ms       2.987ms       0.000us         0.00%     122.530us     122.530us             1  
+                                              aten::mul         5.84%     174.903us        10.28%     307.588us      12.816us      62.144us        51.48%      62.144us       2.589us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      62.144us        51.48%      62.144us       2.589us            24  
+                                            aten::copy_         3.43%     102.713us        70.67%       2.115ms     117.501us      39.265us        32.53%      41.090us       2.283us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.864us        23.91%      28.864us       2.405us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.296us        15.99%      19.296us       1.608us            12  
+                                            aten::clone         0.71%      21.240us        67.46%       2.019ms     336.492us       0.000us         0.00%      12.226us       2.038us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.401us         8.62%      10.401us       1.733us             6  
+                                              aten::add         1.16%      34.581us         2.15%      64.442us      10.740us       9.696us         8.03%       9.696us       1.616us             6  
+                                              aten::sub         1.36%      40.831us         2.36%      70.613us      11.769us       9.600us         7.95%       9.600us       1.600us             6  
+                                Activity Buffer Request        57.99%       1.736ms        57.99%       1.736ms       1.736ms       1.825us         1.51%       1.825us       1.825us             1  
+                                    aten::empty_strided         1.04%      31.082us         1.04%      31.082us       5.180us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         6.60%     197.545us         6.60%     197.545us      32.924us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.24%      66.920us         2.85%      85.310us       3.555us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.61%      18.390us         0.61%      18.390us       0.766us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         9.07%     271.392us         9.07%     271.392us       5.654us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.20%       6.001us         0.20%       6.001us       6.001us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.849ms
-Self CUDA time total: 120.570us
+Self CPU time total: 2.993ms
+Self CUDA time total: 120.705us
 
 
 
@@ -4226,27 +4226,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S512_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     924.864us       536.51%     924.864us     924.864us             1  
-                                            torch_eager        10.45%     305.132us        99.82%       2.915ms       2.915ms       0.000us         0.00%     175.296us     175.296us             1  
-                                              aten::mul         5.14%     150.181us         8.96%     261.653us      10.902us      89.408us        51.87%      89.408us       3.725us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      89.408us        51.87%      89.408us       3.725us            24  
-                                            aten::copy_         3.52%     102.713us        71.68%       2.093ms     116.286us      57.856us        33.56%      60.768us       3.376us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.768us        23.65%      40.768us       3.397us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      25.120us        14.57%      25.120us       2.093us            12  
-                                            aten::clone         1.06%      30.960us        69.26%       2.022ms     337.044us       0.000us         0.00%      20.000us       3.333us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.088us         9.91%      17.088us       2.848us             6  
-                                              aten::add         1.07%      31.150us         1.81%      52.780us       8.797us      12.672us         7.35%      12.672us       2.112us             6  
-                                              aten::sub         1.22%      35.659us         2.01%      58.661us       9.777us      12.448us         7.22%      12.448us       2.075us             6  
-                                Activity Buffer Request        59.67%       1.742ms        59.67%       1.742ms       1.742ms       2.912us         1.69%       2.912us       2.912us             1  
-                                    aten::empty_strided         1.10%      32.260us         1.10%      32.260us       5.377us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         6.30%     183.953us         6.30%     183.953us      30.659us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.15%      62.747us         2.74%      80.125us       3.339us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.60%      17.378us         0.60%      17.378us       0.724us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         7.55%     220.376us         7.55%     220.376us       4.591us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.18%       5.231us         0.18%       5.231us       5.231us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.002ms       581.09%       1.002ms       1.002ms             1  
+                                            torch_eager        20.60%     295.691us        99.58%       1.430ms       1.430ms       0.000us         0.00%     175.327us     175.327us             1  
+                                              aten::mul        11.87%     170.476us        20.75%     297.916us      12.413us      89.503us        51.89%      89.503us       3.729us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      89.503us        51.89%      89.503us       3.729us            24  
+                                            aten::copy_         7.24%     103.879us        39.87%     572.479us      31.804us      57.887us        33.56%      60.735us       3.374us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.832us        23.67%      40.832us       3.403us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      25.089us        14.55%      25.089us       2.091us            12  
+                                            aten::clone         1.51%      21.660us        33.62%     482.661us      80.443us       0.000us         0.00%      19.903us       3.317us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.055us         9.89%      17.055us       2.842us             6  
+                                              aten::sub         2.80%      40.184us         4.70%      67.454us      11.242us      12.546us         7.27%      12.546us       2.091us             6  
+                                              aten::add         2.41%      34.531us         4.25%      60.992us      10.165us      12.543us         7.27%      12.543us       2.090us             6  
+                                Activity Buffer Request        14.04%     201.545us        14.04%     201.545us     201.545us       2.848us         1.65%       2.848us       2.848us             1  
+                                    aten::empty_strided         2.07%      29.712us         2.07%      29.712us       4.952us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        13.60%     195.253us        13.60%     195.253us      32.542us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.57%      65.572us         5.83%      83.731us       3.489us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.26%      18.159us         1.26%      18.159us       0.757us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        17.62%     252.973us        17.62%     252.973us       5.270us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.42%       6.061us         0.42%       6.061us       6.061us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.920ms
-Self CUDA time total: 172.384us
+Self CPU time total: 1.436ms
+Self CUDA time total: 172.479us
 
 
 
@@ -4256,27 +4256,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S2048_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     891.930us       739.34%     891.930us     891.930us             1  
-                                            torch_eager        21.05%     265.905us        99.59%       1.258ms       1.258ms       0.000us         0.00%     122.398us     122.398us             1  
-                                              aten::mul        11.89%     150.169us        20.37%     257.293us      10.721us      62.046us        51.43%      62.046us       2.585us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      62.046us        51.43%      62.046us       2.585us            24  
-                                            aten::copy_         7.91%      99.966us        39.16%     494.681us      27.482us      39.361us        32.63%      41.121us       2.285us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.961us        24.01%      28.961us       2.413us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.231us        15.94%      19.231us       1.603us            12  
-                                            aten::clone         1.55%      19.527us        32.87%     415.246us      69.208us       0.000us         0.00%      12.160us       2.027us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.400us         8.62%      10.400us       1.733us             6  
-                                              aten::add         2.51%      31.670us         4.18%      52.860us       8.810us       9.663us         8.01%       9.663us       1.611us             6  
-                                              aten::sub         2.77%      34.981us         4.60%      58.113us       9.685us       9.568us         7.93%       9.568us       1.595us             6  
-                                Activity Buffer Request        11.68%     147.573us        11.68%     147.573us     147.573us       1.760us         1.46%       1.760us       1.760us             1  
-                                    aten::empty_strided         2.49%      31.403us         2.49%      31.403us       5.234us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        14.63%     184.842us        14.63%     184.842us      30.807us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.89%      61.717us         6.20%      78.378us       3.266us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.32%      16.661us         1.32%      16.661us       0.694us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        16.92%     213.746us        16.92%     213.746us       4.453us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.41%       5.161us         0.41%       5.161us       5.161us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     998.580us       827.30%     998.580us     998.580us             1  
+                                            torch_eager        20.47%     288.290us        99.58%       1.403ms       1.403ms       0.000us         0.00%     122.464us     122.464us             1  
+                                              aten::mul        12.45%     175.343us        21.53%     303.228us      12.634us      62.175us        51.51%      62.175us       2.591us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      62.175us        51.51%      62.175us       2.591us            24  
+                                            aten::copy_         7.38%     103.973us        38.88%     547.625us      30.424us      39.265us        32.53%      41.025us       2.279us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.800us        23.86%      28.800us       2.400us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.264us        15.96%      19.264us       1.605us            12  
+                                            aten::clone         1.40%      19.710us        32.30%     454.921us      75.820us       0.000us         0.00%      12.225us       2.038us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.465us         8.67%      10.465us       1.744us             6  
+                                              aten::add         2.42%      34.049us         4.36%      61.370us      10.228us       9.695us         8.03%       9.695us       1.616us             6  
+                                              aten::sub         2.76%      38.918us         4.74%      66.750us      11.125us       9.569us         7.93%       9.569us       1.595us             6  
+                                Activity Buffer Request        13.00%     183.145us        13.00%     183.145us     183.145us       1.760us         1.46%       1.760us       1.760us             1  
+                                    aten::empty_strided         2.13%      30.010us         2.13%      30.010us       5.002us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        13.33%     187.794us        13.33%     187.794us      31.299us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.79%      67.456us         6.09%      85.722us       3.572us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.30%      18.266us         1.30%      18.266us       0.761us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        18.16%     255.751us        18.16%     255.751us       5.328us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.42%       5.851us         0.42%       5.851us       5.851us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.263ms
-Self CUDA time total: 120.638us
+Self CPU time total: 1.409ms
+Self CUDA time total: 120.704us
 
 
 
@@ -4286,27 +4286,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S2048_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     896.465us       520.02%     896.465us     896.465us             1  
-                                            torch_eager        20.64%     267.500us        99.56%       1.290ms       1.290ms       0.000us         0.00%     175.237us     175.237us             1  
-                                              aten::mul        11.74%     152.190us        20.16%     261.202us      10.883us      89.569us        51.96%      89.569us       3.732us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      89.569us        51.96%      89.569us       3.732us            24  
-                                            aten::copy_         7.78%     100.776us        40.08%     519.323us      28.851us      57.731us        33.49%      60.579us       3.366us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.803us        23.67%      40.803us       3.400us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      25.089us        14.55%      25.089us       2.091us            12  
-                                            aten::clone         1.61%      20.829us        34.03%     440.978us      73.496us       0.000us         0.00%      19.776us       3.296us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.928us         9.82%      16.928us       2.821us             6  
-                                              aten::add         2.36%      30.520us         4.00%      51.870us       8.645us      12.608us         7.31%      12.608us       2.101us             6  
-                                              aten::sub         2.79%      36.204us         4.56%      59.064us       9.844us      12.481us         7.24%      12.481us       2.080us             6  
-                                Activity Buffer Request        13.56%     175.743us        13.56%     175.743us     175.743us       2.848us         1.65%       2.848us       2.848us             1  
-                                    aten::empty_strided         2.40%      31.051us         2.40%      31.051us       5.175us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        13.98%     181.154us        13.98%     181.154us      30.192us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.80%      62.202us         6.12%      79.302us       3.304us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.32%      17.100us         1.32%      17.100us       0.713us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        16.58%     214.872us        16.58%     214.872us       4.477us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.44%       5.731us         0.44%       5.731us       5.731us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     990.551us       575.17%     990.551us     990.551us             1  
+                                            torch_eager         9.55%     281.107us        99.81%       2.938ms       2.938ms       0.000us         0.00%     175.100us     175.100us             1  
+                                              aten::mul         5.81%     170.942us        10.23%     301.167us      12.549us      89.468us        51.95%      89.468us       3.728us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      89.468us        51.95%      89.468us       3.728us            24  
+                                            aten::copy_         3.50%     102.971us        70.92%       2.088ms     115.984us      57.728us        33.52%      60.608us       3.367us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.800us        23.69%      40.800us       3.400us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      25.024us        14.53%      25.024us       2.085us            12  
+                                            aten::clone         0.79%      23.370us        68.06%       2.004ms     333.942us       0.000us         0.00%      19.808us       3.301us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.928us         9.83%      16.928us       2.821us             6  
+                                              aten::add         1.19%      35.020us         2.08%      61.181us      10.197us      12.513us         7.27%      12.513us       2.085us             6  
+                                              aten::sub         1.27%      37.451us         2.29%      67.451us      11.242us      12.511us         7.26%      12.511us       2.085us             6  
+                                Activity Buffer Request        58.47%       1.721ms        58.47%       1.721ms       1.721ms       2.880us         1.67%       2.880us       2.880us             1  
+                                    aten::empty_strided         1.04%      30.722us         1.04%      30.722us       5.120us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         6.59%     193.904us         6.59%     193.904us      32.317us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.29%      67.323us         2.91%      85.623us       3.568us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.62%      18.300us         0.62%      18.300us       0.762us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         8.69%     255.869us         8.69%     255.869us       5.331us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.19%       5.631us         0.19%       5.631us       5.631us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.296ms
-Self CUDA time total: 172.389us
+Self CPU time total: 2.944ms
+Self CUDA time total: 172.220us
 
 
 
@@ -4316,27 +4316,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S2048_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     907.999us       318.14%     907.999us     907.999us             1  
-                                            torch_eager        10.33%     297.182us        99.78%       2.870ms       2.870ms       0.000us         0.00%     303.265us     303.265us             1  
-                                              aten::mul         5.13%     147.584us         8.90%     255.976us      10.666us     133.567us        46.80%     133.567us       5.565us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     133.567us        46.80%     133.567us       5.565us            24  
-                                            aten::copy_         3.51%     101.030us        71.83%       2.067ms     114.810us     110.722us        38.79%     128.578us       7.143us            18  
-                                            aten::clone         0.94%      26.992us        69.34%       1.995ms     332.479us       0.000us         0.00%      71.233us      11.872us             6  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.345us        20.09%      57.345us       4.779us            12  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      53.377us        18.70%      53.377us       8.896us             6  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      41.120us        14.41%      41.120us       3.427us            12  
-                                              aten::sub         1.21%      34.680us         2.01%      57.770us       9.628us      20.833us         7.30%      20.833us       3.472us             6  
-                                              aten::add         1.09%      31.394us         1.89%      54.514us       9.086us      20.287us         7.11%      20.287us       3.381us             6  
-                                Activity Buffer Request        59.89%       1.723ms        59.89%       1.723ms       1.723ms      17.856us         6.26%      17.856us      17.856us             1  
-                                    aten::empty_strided         1.06%      30.410us         1.06%      30.410us       5.068us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         6.35%     182.753us         6.35%     182.753us      30.459us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.22%      63.807us         2.82%      81.011us       3.375us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.60%      17.204us         0.60%      17.204us       0.717us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         7.46%     214.543us         7.46%     214.543us       4.470us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.22%       6.420us         0.22%       6.420us       6.420us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     996.043us       348.79%     996.043us     996.043us             1  
+                                            torch_eager        20.87%     290.654us        99.57%       1.387ms       1.387ms       0.000us         0.00%     304.030us     304.030us             1  
+                                              aten::mul        12.33%     171.709us        21.56%     300.326us      12.514us     133.152us        46.63%     133.152us       5.548us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     133.152us        46.63%     133.152us       5.548us            24  
+                                            aten::copy_         7.20%     100.319us        38.33%     533.940us      29.663us     111.295us        38.97%     129.758us       7.209us            18  
+                                            aten::clone         1.43%      19.920us        31.74%     442.109us      73.685us       0.000us         0.00%      72.510us      12.085us             6  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.248us        20.05%      57.248us       4.771us            12  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      54.047us        18.93%      54.047us       9.008us             6  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      41.120us        14.40%      41.120us       3.427us            12  
+                                              aten::sub         2.81%      39.111us         4.76%      66.281us      11.047us      20.672us         7.24%      20.672us       3.445us             6  
+                                              aten::add         2.49%      34.670us         4.42%      61.560us      10.260us      20.448us         7.16%      20.448us       3.408us             6  
+                                Activity Buffer Request        12.17%     169.484us        12.17%     169.484us     169.484us      18.463us         6.47%      18.463us      18.463us             1  
+                                    aten::empty_strided         2.13%      29.640us         2.13%      29.640us       4.940us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        13.75%     191.524us        13.75%     191.524us      31.921us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.84%      67.441us         6.08%      84.703us       3.529us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.24%      17.262us         1.24%      17.262us       0.719us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        18.33%     255.290us        18.33%     255.290us       5.319us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.43%       5.940us         0.43%       5.940us       5.940us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.877ms
-Self CUDA time total: 285.409us
+Self CPU time total: 1.393ms
+Self CUDA time total: 285.567us
 
 
 
@@ -4346,27 +4346,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S2048_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     936.507us       164.60%     936.507us     936.507us             1  
-                                            torch_eager        21.10%     274.795us        99.58%       1.297ms       1.297ms       0.000us         0.00%     592.769us     592.769us             1  
-                                            aten::copy_         7.72%     100.512us        38.34%     499.337us      27.741us     273.025us        47.99%     296.833us      16.491us            18  
-                                              aten::mul        11.94%     155.505us        20.72%     269.807us      11.242us     230.561us        40.52%     230.561us       9.607us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     230.561us        40.52%     230.561us       9.607us            24  
-                                            aten::clone         1.56%      20.351us        32.41%     422.047us      70.341us       0.000us         0.00%     206.465us      34.411us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     182.657us        32.10%     182.657us      30.443us             6  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      90.368us        15.88%      90.368us       7.531us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      65.375us        11.49%      65.375us       5.448us            12  
-                                              aten::sub         2.96%      38.510us         4.81%      62.580us      10.430us      32.991us         5.80%      32.991us       5.498us             6  
-                                              aten::add         2.43%      31.581us         4.16%      54.201us       9.034us      32.384us         5.69%      32.384us       5.397us             6  
-                                Activity Buffer Request        11.20%     145.802us        11.20%     145.802us     145.802us      23.808us         4.18%      23.808us      23.808us             1  
-                                    aten::empty_strided         2.79%      36.361us         2.79%      36.361us       6.060us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        14.42%     187.792us        14.42%     187.792us      31.299us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.78%      62.259us         6.10%      79.429us       3.310us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.32%      17.170us         1.32%      17.170us       0.715us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        17.37%     226.223us        17.37%     226.223us       4.713us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.42%       5.440us         0.42%       5.440us       5.440us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.018ms       178.84%       1.018ms       1.018ms             1  
+                                            torch_eager        20.93%     294.497us        99.60%       1.402ms       1.402ms       0.000us         0.00%     592.727us     592.727us             1  
+                                            aten::copy_         7.21%     101.432us        37.46%     527.132us      29.285us     273.662us        48.09%     297.374us      16.521us            18  
+                                              aten::mul        12.77%     179.663us        21.96%     309.058us      12.877us     229.816us        40.39%     229.816us       9.576us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     229.816us        40.39%     229.816us       9.576us            24  
+                                            aten::clone         1.44%      20.252us        30.89%     434.772us      72.462us       0.000us         0.00%     206.559us      34.426us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     182.847us        32.13%     182.847us      30.474us             6  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      90.815us        15.96%      90.815us       7.568us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      65.537us        11.52%      65.537us       5.461us            12  
+                                              aten::sub         2.76%      38.901us         5.06%      71.272us      11.879us      32.865us         5.78%      32.865us       5.477us             6  
+                                              aten::add         2.62%      36.830us         4.51%      63.532us      10.589us      32.672us         5.74%      32.672us       5.445us             6  
+                                Activity Buffer Request        11.90%     167.504us        11.90%     167.504us     167.504us      23.712us         4.17%      23.712us      23.712us             1  
+                                    aten::empty_strided         2.15%      30.291us         2.15%      30.291us       5.049us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        13.10%     184.385us        13.10%     184.385us      30.731us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.77%      67.152us         6.08%      85.571us       3.565us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.31%      18.419us         1.31%      18.419us       0.767us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        18.64%     262.279us        18.64%     262.279us       5.464us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.40%       5.670us         0.40%       5.670us       5.670us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.302ms
-Self CUDA time total: 568.961us
+Self CPU time total: 1.407ms
+Self CUDA time total: 569.015us
 
 
 
@@ -4376,27 +4376,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S128_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     912.214us       987.11%     912.214us     912.214us             1  
-                                            torch_eager        20.73%     270.361us        99.60%       1.299ms       1.299ms       0.000us         0.00%      93.533us      93.533us             1  
-                                              aten::mul        11.80%     153.962us        20.35%     265.434us      11.060us      49.471us        53.53%      49.471us       2.061us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      49.471us        53.53%      49.471us       2.061us            24  
-                                            aten::copy_         7.83%     102.175us        39.79%     518.911us      28.828us      29.343us        31.75%      30.463us       1.692us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.559us        24.41%      22.559us       1.880us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.599us        14.72%      13.599us       1.133us            12  
-                                            aten::clone         1.57%      20.491us        33.39%     435.488us      72.581us       0.000us         0.00%       7.904us       1.317us             6  
-                                              aten::add         2.51%      32.770us         4.19%      54.680us       9.113us       6.815us         7.37%       6.815us       1.136us             6  
-                                              aten::sub         2.78%      36.231us         4.59%      59.802us       9.967us       6.784us         7.34%       6.784us       1.131us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.784us         7.34%       6.784us       1.131us             6  
-                                Activity Buffer Request        12.62%     164.552us        12.62%     164.552us     164.552us       1.120us         1.21%       1.120us       1.120us             1  
-                                    aten::empty_strided         2.31%      30.071us         2.31%      30.071us       5.012us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        14.34%     187.054us        14.34%     187.054us      31.176us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.77%      62.210us         6.08%      79.271us       3.303us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.31%      17.061us         1.31%      17.061us       0.711us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        17.03%     222.083us        17.03%     222.083us       4.627us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.40%       5.200us         0.40%       5.200us       5.200us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.000ms      1080.76%       1.000ms       1.000ms             1  
+                                            torch_eager         9.93%     294.202us        99.82%       2.956ms       2.956ms       0.000us         0.00%      93.659us      93.659us             1  
+                                              aten::mul         5.79%     171.584us        10.10%     299.275us      12.470us      49.565us        53.56%      49.565us       2.065us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      49.565us        53.56%      49.565us       2.065us            24  
+                                            aten::copy_         3.59%     106.321us        70.88%       2.099ms     116.627us      29.406us        31.78%      30.526us       1.696us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.623us        24.45%      22.623us       1.885us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.568us        14.66%      13.568us       1.131us            12  
+                                            aten::clone         0.71%      20.970us        67.84%       2.009ms     334.883us       0.000us         0.00%       7.903us       1.317us             6  
+                                              aten::sub         1.35%      39.881us         2.33%      69.031us      11.505us       6.784us         7.33%       6.784us       1.131us             6  
+                                              aten::add         1.15%      34.189us         1.99%      58.861us       9.810us       6.784us         7.33%       6.784us       1.131us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.783us         7.33%       6.783us       1.130us             6  
+                                Activity Buffer Request        58.33%       1.728ms        58.33%       1.728ms       1.728ms       1.120us         1.21%       1.120us       1.120us             1  
+                                    aten::empty_strided         1.04%      30.880us         1.04%      30.880us       5.147us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         6.53%     193.506us         6.53%     193.506us      32.251us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.25%      66.634us         2.83%      83.844us       3.493us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.58%      17.210us         0.58%      17.210us       0.717us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         8.56%     253.394us         8.56%     253.394us       5.279us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.18%       5.350us         0.18%       5.350us       5.350us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.304ms
-Self CUDA time total: 92.413us
+Self CPU time total: 2.962ms
+Self CUDA time total: 92.539us
 
 
 
@@ -4406,27 +4406,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S128_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     927.225us       964.57%     927.225us     927.225us             1  
-                                            torch_eager        10.42%     298.637us        99.83%       2.862ms       2.862ms       0.000us         0.00%      97.440us      97.440us             1  
-                                              aten::mul         5.27%     151.202us         9.04%     259.193us      10.800us      51.264us        53.33%      51.264us       2.136us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      51.264us        53.33%      51.264us       2.136us            24  
-                                            aten::copy_         3.63%     103.959us        71.40%       2.047ms     113.712us      30.719us        31.96%      32.031us       1.780us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.878us        23.80%      22.878us       1.907us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.145us        14.71%      14.145us       1.179us            12  
-                                            aten::clone         0.93%      26.740us        68.76%       1.971ms     328.494us       0.000us         0.00%       9.153us       1.526us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.841us         8.16%       7.841us       1.307us             6  
-                                              aten::add         1.34%      38.470us         2.12%      60.910us      10.152us       7.105us         7.39%       7.105us       1.184us             6  
-                                              aten::sub         1.24%      35.481us         2.03%      58.281us       9.714us       7.040us         7.32%       7.040us       1.173us             6  
-                                Activity Buffer Request        58.96%       1.690ms        58.96%       1.690ms       1.690ms       1.312us         1.36%       1.312us       1.312us             1  
-                                    aten::empty_strided         1.07%      30.811us         1.07%      30.811us       5.135us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         6.60%     189.265us         6.60%     189.265us      31.544us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.21%      63.469us         2.80%      80.301us       3.346us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.59%      16.832us         0.59%      16.832us       0.701us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         7.56%     216.723us         7.56%     216.723us       4.515us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.17%       4.900us         0.17%       4.900us       4.900us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.005ms      1046.18%       1.005ms       1.005ms             1  
+                                            torch_eager        20.70%     290.067us        99.57%       1.395ms       1.395ms       0.000us         0.00%      97.374us      97.374us             1  
+                                              aten::mul        12.93%     181.162us        22.03%     308.587us      12.858us      51.138us        53.23%      51.138us       2.131us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      51.138us        53.23%      51.138us       2.131us            24  
+                                            aten::copy_         7.20%     100.934us        37.57%     526.405us      29.245us      30.750us        32.01%      32.061us       1.781us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.975us        23.92%      22.975us       1.915us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.175us        14.76%      14.175us       1.181us            12  
+                                            aten::clone         1.53%      21.448us        31.30%     438.559us      73.093us       0.000us         0.00%       9.086us       1.514us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.775us         8.09%       7.775us       1.296us             6  
+                                              aten::sub         2.81%      39.410us         4.84%      67.761us      11.294us       7.103us         7.39%       7.103us       1.184us             6  
+                                              aten::add         2.52%      35.299us         4.51%      63.202us      10.534us       7.072us         7.36%       7.072us       1.179us             6  
+                                Activity Buffer Request        11.90%     166.735us        11.90%     166.735us     166.735us       1.311us         1.36%       1.311us       1.311us             1  
+                                    aten::empty_strided         2.22%      31.071us         2.22%      31.071us       5.178us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        13.33%     186.813us        13.33%     186.813us      31.136us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.90%      68.622us         6.17%      86.384us       3.599us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.27%      17.762us         1.27%      17.762us       0.740us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        18.24%     255.602us        18.24%     255.602us       5.325us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.43%       6.050us         0.43%       6.050us       6.050us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.867ms
-Self CUDA time total: 96.128us
+Self CPU time total: 1.401ms
+Self CUDA time total: 96.063us
 
 
 
@@ -4436,27 +4436,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S128_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     904.125us       867.77%     904.125us     904.125us             1  
-                                            torch_eager        21.17%     270.270us        99.58%       1.271ms       1.271ms       0.000us         0.00%     105.501us     105.501us             1  
-                                              aten::mul        11.71%     149.569us        20.29%     259.044us      10.793us      55.455us        53.23%      55.455us       2.311us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      55.455us        53.23%      55.455us       2.311us            24  
-                                            aten::copy_         7.94%     101.312us        39.18%     500.180us      27.788us      32.416us        31.11%      33.728us       1.874us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.704us        23.71%      24.704us       2.059us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      16.318us        15.66%      16.318us       1.360us            12  
-                                            aten::clone         1.62%      20.642us        32.82%     418.978us      69.830us       0.000us         0.00%       9.024us       1.504us             6  
-                                              aten::sub         2.73%      34.811us         4.52%      57.673us       9.612us       8.256us         7.92%       8.256us       1.376us             6  
-                                              aten::add         2.55%      32.590us         4.27%      54.470us       9.078us       8.062us         7.74%       8.062us       1.344us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.712us         7.40%       7.712us       1.285us             6  
-                                Activity Buffer Request        11.60%     148.113us        11.60%     148.113us     148.113us       1.312us         1.26%       1.312us       1.312us             1  
-                                    aten::empty_strided         2.40%      30.670us         2.40%      30.670us       5.112us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        14.64%     186.881us        14.64%     186.881us      31.147us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.82%      61.602us         6.15%      78.501us       3.271us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.32%      16.899us         1.32%      16.899us       0.704us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        17.08%     218.091us        17.08%     218.091us       4.544us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.42%       5.300us         0.42%       5.300us       5.300us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.008ms       966.66%       1.008ms       1.008ms             1  
+                                            torch_eager        19.87%     287.726us        99.59%       1.442ms       1.442ms       0.000us         0.00%     105.564us     105.564us             1  
+                                              aten::mul        12.48%     180.745us        21.62%     312.998us      13.042us      55.327us        53.07%      55.327us       2.305us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      55.327us        53.07%      55.327us       2.305us            24  
+                                            aten::copy_         7.05%     102.025us        39.57%     572.964us      31.831us      32.416us        31.09%      33.727us       1.874us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.737us        23.73%      24.737us       2.061us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      16.510us        15.84%      16.510us       1.376us            12  
+                                            aten::clone         1.40%      20.271us        33.26%     481.521us      80.254us       0.000us         0.00%       8.990us       1.498us             6  
+                                              aten::sub         2.64%      38.220us         4.70%      68.062us      11.344us       8.351us         8.01%       8.351us       1.392us             6  
+                                              aten::add         2.49%      36.083us         4.34%      62.773us      10.462us       8.159us         7.83%       8.159us       1.360us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.679us         7.37%       7.679us       1.280us             6  
+                                Activity Buffer Request        13.36%     193.394us        13.36%     193.394us     193.394us       1.311us         1.26%       1.311us       1.311us             1  
+                                    aten::empty_strided         2.11%      30.520us         2.11%      30.520us       5.087us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        14.10%     204.105us        14.10%     204.105us      34.017us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.72%      68.322us         5.99%      86.681us       3.612us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.27%      18.359us         1.27%      18.359us       0.765us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        18.11%     262.225us        18.11%     262.225us       5.463us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.41%       5.880us         0.41%       5.880us       5.880us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.277ms
-Self CUDA time total: 104.189us
+Self CPU time total: 1.448ms
+Self CUDA time total: 104.253us
 
 
 
@@ -4466,27 +4466,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S128_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     918.160us       742.36%     918.160us     918.160us             1  
-                                            torch_eager        20.43%     265.704us        99.60%       1.295ms       1.295ms       0.000us         0.00%     125.506us     125.506us             1  
-                                              aten::mul        11.98%     155.738us        20.81%     270.642us      11.277us      65.023us        52.57%      65.023us       2.709us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      65.023us        52.57%      65.023us       2.709us            24  
-                                            aten::copy_         7.90%     102.742us        39.27%     510.720us      28.373us      39.331us        31.80%      41.155us       2.286us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.898us        23.36%      28.898us       2.408us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.328us        15.63%      19.328us       1.611us            12  
-                                            aten::clone         1.56%      20.250us        32.83%     426.918us      71.153us       0.000us         0.00%      12.257us       2.043us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.433us         8.44%      10.433us       1.739us             6  
-                                              aten::add         2.56%      33.260us         4.26%      55.420us       9.237us       9.728us         7.87%       9.728us       1.621us             6  
-                                              aten::sub         2.77%      36.070us         4.64%      60.350us      10.058us       9.600us         7.76%       9.600us       1.600us             6  
-                                Activity Buffer Request        12.28%     159.673us        12.28%     159.673us     159.673us       1.824us         1.47%       1.824us       1.824us             1  
-                                    aten::empty_strided         2.40%      31.241us         2.40%      31.241us       5.207us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        14.07%     182.993us        14.07%     182.993us      30.499us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.89%      63.631us         6.22%      80.874us       3.370us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.33%      17.243us         1.33%      17.243us       0.718us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        17.43%     226.656us        17.43%     226.656us       4.722us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.40%       5.220us         0.40%       5.220us       5.220us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.006ms       813.90%       1.006ms       1.006ms             1  
+                                            torch_eager         9.94%     295.168us        99.80%       2.964ms       2.964ms       0.000us         0.00%     125.309us     125.309us             1  
+                                              aten::mul         5.76%     171.113us        10.09%     299.797us      12.492us      65.183us        52.76%      65.183us       2.716us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      65.183us        52.76%      65.183us       2.716us            24  
+                                            aten::copy_         3.36%      99.933us        70.61%       2.097ms     116.499us      39.102us        31.65%      40.862us       2.270us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.702us        23.23%      28.702us       2.392us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.264us        15.59%      19.264us       1.605us            12  
+                                            aten::clone         0.72%      21.400us        67.60%       2.008ms     334.638us       0.000us         0.00%      12.160us       2.027us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.400us         8.42%      10.400us       1.733us             6  
+                                              aten::sub         1.36%      40.342us         2.40%      71.402us      11.900us       9.664us         7.82%       9.664us       1.611us             6  
+                                              aten::add         1.19%      35.292us         2.11%      62.642us      10.440us       9.600us         7.77%       9.600us       1.600us             6  
+                                Activity Buffer Request        58.22%       1.729ms        58.22%       1.729ms       1.729ms       1.760us         1.42%       1.760us       1.760us             1  
+                                    aten::empty_strided         1.03%      30.470us         1.03%      30.470us       5.078us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         6.55%     194.565us         6.55%     194.565us      32.427us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.28%      67.682us         2.90%      86.242us       3.593us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.62%      18.560us         0.62%      18.560us       0.773us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         8.77%     260.395us         8.77%     260.395us       5.425us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.20%       5.880us         0.20%       5.880us       5.880us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.300ms
-Self CUDA time total: 123.682us
+Self CPU time total: 2.970ms
+Self CUDA time total: 123.549us
 
 
 
@@ -4496,27 +4496,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S512_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     920.156us       882.07%     920.156us     920.156us             1  
-                                            torch_eager        10.51%     302.454us        99.79%       2.872ms       2.872ms       0.000us         0.00%     105.630us     105.630us             1  
-                                              aten::mul         5.35%     153.872us         9.14%     263.076us      10.962us      55.392us        53.10%      55.392us       2.308us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      55.392us        53.10%      55.392us       2.308us            24  
-                                            aten::copy_         3.49%     100.493us        71.36%       2.053ms     114.078us      32.479us        31.13%      33.791us       1.877us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.736us        23.71%      24.736us       2.061us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      16.447us        15.77%      16.447us       1.371us            12  
-                                            aten::clone         1.01%      28.981us        68.90%       1.983ms     330.436us       0.000us         0.00%       9.055us       1.509us             6  
-                                              aten::sub         1.22%      35.200us         2.05%      58.980us       9.830us       8.288us         7.94%       8.288us       1.381us             6  
-                                              aten::add         1.08%      30.999us         1.81%      52.042us       8.674us       8.159us         7.82%       8.159us       1.360us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.743us         7.42%       7.743us       1.290us             6  
-                                Activity Buffer Request        59.15%       1.702ms        59.15%       1.702ms       1.702ms       1.312us         1.26%       1.312us       1.312us             1  
-                                    aten::empty_strided         1.11%      32.080us         1.11%      32.080us       5.347us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         6.49%     186.892us         6.49%     186.892us      31.149us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.21%      63.599us         2.80%      80.681us       3.362us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.59%      17.082us         0.59%      17.082us       0.712us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         7.57%     217.817us         7.57%     217.817us       4.538us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.21%       6.030us         0.21%       6.030us       6.030us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     983.089us       943.25%     983.089us     983.089us             1  
+                                            torch_eager        19.98%     288.033us        99.62%       1.436ms       1.436ms       0.000us         0.00%     105.536us     105.536us             1  
+                                              aten::mul        11.74%     169.216us        20.40%     294.119us      12.255us      55.424us        53.18%      55.424us       2.309us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      55.424us        53.18%      55.424us       2.309us            24  
+                                            aten::copy_         7.16%     103.226us        41.05%     591.856us      32.881us      32.352us        31.04%      33.664us       1.870us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.640us        23.64%      24.640us       2.053us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      16.448us        15.78%      16.448us       1.371us            12  
+                                            aten::clone         1.52%      21.981us        35.14%     506.672us      84.445us       0.000us         0.00%       9.024us       1.504us             6  
+                                              aten::sub         2.73%      39.361us         4.47%      64.512us      10.752us       8.289us         7.95%       8.289us       1.381us             6  
+                                              aten::add         2.40%      34.591us         4.34%      62.531us      10.422us       8.159us         7.83%       8.159us       1.360us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.712us         7.40%       7.712us       1.285us             6  
+                                Activity Buffer Request        15.86%     228.735us        15.86%     228.735us     228.735us       1.312us         1.26%       1.312us       1.312us             1  
+                                    aten::empty_strided         2.19%      31.580us         2.19%      31.580us       5.263us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        13.29%     191.574us        13.29%     191.574us      31.929us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.47%      64.452us         5.67%      81.763us       3.407us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.20%      17.311us         1.20%      17.311us       0.721us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        17.08%     246.315us        17.08%     246.315us       5.132us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.38%       5.431us         0.38%       5.431us       5.431us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.878ms
-Self CUDA time total: 104.318us
+Self CPU time total: 1.442ms
+Self CUDA time total: 104.224us
 
 
 
@@ -4526,27 +4526,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S512_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     907.897us       732.75%     907.897us     907.897us             1  
-                                            torch_eager        10.14%     290.438us        99.82%       2.860ms       2.860ms       0.000us         0.00%     125.694us     125.694us             1  
-                                              aten::mul         5.20%     149.121us         8.91%     255.361us      10.640us      65.405us        52.79%      65.405us       2.725us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      65.405us        52.79%      65.405us       2.725us            24  
-                                            aten::copy_         3.52%     100.835us        71.92%       2.061ms     114.473us      39.359us        31.77%      41.151us       2.286us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.927us        23.35%      28.927us       2.411us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.138us        15.45%      19.138us       1.595us            12  
-                                            aten::clone         0.91%      26.167us        69.31%       1.986ms     330.944us       0.000us         0.00%      12.224us       2.037us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.432us         8.42%      10.432us       1.739us             6  
-                                              aten::sub         1.49%      42.714us         2.30%      65.852us      10.975us       9.601us         7.75%       9.601us       1.600us             6  
-                                              aten::add         1.10%      31.550us         1.85%      52.920us       8.820us       9.537us         7.70%       9.537us       1.589us             6  
-                                Activity Buffer Request        59.69%       1.710ms        59.69%       1.710ms       1.710ms       1.792us         1.45%       1.792us       1.792us             1  
-                                    aten::empty_strided         1.05%      30.111us         1.05%      30.111us       5.018us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         6.49%     186.023us         6.49%     186.023us      31.004us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.15%      61.602us         2.74%      78.501us       3.271us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.59%      16.899us         0.59%      16.899us       0.704us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         7.48%     214.199us         7.48%     214.199us       4.462us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.18%       5.240us         0.18%       5.240us       5.240us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.004ms       809.35%       1.004ms       1.004ms             1  
+                                            torch_eager        20.77%     293.259us        99.59%       1.406ms       1.406ms       0.000us         0.00%     125.785us     125.785us             1  
+                                              aten::mul        11.90%     168.115us        21.00%     296.496us      12.354us      65.470us        52.80%      65.470us       2.728us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      65.470us        52.80%      65.470us       2.728us            24  
+                                            aten::copy_         7.29%     102.893us        38.63%     545.534us      30.307us      39.326us        31.72%      41.117us       2.284us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.831us        23.25%      28.831us       2.403us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.198us        15.48%      19.198us       1.600us            12  
+                                            aten::clone         1.46%      20.620us        31.97%     451.491us      75.249us       0.000us         0.00%      12.286us       2.048us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.495us         8.46%      10.495us       1.749us             6  
+                                              aten::add         2.69%      38.013us         4.51%      63.752us      10.625us       9.599us         7.74%       9.599us       1.600us             6  
+                                              aten::sub         3.27%      46.223us         5.18%      73.182us      12.197us       9.599us         7.74%       9.599us       1.600us             6  
+                                Activity Buffer Request        12.70%     179.315us        12.70%     179.315us     179.315us       1.791us         1.44%       1.791us       1.791us             1  
+                                    aten::empty_strided         2.12%      29.941us         2.12%      29.941us       4.990us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        13.35%     188.535us        13.35%     188.535us      31.423us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.72%      66.700us         5.92%      83.651us       3.485us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.20%      16.951us         1.20%      16.951us       0.706us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        18.12%     255.870us        18.12%     255.870us       5.331us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.41%       5.720us         0.41%       5.720us       5.720us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.865ms
-Self CUDA time total: 123.902us
+Self CPU time total: 1.412ms
+Self CUDA time total: 123.994us
 
 
 
@@ -4556,27 +4556,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S512_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     893.688us       504.20%     893.688us     893.688us             1  
-                                            torch_eager        20.45%     267.171us        99.57%       1.301ms       1.301ms       0.000us         0.00%     180.096us     180.096us             1  
-                                              aten::mul        11.55%     150.960us        19.81%     258.802us      10.783us      94.720us        53.44%      94.720us       3.947us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      94.720us        53.44%      94.720us       3.947us            24  
-                                            aten::copy_         7.76%     101.418us        41.00%     535.738us      29.763us      57.761us        32.59%      60.609us       3.367us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.769us        23.00%      40.769us       3.397us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.767us        13.97%      24.767us       2.064us            12  
-                                            aten::clone         1.49%      19.482us        34.75%     454.110us      75.685us       0.000us         0.00%      19.840us       3.307us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.992us         9.59%      16.992us       2.832us             6  
-                                              aten::add         2.31%      30.129us         3.96%      51.730us       8.622us      12.479us         7.04%      12.479us       2.080us             6  
-                                              aten::sub         2.69%      35.182us         4.51%      58.883us       9.814us      12.288us         6.93%      12.288us       2.048us             6  
-                                Activity Buffer Request        14.52%     189.694us        14.52%     189.694us     189.694us       2.848us         1.61%       2.848us       2.848us             1  
-                                    aten::empty_strided         2.22%      29.022us         2.22%      29.022us       4.837us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        14.04%     183.444us        14.04%     183.444us      30.574us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.81%      62.873us         6.14%      80.203us       3.342us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.33%      17.330us         1.33%      17.330us       0.722us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        16.40%     214.326us        16.40%     214.326us       4.465us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.43%       5.660us         0.43%       5.660us       5.660us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.022ms       574.36%       1.022ms       1.022ms             1  
+                                            torch_eager         9.88%     300.789us        99.79%       3.040ms       3.040ms       0.000us         0.00%     180.741us     180.741us             1  
+                                              aten::mul         5.87%     178.699us        10.24%     311.746us      12.989us      95.331us        53.59%      95.331us       3.972us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      95.331us        53.59%      95.331us       3.972us            24  
+                                            aten::copy_         3.42%     104.263us        70.83%       2.158ms     119.861us      57.731us        32.45%      60.579us       3.366us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.803us        22.94%      40.803us       3.400us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.831us        13.96%      24.831us       2.069us            12  
+                                            aten::clone         0.72%      21.809us        67.85%       2.067ms     344.453us       0.000us         0.00%      19.776us       3.296us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.928us         9.52%      16.928us       2.821us             6  
+                                              aten::add         1.17%      35.582us         2.06%      62.693us      10.449us      12.480us         7.02%      12.480us       2.080us             6  
+                                              aten::sub         1.34%      40.961us         2.23%      67.853us      11.309us      12.351us         6.94%      12.351us       2.059us             6  
+                                Activity Buffer Request        58.86%       1.793ms        58.86%       1.793ms       1.793ms       2.848us         1.60%       2.848us       2.848us             1  
+                                    aten::empty_strided         1.00%      30.461us         1.00%      30.461us       5.077us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         6.14%     187.135us         6.14%     187.135us      31.189us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.22%      67.750us         2.85%      86.690us       3.612us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.62%      18.940us         0.62%      18.940us       0.789us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         8.55%     260.511us         8.55%     260.511us       5.427us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.21%       6.321us         0.21%       6.321us       6.321us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.307ms
-Self CUDA time total: 177.248us
+Self CPU time total: 3.046ms
+Self CUDA time total: 177.893us
 
 
 
@@ -4586,27 +4586,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S512_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     902.974us       302.94%     902.974us     902.974us             1  
-                                            torch_eager        20.55%     268.018us        99.60%       1.299ms       1.299ms       0.000us         0.00%     315.867us     315.867us             1  
-                                              aten::mul        11.45%     149.281us        19.89%     259.364us      10.807us     145.821us        48.92%     145.821us       6.076us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     145.821us        48.92%     145.821us       6.076us            24  
-                                            aten::copy_         7.72%     100.690us        40.33%     526.007us      29.223us     111.006us        37.24%     128.798us       7.155us            18  
-                                            aten::clone         1.57%      20.500us        34.18%     445.716us      74.286us       0.000us         0.00%      71.584us      11.931us             6  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.214us        19.19%      57.214us       4.768us            12  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      53.792us        18.05%      53.792us       8.965us             6  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      41.248us        13.84%      41.248us       3.437us            12  
-                                              aten::sub         2.77%      36.180us         4.59%      59.800us       9.967us      20.672us         6.94%      20.672us       3.445us             6  
-                                              aten::add         2.40%      31.251us         4.19%      54.701us       9.117us      20.576us         6.90%      20.576us       3.429us             6  
-                                Activity Buffer Request        13.76%     179.443us        13.76%     179.443us     179.443us      17.792us         5.97%      17.792us      17.792us             1  
-                                    aten::empty_strided         2.33%      30.440us         2.33%      30.440us       5.073us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        14.06%     183.364us        14.06%     183.364us      30.561us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.83%      63.022us         6.14%      80.061us       3.336us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.31%      17.039us         1.31%      17.039us       0.710us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        16.84%     219.663us        16.84%     219.663us       4.576us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.40%       5.210us         0.40%       5.210us       5.210us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.032ms       346.83%       1.032ms       1.032ms             1  
+                                            torch_eager        10.94%     330.454us        99.81%       3.014ms       3.014ms       0.000us         0.00%     315.361us     315.361us             1  
+                                              aten::mul         5.75%     173.643us        10.12%     305.657us      12.736us     145.696us        48.95%     145.696us       6.071us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     145.696us        48.95%     145.696us       6.071us            24  
+                                            aten::copy_         3.49%     105.289us        69.61%       2.102ms     116.764us     110.818us        37.23%     128.546us       7.141us            18  
+                                            aten::clone         0.91%      27.529us        66.74%       2.015ms     335.845us       0.000us         0.00%      71.233us      11.872us             6  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.313us        19.26%      57.313us       4.776us            12  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      53.505us        17.98%      53.505us       8.918us             6  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      41.119us        13.82%      41.119us       3.427us            12  
+                                              aten::sub         1.33%      40.040us         2.29%      68.991us      11.499us      20.607us         6.92%      20.607us       3.434us             6  
+                                              aten::add         1.18%      35.771us         2.04%      61.541us      10.257us      20.512us         6.89%      20.512us       3.419us             6  
+                                Activity Buffer Request        56.83%       1.716ms        56.83%       1.716ms       1.716ms      17.728us         5.96%      17.728us      17.728us             1  
+                                    aten::empty_strided         1.04%      31.352us         1.04%      31.352us       5.225us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         6.87%     207.436us         6.87%     207.436us      34.573us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.27%      68.592us         2.86%      86.271us       3.595us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.59%      17.679us         0.59%      17.679us       0.737us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         8.61%     259.847us         8.61%     259.847us       5.413us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.19%       5.690us         0.19%       5.690us       5.690us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.304ms
-Self CUDA time total: 298.075us
+Self CPU time total: 3.019ms
+Self CUDA time total: 297.633us
 
 
 
@@ -4616,27 +4616,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S2048_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     892.343us       502.65%     892.343us     892.343us             1  
-                                            torch_eager        20.29%     263.739us        99.57%       1.294ms       1.294ms       0.000us         0.00%     180.376us     180.376us             1  
-                                              aten::mul        11.80%     153.414us        20.02%     260.283us      10.845us      95.163us        53.60%      95.163us       3.965us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      95.163us        53.60%      95.163us       3.965us            24  
-                                            aten::copy_         7.77%     100.952us        40.82%     530.557us      29.475us      57.598us        32.44%      60.446us       3.358us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.672us        22.91%      40.672us       3.389us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.767us        13.95%      24.767us       2.064us            12  
-                                            aten::clone         1.59%      20.651us        34.74%     451.538us      75.256us       0.000us         0.00%      19.774us       3.296us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.926us         9.53%      16.926us       2.821us             6  
-                                              aten::add         2.40%      31.260us         4.02%      52.270us       8.712us      12.447us         7.01%      12.447us       2.074us             6  
-                                              aten::sub         2.80%      36.342us         4.59%      59.652us       9.942us      12.320us         6.94%      12.320us       2.053us             6  
-                                Activity Buffer Request        14.29%     185.813us        14.29%     185.813us     185.813us       2.848us         1.60%       2.848us       2.848us             1  
-                                    aten::empty_strided         2.21%      28.780us         2.21%      28.780us       4.797us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        14.07%     182.883us        14.07%     182.883us      30.480us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.73%      61.469us         6.03%      78.389us       3.266us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.30%      16.920us         1.30%      16.920us       0.705us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        16.32%     212.098us        16.32%     212.098us       4.419us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.43%       5.551us         0.43%       5.551us       5.551us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     997.224us       560.81%     997.224us     997.224us             1  
+                                            torch_eager        20.41%     294.666us        99.60%       1.438ms       1.438ms       0.000us         0.00%     180.699us     180.699us             1  
+                                              aten::mul        11.81%     170.426us        20.82%     300.510us      12.521us      95.263us        53.57%      95.263us       3.969us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      95.263us        53.57%      95.263us       3.969us            24  
+                                            aten::copy_         6.98%     100.734us        40.22%     580.546us      32.253us      57.725us        32.46%      60.604us       3.367us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.895us        23.00%      40.895us       3.408us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.832us        13.96%      24.832us       2.069us            12  
+                                            aten::clone         1.48%      21.402us        34.03%     491.202us      81.867us       0.000us         0.00%      19.709us       3.285us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.830us         9.46%      16.830us       2.805us             6  
+                                              aten::sub         2.72%      39.260us         4.81%      69.410us      11.568us      12.416us         6.98%      12.416us       2.069us             6  
+                                              aten::add         2.46%      35.491us         4.17%      60.132us      10.022us      12.416us         6.98%      12.416us       2.069us             6  
+                                Activity Buffer Request        14.76%     213.066us        14.76%     213.066us     213.066us       2.879us         1.62%       2.879us       2.879us             1  
+                                    aten::empty_strided         2.03%      29.329us         2.03%      29.329us       4.888us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        13.55%     195.534us        13.55%     195.534us      32.589us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.52%      65.271us         5.66%      81.651us       3.402us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.13%      16.380us         1.13%      16.380us       0.683us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        17.74%     256.087us        17.74%     256.087us       5.335us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.40%       5.801us         0.40%       5.801us       5.801us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.300ms
-Self CUDA time total: 177.528us
+Self CPU time total: 1.443ms
+Self CUDA time total: 177.820us
 
 
 
@@ -4646,27 +4646,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S2048_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     904.750us       303.27%     904.750us     904.750us             1  
-                                            torch_eager        21.10%     268.921us        99.56%       1.269ms       1.269ms       0.000us         0.00%     316.413us     316.413us             1  
-                                              aten::mul        11.89%     151.482us        20.35%     259.306us      10.804us     145.920us        48.91%     145.920us       6.080us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     145.920us        48.91%     145.920us       6.080us            24  
-                                            aten::copy_         8.00%     101.944us        39.08%     497.980us      27.666us     111.613us        37.41%     129.693us       7.205us            18  
-                                            aten::clone         1.62%      20.659us        32.72%     416.907us      69.484us       0.000us         0.00%      72.512us      12.085us             6  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.181us        19.17%      57.181us       4.765us            12  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      54.432us        18.25%      54.432us       9.072us             6  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      40.800us        13.68%      40.800us       3.400us            12  
-                                              aten::add         2.47%      31.422us         4.17%      53.191us       8.865us      20.512us         6.88%      20.512us       3.419us             6  
-                                              aten::sub         2.72%      34.608us         4.46%      56.810us       9.468us      20.288us         6.80%      20.288us       3.381us             6  
-                                Activity Buffer Request        10.32%     131.451us        10.32%     131.451us     131.451us      18.080us         6.06%      18.080us      18.080us             1  
-                                    aten::empty_strided         2.44%      31.142us         2.44%      31.142us       5.190us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        15.74%     200.533us        15.74%     200.533us      33.422us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.99%      63.539us         6.33%      80.621us       3.359us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.34%      17.082us         1.34%      17.082us       0.712us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        16.94%     215.847us        16.94%     215.847us       4.497us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.44%       5.651us         0.44%       5.651us       5.651us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.006ms       337.57%       1.006ms       1.006ms             1  
+                                            torch_eager        19.59%     286.301us        99.64%       1.456ms       1.456ms       0.000us         0.00%     316.065us     316.065us             1  
+                                              aten::mul        11.95%     174.684us        20.79%     303.846us      12.660us     145.439us        48.82%     145.439us       6.060us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     145.439us        48.82%     145.439us       6.060us            24  
+                                            aten::copy_         7.17%     104.791us        40.98%     598.785us      33.266us     111.649us        37.48%     129.826us       7.213us            18  
+                                            aten::clone         1.35%      19.700us        34.46%     503.492us      83.915us       0.000us         0.00%      72.450us      12.075us             6  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.376us        19.26%      57.376us       4.781us            12  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      54.273us        18.22%      54.273us       9.045us             6  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      40.800us        13.70%      40.800us       3.400us            12  
+                                              aten::sub         2.64%      38.590us         4.63%      67.691us      11.282us      20.448us         6.86%      20.448us       3.408us             6  
+                                              aten::add         2.53%      37.009us         4.42%      64.522us      10.754us      20.352us         6.83%      20.352us       3.392us             6  
+                                Activity Buffer Request        15.77%     230.486us        15.77%     230.486us     230.486us      18.177us         6.10%      18.177us      18.177us             1  
+                                    aten::empty_strided         2.02%      29.450us         2.02%      29.450us       4.908us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        12.98%     189.676us        12.98%     189.676us      31.613us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.60%      67.290us         5.86%      85.691us       3.570us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.26%      18.401us         1.26%      18.401us       0.767us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        17.77%     259.608us        17.77%     259.608us       5.409us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.36%       5.300us         0.36%       5.300us       5.300us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.274ms
-Self CUDA time total: 298.333us
+Self CPU time total: 1.461ms
+Self CUDA time total: 297.888us
 
 
 
@@ -4676,27 +4676,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S2048_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     937.429us       158.23%     937.429us     937.429us             1  
-                                            torch_eager        20.59%     270.182us        99.56%       1.306ms       1.306ms       0.000us         0.00%     616.217us     616.217us             1  
-                                            aten::copy_         7.81%     102.469us        38.72%     508.048us      28.225us     278.876us        47.07%     302.652us      16.814us            18  
-                                              aten::mul        12.17%     159.666us        21.13%     277.305us      11.554us     247.965us        41.85%     247.965us      10.332us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     247.965us        41.85%     247.965us      10.332us            24  
-                                            aten::clone         1.57%      20.662us        32.21%     422.588us      70.431us       0.000us         0.00%     211.645us      35.274us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     187.869us        31.71%     187.869us      31.312us             6  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      91.007us        15.36%      91.007us       7.584us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      65.600us        11.07%      65.600us       5.467us            12  
-                                              aten::sub         2.87%      37.612us         4.70%      61.651us      10.275us      32.896us         5.55%      32.896us       5.483us             6  
-                                              aten::add         2.50%      32.870us         4.35%      57.120us       9.520us      32.704us         5.52%      32.704us       5.451us             6  
-                                Activity Buffer Request        12.01%     157.623us        12.01%     157.623us     157.623us      23.776us         4.01%      23.776us      23.776us             1  
-                                    aten::empty_strided         2.36%      30.970us         2.36%      30.970us       5.162us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        13.83%     181.533us        13.83%     181.533us      30.256us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.81%      63.053us         6.13%      80.473us       3.353us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.33%      17.420us         1.33%      17.420us       0.726us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        17.71%     232.351us        17.71%     232.351us       4.841us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.44%       5.770us         0.44%       5.770us       5.770us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.028ms       173.98%       1.028ms       1.028ms             1  
+                                            torch_eager        20.54%     307.941us        99.60%       1.493ms       1.493ms       0.000us         0.00%     614.720us     614.720us             1  
+                                            aten::copy_         6.96%     104.343us        39.80%     596.653us      33.147us     277.825us        47.01%     301.537us      16.752us            18  
+                                              aten::mul        11.79%     176.823us        20.94%     313.949us      13.081us     247.103us        41.81%     247.103us      10.296us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     247.103us        41.81%     247.103us      10.296us            24  
+                                            aten::clone         1.37%      20.599us        33.38%     500.402us      83.400us       0.000us         0.00%     210.816us      35.136us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     187.104us        31.66%     187.104us      31.184us             6  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      90.721us        15.35%      90.721us       7.560us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      66.080us        11.18%      66.080us       5.507us            12  
+                                              aten::add         2.50%      37.532us         4.42%      66.263us      11.044us      33.056us         5.59%      33.056us       5.509us             6  
+                                              aten::sub         2.78%      41.622us         4.80%      72.031us      12.005us      33.024us         5.59%      33.024us       5.504us             6  
+                                Activity Buffer Request        15.35%     230.085us        15.35%     230.085us     230.085us      23.712us         4.01%      23.712us      23.712us             1  
+                                    aten::empty_strided         1.95%      29.191us         1.95%      29.191us       4.865us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        12.52%     187.674us        12.52%     187.674us      31.279us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.56%      68.431us         5.78%      86.660us       3.611us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.22%      18.229us         1.22%      18.229us       0.760us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        18.06%     270.817us        18.06%     270.817us       5.642us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.40%       6.020us         0.40%       6.020us       6.020us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.312ms
-Self CUDA time total: 592.441us
+Self CPU time total: 1.499ms
+Self CUDA time total: 591.008us
 
 
 
@@ -4706,55 +4706,61 @@ PROFILE TRACE: torch_eager | cuda_B2_S2048_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         7.94%     296.330us        77.11%       2.877ms       2.877ms       0.000us         0.00%       1.855ms       1.855ms             1  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.826ms       102.11%       1.826ms       1.826ms             1  
-                                            aten::copy_         2.78%     103.676us        55.32%       2.064ms     114.670us     804.351us        44.97%     870.879us      48.382us            18  
-                                              aten::mul         4.03%     150.284us         7.00%     261.125us      10.880us     838.429us        46.87%     838.429us      34.935us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     838.429us        46.87%     838.429us      34.935us            24  
-                                            aten::clone         0.73%      27.418us        53.07%       1.980ms     330.025us       0.000us         0.00%     619.584us     103.264us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     553.056us        30.92%     553.056us      92.176us             6  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     251.295us        14.05%     251.295us      20.941us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     145.983us         8.16%     145.983us      12.165us            12  
-                                              aten::sub         1.02%      38.061us         1.68%      62.715us      10.453us      87.583us         4.90%      87.583us      14.597us             6  
-                                Activity Buffer Request        44.74%       1.669ms        44.74%       1.669ms       1.669ms      66.528us         3.72%      66.528us      66.528us             1  
-                                              aten::add         0.88%      32.700us         1.50%      55.970us       9.328us      58.400us         3.26%      58.400us       9.733us             6  
-                                    aten::empty_strided         0.81%      30.382us         0.81%      30.382us       5.064us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         5.84%     217.963us         5.84%     217.963us      36.327us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.66%      61.939us         2.12%      79.030us       3.293us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.46%      17.091us         0.46%      17.091us       0.712us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         6.21%     231.875us         6.21%     231.875us       4.831us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize        22.89%     854.084us        22.89%     854.084us     854.084us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager        13.10%     290.757us        64.42%       1.430ms       1.430ms       0.000us         0.00%       1.858ms       1.858ms             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.829ms       102.08%       1.829ms       1.829ms             1  
+                                            aten::copy_         4.50%      99.996us        26.00%     577.207us      32.067us     806.683us        45.02%     872.922us      48.496us            18  
+                                              aten::mul         7.46%     165.620us        13.35%     296.315us      12.346us     837.369us        46.73%     837.369us      34.890us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     837.369us        46.73%     837.369us      34.890us            24  
+                                            aten::clone         0.93%      20.729us        21.91%     486.420us      81.070us       0.000us         0.00%     620.507us     103.418us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     554.268us        30.93%     554.268us      92.378us             6  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     252.415us        14.09%     252.415us      21.035us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     147.935us         8.26%     147.935us      12.328us            12  
+                                              aten::sub         1.77%      39.361us         2.98%      66.203us      11.034us      89.536us         5.00%      89.536us      14.923us             6  
+                                Activity Buffer Request         8.80%     195.395us         8.80%     195.395us     195.395us      66.239us         3.70%      66.239us      66.239us             1  
+                                              aten::add         1.55%      34.513us         2.81%      62.423us      10.404us      58.399us         3.26%      58.399us       9.733us             6  
+                                    aten::empty_strided         1.34%      29.799us         1.34%      29.799us       4.967us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         9.39%     208.344us         9.39%     208.344us      34.724us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.12%      69.193us         3.90%      86.553us       3.606us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.78%      17.360us         0.78%      17.360us       0.723us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        11.66%     258.919us        11.66%     258.919us       5.394us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize        35.58%     789.949us        35.58%     789.949us     789.949us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 3.731ms
-Self CUDA time total: 1.789ms
+Self CPU time total: 2.220ms
+Self CUDA time total: 1.792ms
 
 
 impl                     wl                  p50(ms)  ok
-torch_eager              cuda_B1_S128_H32_D128_R64     0.22  True
-torch_eager              cuda_B1_S128_H32_D64_R32     0.22  True
-torch_eager              cuda_B1_S128_H8_D128_R64     0.22  True
-torch_eager              cuda_B1_S128_H8_D64_R32     0.18  True
-torch_eager              cuda_B1_S2048_H32_D128_R64     0.22  True
-torch_eager              cuda_B1_S2048_H32_D64_R32     0.21  True
-torch_eager              cuda_B1_S2048_H8_D128_R64     0.21  True
-torch_eager              cuda_B1_S2048_H8_D64_R32     0.21  True
-torch_eager              cuda_B1_S512_H32_D128_R64     0.21  True
-torch_eager              cuda_B1_S512_H32_D64_R32     0.21  True
-torch_eager              cuda_B1_S512_H8_D128_R64     0.21  True
-torch_eager              cuda_B1_S512_H8_D64_R32     0.22  True
-torch_eager              cuda_B2_S128_H32_D128_R64     0.22  True
-torch_eager              cuda_B2_S128_H32_D64_R32     0.22  True
-torch_eager              cuda_B2_S128_H8_D128_R64     0.22  True
-torch_eager              cuda_B2_S128_H8_D64_R32     0.21  True
+torch_eager              cuda_B1_S128_H32_D128_R64     0.27  True
+torch_eager              cuda_B1_S128_H32_D64_R32     0.25  True
+torch_eager              cuda_B1_S128_H8_D128_R64     0.25  True
+torch_eager              cuda_B1_S128_H8_D64_R32     0.19  True
+torch_eager              cuda_B1_S2048_H32_D128_R64     0.25  True
+torch_eager              cuda_B1_S2048_H32_D64_R32     0.25  True
+torch_eager              cuda_B1_S2048_H8_D128_R64     0.24  True
+torch_eager              cuda_B1_S2048_H8_D64_R32     0.25  True
+torch_eager              cuda_B1_S512_H32_D128_R64     0.24  True
+torch_eager              cuda_B1_S512_H32_D64_R32     0.24  True
+torch_eager              cuda_B1_S512_H8_D128_R64     0.24  True
+torch_eager              cuda_B1_S512_H8_D64_R32     0.25  True
+torch_eager              cuda_B2_S128_H32_D128_R64     0.24  True
+torch_eager              cuda_B2_S128_H32_D64_R32     0.25  True
+torch_eager              cuda_B2_S128_H8_D128_R64     0.25  True
+torch_eager              cuda_B2_S128_H8_D64_R32     0.24  True
 torch_eager              cuda_B2_S2048_H32_D128_R64     0.65  True
-torch_eager              cuda_B2_S2048_H32_D64_R32     0.23  True
-torch_eager              cuda_B2_S2048_H8_D128_R64     0.21  True
-torch_eager              cuda_B2_S2048_H8_D64_R32     0.22  True
-torch_eager              cuda_B2_S512_H32_D128_R64     0.21  True
-torch_eager              cuda_B2_S512_H32_D64_R32     0.22  True
-torch_eager              cuda_B2_S512_H8_D128_R64     0.22  True
-torch_eager              cuda_B2_S512_H8_D64_R32     0.22  True
+torch_eager              cuda_B2_S2048_H32_D64_R32     0.25  True
+torch_eager              cuda_B2_S2048_H8_D128_R64     0.25  True
+torch_eager              cuda_B2_S2048_H8_D64_R32     0.24  True
+torch_eager              cuda_B2_S512_H32_D128_R64     0.25  True
+torch_eager              cuda_B2_S512_H32_D64_R32     0.24  True
+torch_eager              cuda_B2_S512_H8_D128_R64     0.24  True
+torch_eager              cuda_B2_S512_H8_D64_R32     0.24  True
 
+
+
▶ UV Install Logs
+ +

Artifacts:

rotary.jsonl