diff --git "a/causal_conv1d/impls/torch_causal_conv1d.html" "b/causal_conv1d/impls/torch_causal_conv1d.html" --- "a/causal_conv1d/impls/torch_causal_conv1d.html" +++ "b/causal_conv1d/impls/torch_causal_conv1d.html" @@ -3874,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
Generated on:
- Linux x86_64 | Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35 + Linux x86_64 | Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35
@@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content { ▼ output ▶ uv-logs | -Cell: nv | 0.22s +Cell: nv | 0.25s | Raw @@ -3904,16 +3904,16 @@ Cell: nv | 0.22s
-
Mon Nov 10 21:57:49 2025       
+
Fri Dec 19 18:56:39 2025       
 +-----------------------------------------------------------------------------------------+
-| NVIDIA-SMI 580.95.05              Driver Version: 580.95.05      CUDA Version: 13.0     |
+| NVIDIA-SMI 580.105.08             Driver Version: 580.105.08     CUDA Version: 13.0     |
 +-----------------------------------------+------------------------+----------------------+
 | GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
 | Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   27C    P0             77W /  350W |       0MiB /  46068MiB |     18%      Default |
+| N/A   33C    P0             78W /  350W |       0MiB /  46068MiB |     11%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -3937,7 +3937,7 @@ Cell: nv | 0.22s
 ▼ output
  ▶ uv-logs
  | 
-Cell: benchmark | 3.89s
+Cell: benchmark | 3.84s
  | 
 
 Raw
@@ -3999,29 +3999,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     411.136us      2127.15%     411.136us     411.136us             1  
-                                            torch_eager         8.60%     205.173us        99.40%       2.372ms       2.372ms       0.000us         0.00%      21.632us      21.632us             1  
-                                               aten::to         0.40%       9.649us        83.06%       1.982ms     330.358us       0.000us         0.00%      14.272us       2.379us             6  
-                                         aten::_to_copy         1.47%      35.141us        82.65%       1.973ms     328.750us       0.000us         0.00%      14.272us       2.379us             6  
-                                            aten::copy_         2.42%      57.830us        79.13%       1.889ms     314.753us      11.968us        61.92%      14.272us       2.379us             6  
-                                           aten::conv1d         0.32%       7.640us         6.22%     148.384us      49.461us       0.000us         0.00%       7.360us       2.453us             3  
-                                      aten::convolution         0.55%      13.222us         5.90%     140.744us      46.915us       0.000us         0.00%       7.360us       2.453us             3  
-                                     aten::_convolution         1.23%      29.427us         5.34%     127.522us      42.507us       0.000us         0.00%       7.360us       2.453us             3  
-                                aten::_conv_depthwise2d         1.41%      33.690us         3.44%      82.073us      27.358us       7.360us        38.08%       7.360us       2.453us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.360us        38.08%       7.360us       2.453us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.304us        32.62%       6.304us       2.101us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.664us        29.30%       5.664us       1.888us             3  
-                                Activity Buffer Request        73.85%       1.762ms        73.85%       1.762ms       1.762ms       2.304us        11.92%       2.304us       2.304us             1  
-                                    aten::empty_strided         2.05%      48.841us         2.05%      48.841us       8.140us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         3.88%      92.484us         3.88%      92.484us      10.276us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.94%      22.551us         1.23%      29.352us       3.261us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.46%      10.991us         0.46%      10.991us       0.733us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.53%      12.660us         0.53%      12.660us       4.220us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.49%      11.631us         0.49%      11.631us       3.877us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.27%       6.340us         0.32%       7.570us       2.523us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     420.863us      2177.59%     420.863us     420.863us             1  
+                                            torch_eager         8.44%     198.071us        99.39%       2.333ms       2.333ms       0.000us         0.00%      21.695us      21.695us             1  
+                                               aten::to         0.42%       9.821us        82.68%       1.941ms     323.474us       0.000us         0.00%      14.367us       2.395us             6  
+                                         aten::_to_copy         1.47%      34.509us        82.26%       1.931ms     321.837us       0.000us         0.00%      14.367us       2.395us             6  
+                                            aten::copy_         2.58%      60.614us        78.68%       1.847ms     307.834us      11.999us        62.08%      14.367us       2.395us             6  
+                                           aten::conv1d         0.31%       7.291us         6.62%     155.424us      51.808us       0.000us         0.00%       7.328us       2.443us             3  
+                                      aten::convolution         0.56%      13.171us         6.31%     148.133us      49.378us       0.000us         0.00%       7.328us       2.443us             3  
+                                     aten::_convolution         1.45%      33.972us         5.75%     134.962us      44.987us       0.000us         0.00%       7.328us       2.443us             3  
+                                aten::_conv_depthwise2d         1.44%      33.841us         3.61%      84.652us      28.217us       7.328us        37.92%       7.328us       2.443us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.328us        37.92%       7.328us       2.443us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.335us        32.78%       6.335us       2.112us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.664us        29.31%       5.664us       1.888us             3  
+                                Activity Buffer Request        73.26%       1.720ms        73.26%       1.720ms       1.720ms       2.368us        12.25%       2.368us       2.368us             1  
+                                    aten::empty_strided         2.11%      49.511us         2.11%      49.511us       8.252us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         3.97%      93.272us         3.97%      93.272us      10.364us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         1.01%      23.716us         1.33%      31.137us       3.460us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.50%      11.722us         0.50%      11.722us       0.781us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.50%      11.630us         0.50%      11.630us       3.877us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.53%      12.550us         0.53%      12.550us       4.183us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.29%       6.740us         0.34%       8.071us       2.690us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.386ms
-Self CUDA time total: 19.328us
+Self CPU time total: 2.348ms
+Self CUDA time total: 19.327us
 
 
 
@@ -4031,29 +4031,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     320.094us      1629.14%     320.094us     320.094us             1  
-                                            torch_eager         6.61%     147.267us        99.75%       2.222ms       2.222ms       0.000us         0.00%      21.856us      21.856us             1  
-                                               aten::to         0.28%       6.328us        86.86%       1.935ms     322.525us       0.000us         0.00%      13.888us       2.315us             6  
-                                         aten::_to_copy         0.99%      22.058us        86.58%       1.929ms     321.470us       0.000us         0.00%      13.888us       2.315us             6  
-                                            aten::copy_         2.09%      46.581us        84.13%       1.874ms     312.384us      11.680us        59.45%      13.888us       2.315us             6  
-                                           aten::conv1d         0.26%       5.880us         5.20%     115.901us      38.634us       0.000us         0.00%       7.968us       2.656us             3  
-                                      aten::convolution         0.41%       9.201us         4.94%     110.021us      36.674us       0.000us         0.00%       7.968us       2.656us             3  
-                                     aten::_convolution         0.99%      22.029us         4.53%     100.820us      33.607us       0.000us         0.00%       7.968us       2.656us             3  
-                                aten::_conv_depthwise2d         0.98%      21.809us         2.84%      63.210us      21.070us       7.968us        40.55%       7.968us       2.656us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.968us        40.55%       7.968us       2.656us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.112us        31.11%       6.112us       2.037us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.568us        28.34%       5.568us       1.856us             3  
-                                Activity Buffer Request        79.89%       1.780ms        79.89%       1.780ms       1.780ms       2.208us        11.24%       2.208us       2.208us             1  
-                                    aten::empty_strided         1.46%      32.461us         1.46%      32.461us       5.410us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         3.22%      71.802us         3.22%      71.802us       7.978us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.71%      15.809us         0.93%      20.750us       2.306us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.38%       8.492us         0.38%       8.492us       0.566us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.41%       9.081us         0.41%       9.081us       3.027us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.38%       8.530us         0.38%       8.530us       2.843us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.26%       5.730us         0.32%       7.140us       2.380us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     313.250us      1596.75%     313.250us     313.250us             1  
+                                            torch_eager         6.72%     145.881us        99.75%       2.164ms       2.164ms       0.000us         0.00%      21.826us      21.826us             1  
+                                               aten::to         0.27%       5.881us        86.54%       1.878ms     312.937us       0.000us         0.00%      13.921us       2.320us             6  
+                                         aten::_to_copy         1.01%      21.932us        86.27%       1.872ms     311.957us       0.000us         0.00%      13.921us       2.320us             6  
+                                            aten::copy_         2.19%      47.440us        83.99%       1.822ms     303.712us      11.713us        59.71%      13.921us       2.320us             6  
+                                           aten::conv1d         0.28%       6.160us         5.31%     115.143us      38.381us       0.000us         0.00%       7.905us       2.635us             3  
+                                      aten::convolution         0.41%       8.901us         5.02%     108.983us      36.328us       0.000us         0.00%       7.905us       2.635us             3  
+                                     aten::_convolution         1.01%      21.821us         4.61%     100.082us      33.361us       0.000us         0.00%       7.905us       2.635us             3  
+                                aten::_conv_depthwise2d         0.97%      21.100us         2.86%      62.101us      20.700us       7.905us        40.29%       7.905us       2.635us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.905us        40.29%       7.905us       2.635us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.113us        31.16%       6.113us       2.038us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.600us        28.55%       5.600us       1.867us             3  
+                                Activity Buffer Request        79.65%       1.728ms        79.65%       1.728ms       1.728ms       2.208us        11.25%       2.208us       2.208us             1  
+                                    aten::empty_strided         1.27%      27.539us         1.27%      27.539us       4.590us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         3.21%      69.681us         3.21%      69.681us       7.742us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.78%      16.828us         1.04%      22.610us       2.512us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.44%       9.512us         0.44%       9.512us       0.634us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.43%       9.331us         0.43%       9.331us       3.110us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.41%       8.800us         0.41%       8.800us       2.933us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.26%       5.580us         0.33%       7.220us       2.407us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.228ms
-Self CUDA time total: 19.648us
+Self CPU time total: 2.170ms
+Self CUDA time total: 19.618us
 
 
 
@@ -4063,29 +4063,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     322.750us      1724.09%     322.750us     322.750us             1  
-                                            torch_eager         6.97%     154.353us        99.74%       2.208ms       2.208ms       0.000us         0.00%      20.736us      20.736us             1  
-                                               aten::to         0.30%       6.580us        86.44%       1.913ms     318.849us       0.000us         0.00%      13.791us       2.299us             6  
-                                         aten::_to_copy         1.09%      24.161us        86.14%       1.907ms     317.752us       0.000us         0.00%      13.791us       2.299us             6  
-                                            aten::copy_         2.12%      46.909us        83.64%       1.851ms     308.533us      11.775us        62.90%      13.791us       2.299us             6  
-                                           aten::conv1d         0.30%       6.591us         5.18%     114.662us      38.221us       0.000us         0.00%       6.945us       2.315us             3  
-                                      aten::convolution         0.40%       8.811us         4.88%     108.071us      36.024us       0.000us         0.00%       6.945us       2.315us             3  
-                                     aten::_convolution         0.96%      21.188us         4.48%      99.260us      33.087us       0.000us         0.00%       6.945us       2.315us             3  
-                                aten::_conv_depthwise2d         0.97%      21.520us         2.82%      62.461us      20.820us       6.945us        37.10%       6.945us       2.315us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       6.945us        37.10%       6.945us       2.315us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.047us        32.30%       6.047us       2.016us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.728us        30.60%       5.728us       1.909us             3  
-                                Activity Buffer Request        79.41%       1.758ms        79.41%       1.758ms       1.758ms       2.016us        10.77%       2.016us       2.016us             1  
-                                    aten::empty_strided         1.41%      31.151us         1.41%      31.151us       5.192us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         3.17%      70.153us         3.17%      70.153us       7.795us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.77%      17.060us         1.01%      22.310us       2.479us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.39%       8.641us         0.39%       8.641us       0.576us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.42%       9.380us         0.42%       9.380us       3.127us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.37%       8.090us         0.37%       8.090us       2.697us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.25%       5.450us         0.31%       6.801us       2.267us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     311.776us      1662.54%     311.776us     311.776us             1  
+                                            torch_eager         6.35%     136.063us        99.73%       2.136ms       2.136ms       0.000us         0.00%      20.769us      20.769us             1  
+                                               aten::to         0.26%       5.629us        86.85%       1.860ms     310.039us       0.000us         0.00%      13.823us       2.304us             6  
+                                         aten::_to_copy         0.98%      21.032us        86.59%       1.855ms     309.100us       0.000us         0.00%      13.823us       2.304us             6  
+                                            aten::copy_         2.10%      44.987us        84.33%       1.806ms     301.043us      11.807us        62.96%      13.823us       2.304us             6  
+                                           aten::conv1d         0.27%       5.831us         5.33%     114.232us      38.077us       0.000us         0.00%       6.946us       2.315us             3  
+                                      aten::convolution         0.40%       8.671us         5.06%     108.401us      36.134us       0.000us         0.00%       6.946us       2.315us             3  
+                                     aten::_convolution         0.98%      20.999us         4.66%      99.730us      33.243us       0.000us         0.00%       6.946us       2.315us             3  
+                                aten::_conv_depthwise2d         0.97%      20.860us         2.94%      63.051us      21.017us       6.946us        37.04%       6.946us       2.315us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       6.946us        37.04%       6.946us       2.315us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.047us        32.25%       6.047us       2.016us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.760us        30.72%       5.760us       1.920us             3  
+                                Activity Buffer Request        79.94%       1.712ms        79.94%       1.712ms       1.712ms       2.016us        10.75%       2.016us       2.016us             1  
+                                    aten::empty_strided         1.28%      27.310us         1.28%      27.310us       4.552us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         3.44%      73.575us         3.44%      73.575us       8.175us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.79%      16.901us         1.05%      22.540us       2.504us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.42%       9.019us         0.42%       9.019us       0.601us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.43%       9.230us         0.43%       9.230us       3.077us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.40%       8.580us         0.40%       8.580us       2.860us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.25%       5.250us         0.31%       6.610us       2.203us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.213ms
-Self CUDA time total: 18.720us
+Self CPU time total: 2.142ms
+Self CUDA time total: 18.753us
 
 
 
@@ -4095,29 +4095,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     328.254us      1673.31%     328.254us     328.254us             1  
-                                            torch_eager         6.02%     146.742us        99.79%       2.431ms       2.431ms       0.000us         0.00%      21.729us      21.729us             1  
-                                               aten::to         0.25%       6.201us        87.89%       2.141ms     356.794us       0.000us         0.00%      14.048us       2.341us             6  
-                                         aten::_to_copy         0.95%      23.051us        87.64%       2.135ms     355.761us       0.000us         0.00%      14.048us       2.341us             6  
-                                            aten::copy_         1.93%      46.899us        85.39%       2.080ms     346.662us      11.936us        60.85%      14.048us       2.341us             6  
-                                           aten::conv1d         0.28%       6.941us         4.83%     117.552us      39.184us       0.000us         0.00%       7.681us       2.560us             3  
-                                      aten::convolution         0.38%       9.320us         4.54%     110.611us      36.870us       0.000us         0.00%       7.681us       2.560us             3  
-                                     aten::_convolution         0.86%      20.861us         4.16%     101.291us      33.764us       0.000us         0.00%       7.681us       2.560us             3  
-                                aten::_conv_depthwise2d         0.93%      22.752us         2.67%      64.991us      21.664us       7.681us        39.15%       7.681us       2.560us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.681us        39.15%       7.681us       2.560us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.208us        31.65%       6.208us       2.069us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.728us        29.20%       5.728us       1.909us             3  
-                                Activity Buffer Request        75.50%       1.839ms        75.50%       1.839ms       1.839ms       2.112us        10.77%       2.112us       2.112us             1  
-                                    aten::empty_strided         1.29%      31.540us         1.29%      31.540us       5.257us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         8.87%     216.103us         8.87%     216.103us      24.011us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.70%      16.989us         0.90%      21.970us       2.441us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.35%       8.601us         0.35%       8.601us       0.573us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.43%      10.359us         0.43%      10.359us       3.453us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.40%       9.840us         0.40%       9.840us       3.280us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.22%       5.410us         0.28%       6.920us       2.307us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     315.647us      1619.78%     315.647us     315.647us             1  
+                                            torch_eager         6.63%     154.700us        99.79%       2.328ms       2.328ms       0.000us         0.00%      21.599us      21.599us             1  
+                                               aten::to         0.26%       6.031us        87.19%       2.034ms     338.971us       0.000us         0.00%      13.983us       2.331us             6  
+                                         aten::_to_copy         0.93%      21.742us        86.93%       2.028ms     337.966us       0.000us         0.00%      13.983us       2.331us             6  
+                                            aten::copy_         2.02%      47.211us        84.82%       1.979ms     329.757us      11.871us        60.92%      13.983us       2.331us             6  
+                                           aten::conv1d         0.25%       5.920us         4.90%     114.312us      38.104us       0.000us         0.00%       7.616us       2.539us             3  
+                                      aten::convolution         0.39%       8.981us         4.65%     108.392us      36.131us       0.000us         0.00%       7.616us       2.539us             3  
+                                     aten::_convolution         0.89%      20.821us         4.26%      99.411us      33.137us       0.000us         0.00%       7.616us       2.539us             3  
+                                aten::_conv_depthwise2d         0.89%      20.790us         2.71%      63.250us      21.083us       7.616us        39.08%       7.616us       2.539us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.616us        39.08%       7.616us       2.539us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.143us        31.52%       6.143us       2.048us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.728us        29.39%       5.728us       1.909us             3  
+                                Activity Buffer Request        71.63%       1.671ms        71.63%       1.671ms       1.671ms       2.112us        10.84%       2.112us       2.112us             1  
+                                    aten::empty_strided         1.18%      27.509us         1.18%      27.509us       4.585us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        12.15%     283.514us        12.15%     283.514us      31.502us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.70%      16.421us         0.91%      21.332us       2.370us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.37%       8.571us         0.37%       8.571us       0.571us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.43%      10.140us         0.43%      10.140us       3.380us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.39%       9.150us         0.39%       9.150us       3.050us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.23%       5.450us         0.30%       7.100us       2.367us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.436ms
-Self CUDA time total: 19.617us
+Self CPU time total: 2.333ms
+Self CUDA time total: 19.487us
 
 
 
@@ -4127,29 +4127,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     325.374us      1318.69%     325.374us     325.374us             1  
-                                            torch_eager         6.23%     145.210us        99.78%       2.326ms       2.326ms       0.000us         0.00%      26.978us      26.978us             1  
-                                               aten::to         0.28%       6.471us        87.58%       2.041ms     340.232us       0.000us         0.00%      15.298us       2.550us             6  
-                                         aten::_to_copy         1.01%      23.559us        87.30%       2.035ms     339.154us       0.000us         0.00%      15.298us       2.550us             6  
-                                            aten::copy_         2.04%      47.563us        85.03%       1.982ms     330.320us      12.994us        52.66%      15.298us       2.550us             6  
-                                           aten::conv1d         0.26%       6.060us         4.91%     114.341us      38.114us       0.000us         0.00%      11.680us       3.893us             3  
-                                      aten::convolution         0.40%       9.250us         4.65%     108.281us      36.094us       0.000us         0.00%      11.680us       3.893us             3  
-                                     aten::_convolution         0.89%      20.669us         4.25%      99.031us      33.010us       0.000us         0.00%      11.680us       3.893us             3  
-                                aten::_conv_depthwise2d         0.95%      22.039us         2.73%      63.550us      21.183us      11.680us        47.34%      11.680us       3.893us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      11.680us        47.34%      11.680us       3.893us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.657us        26.98%       6.657us       2.219us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.337us        25.68%       6.337us       2.112us             3  
-                                Activity Buffer Request        74.59%       1.739ms        74.59%       1.739ms       1.739ms       2.304us         9.34%       2.304us       2.304us             1  
-                                    aten::empty_strided         1.26%      29.442us         1.26%      29.442us       4.907us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         9.39%     218.802us         9.39%     218.802us      24.311us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.69%      16.041us         0.91%      21.173us       2.353us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.37%       8.602us         0.37%       8.602us       0.573us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.40%       9.341us         0.40%       9.341us       3.114us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.39%       8.990us         0.39%       8.990us       2.997us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.23%       5.290us         0.28%       6.580us       2.193us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     316.162us      1286.41%     316.162us     316.162us             1  
+                                            torch_eager         6.08%     142.813us        99.78%       2.344ms       2.344ms       0.000us         0.00%      26.881us      26.881us             1  
+                                               aten::to         0.26%       6.099us        87.82%       2.063ms     343.816us       0.000us         0.00%      15.329us       2.555us             6  
+                                         aten::_to_copy         0.96%      22.501us        87.56%       2.057ms     342.799us       0.000us         0.00%      15.329us       2.555us             6  
+                                            aten::copy_         2.00%      46.941us        85.41%       2.006ms     334.376us      13.025us        53.00%      15.329us       2.555us             6  
+                                           aten::conv1d         0.27%       6.279us         4.78%     112.281us      37.427us       0.000us         0.00%      11.552us       3.851us             3  
+                                      aten::convolution         0.37%       8.701us         4.51%     106.002us      35.334us       0.000us         0.00%      11.552us       3.851us             3  
+                                     aten::_convolution         0.88%      20.659us         4.14%      97.301us      32.434us       0.000us         0.00%      11.552us       3.851us             3  
+                                aten::_conv_depthwise2d         0.89%      20.940us         2.63%      61.791us      20.597us      11.552us        47.00%      11.552us       3.851us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      11.552us        47.00%      11.552us       3.851us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.689us        27.22%       6.689us       2.230us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.336us        25.78%       6.336us       2.112us             3  
+                                Activity Buffer Request        74.92%       1.760ms        74.92%       1.760ms       1.760ms       2.304us         9.37%       2.304us       2.304us             1  
+                                    aten::empty_strided         1.19%      28.040us         1.19%      28.040us       4.673us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         9.45%     222.075us         9.45%     222.075us      24.675us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.70%      16.419us         0.91%      21.470us       2.386us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.36%       8.480us         0.36%       8.480us       0.565us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.40%       9.310us         0.40%       9.310us       3.103us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.38%       8.880us         0.38%       8.880us       2.960us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.24%       5.612us         0.29%       6.911us       2.304us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.331ms
-Self CUDA time total: 24.674us
+Self CPU time total: 2.349ms
+Self CUDA time total: 24.577us
 
 
 
@@ -4159,29 +4159,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     325.853us      1241.91%     325.853us     325.853us             1  
-                                            torch_eager         6.02%     142.382us        99.78%       2.359ms       2.359ms       0.000us         0.00%      28.510us      28.510us             1  
-                                               aten::to         0.27%       6.279us        87.80%       2.076ms     345.959us       0.000us         0.00%      15.262us       2.544us             6  
-                                         aten::_to_copy         0.97%      22.980us        87.54%       2.069ms     344.912us       0.000us         0.00%      15.262us       2.544us             6  
-                                            aten::copy_         2.02%      47.672us        85.33%       2.017ms     336.189us      12.990us        49.51%      15.262us       2.544us             6  
-                                           aten::conv1d         0.27%       6.391us         4.88%     115.262us      38.421us       0.000us         0.00%      13.248us       4.416us             3  
-                                      aten::convolution         0.41%       9.629us         4.61%     108.871us      36.290us       0.000us         0.00%      13.248us       4.416us             3  
-                                     aten::_convolution         0.88%      20.800us         4.20%      99.242us      33.081us       0.000us         0.00%      13.248us       4.416us             3  
-                                aten::_conv_depthwise2d         0.93%      21.882us         2.62%      62.041us      20.680us      13.248us        50.49%      13.248us       4.416us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      13.248us        50.49%      13.248us       4.416us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.622us        25.24%       6.622us       2.207us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.368us        24.27%       6.368us       2.123us             3  
-                                Activity Buffer Request        75.21%       1.778ms        75.21%       1.778ms       1.778ms       2.272us         8.66%       2.272us       2.272us             1  
-                                    aten::empty_strided         1.24%      29.361us         1.24%      29.361us       4.893us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         8.97%     212.032us         8.97%     212.032us      23.559us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.75%      17.821us         0.98%      23.130us       2.570us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.37%       8.699us         0.37%       8.699us       0.580us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.38%       9.090us         0.38%       9.090us       3.030us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.44%      10.480us         0.44%      10.480us       3.493us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.24%       5.631us         0.30%       7.011us       2.337us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     315.548us      1212.95%     315.548us     315.548us             1  
+                                            torch_eager         6.12%     137.996us        99.74%       2.248ms       2.248ms       0.000us         0.00%      28.287us      28.287us             1  
+                                               aten::to         0.25%       5.659us        87.43%       1.970ms     328.377us       0.000us         0.00%      15.295us       2.549us             6  
+                                         aten::_to_copy         0.93%      21.030us        87.17%       1.965ms     327.434us       0.000us         0.00%      15.295us       2.549us             6  
+                                            aten::copy_         2.11%      47.618us        85.03%       1.916ms     319.389us      13.023us        50.06%      15.295us       2.549us             6  
+                                           aten::conv1d         0.25%       5.679us         5.07%     114.160us      38.053us       0.000us         0.00%      12.992us       4.331us             3  
+                                      aten::convolution         0.38%       8.670us         4.81%     108.481us      36.160us       0.000us         0.00%      12.992us       4.331us             3  
+                                     aten::_convolution         0.97%      21.770us         4.43%      99.811us      33.270us       0.000us         0.00%      12.992us       4.331us             3  
+                                aten::_conv_depthwise2d         0.95%      21.450us         2.80%      63.011us      21.004us      12.992us        49.94%      12.992us       4.331us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      12.992us        49.94%      12.992us       4.331us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.687us        25.70%       6.687us       2.229us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.336us        24.36%       6.336us       2.112us             3  
+                                Activity Buffer Request        74.56%       1.680ms        74.56%       1.680ms       1.680ms       2.272us         8.73%       2.272us       2.272us             1  
+                                    aten::empty_strided         1.21%      27.241us         1.21%      27.241us       4.540us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         9.39%     211.596us         9.39%     211.596us      23.511us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.73%      16.539us         0.97%      21.840us       2.427us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.39%       8.730us         0.39%       8.730us       0.582us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.41%       9.190us         0.41%       9.190us       3.063us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.40%       9.090us         0.40%       9.090us       3.030us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.24%       5.411us         0.30%       6.770us       2.257us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.364ms
-Self CUDA time total: 26.238us
+Self CPU time total: 2.254ms
+Self CUDA time total: 26.015us
 
 
 
@@ -4191,29 +4191,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     331.328us       858.50%     331.328us     331.328us             1  
-                                            torch_eager         5.97%     146.471us        99.79%       2.446ms       2.446ms       0.000us         0.00%      41.186us      41.186us             1  
-                                           aten::conv1d         0.25%       6.210us         4.77%     116.961us      38.987us       0.000us         0.00%      22.849us       7.616us             3  
-                                      aten::convolution         0.40%       9.740us         4.52%     110.751us      36.917us       0.000us         0.00%      22.849us       7.616us             3  
-                                     aten::_convolution         0.89%      21.911us         4.12%     101.011us      33.670us       0.000us         0.00%      22.849us       7.616us             3  
-                                aten::_conv_depthwise2d         0.92%      22.550us         2.59%      63.530us      21.177us      22.849us        59.20%      22.849us       7.616us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      22.849us        59.20%      22.849us       7.616us             3  
-                                               aten::to         0.25%       6.228us        88.01%       2.158ms     359.617us       0.000us         0.00%      18.337us       3.056us             6  
-                                         aten::_to_copy         1.00%      24.602us        87.76%       2.151ms     358.579us       0.000us         0.00%      18.337us       3.056us             6  
-                                            aten::copy_         1.98%      48.619us        85.49%       2.096ms     349.334us      15.745us        40.80%      18.337us       3.056us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.385us        21.73%       8.385us       2.795us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.360us        19.07%       7.360us       2.453us             3  
-                                Activity Buffer Request        75.73%       1.857ms        75.73%       1.857ms       1.857ms       2.592us         6.72%       2.592us       2.592us             1  
-                                    aten::empty_strided         1.26%      30.871us         1.26%      30.871us       5.145us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         8.69%     213.074us         8.69%     213.074us      23.675us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.69%      16.899us         0.91%      22.302us       2.478us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.35%       8.674us         0.35%       8.674us       0.578us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.39%       9.670us         0.39%       9.670us       3.223us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.37%       9.000us         0.37%       9.000us       3.000us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.23%       5.570us         0.28%       6.790us       2.263us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     309.088us       806.93%     309.088us     309.088us             1  
+                                            torch_eager         6.25%     144.584us        99.79%       2.308ms       2.308ms       0.000us         0.00%      40.896us      40.896us             1  
+                                           aten::conv1d         0.25%       5.670us         4.80%     111.051us      37.017us       0.000us         0.00%      22.560us       7.520us             3  
+                                      aten::convolution         0.37%       8.509us         4.56%     105.381us      35.127us       0.000us         0.00%      22.560us       7.520us             3  
+                                     aten::_convolution         0.91%      21.140us         4.19%      96.872us      32.291us       0.000us         0.00%      22.560us       7.520us             3  
+                                aten::_conv_depthwise2d         0.90%      20.721us         2.62%      60.611us      20.204us      22.560us        58.90%      22.560us       7.520us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      22.560us        58.90%      22.560us       7.520us             3  
+                                               aten::to         0.25%       5.820us        87.66%       2.027ms     337.907us       0.000us         0.00%      18.336us       3.056us             6  
+                                         aten::_to_copy         0.93%      21.482us        87.41%       2.022ms     336.937us       0.000us         0.00%      18.336us       3.056us             6  
+                                            aten::copy_         1.96%      45.321us        85.30%       1.973ms     328.809us      15.744us        41.10%      18.336us       3.056us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.416us        21.97%       8.416us       2.805us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.328us        19.13%       7.328us       2.443us             3  
+                                Activity Buffer Request        75.52%       1.747ms        75.52%       1.747ms       1.747ms       2.592us         6.77%       2.592us       2.592us             1  
+                                    aten::empty_strided         1.18%      27.288us         1.18%      27.288us       4.548us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         8.77%     202.733us         8.77%     202.733us      22.526us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.70%      16.129us         0.91%      20.960us       2.329us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.36%       8.361us         0.36%       8.361us       0.557us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.40%       9.150us         0.40%       9.150us       3.050us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.39%       8.930us         0.39%       8.930us       2.977us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.24%       5.551us         0.30%       6.831us       2.277us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.452ms
-Self CUDA time total: 38.594us
+Self CPU time total: 2.313ms
+Self CUDA time total: 38.304us
 
 
 
@@ -4223,29 +4223,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     324.382us       781.00%     324.382us     324.382us             1  
-                                            torch_eager         6.15%     143.693us        99.76%       2.329ms       2.329ms       0.000us         0.00%      44.158us      44.158us             1  
-                                           aten::conv1d         0.25%       5.870us         4.90%     114.381us      38.127us       0.000us         0.00%      25.694us       8.565us             3  
-                                      aten::convolution         0.39%       9.129us         4.65%     108.511us      36.170us       0.000us         0.00%      25.694us       8.565us             3  
-                                     aten::_convolution         0.92%      21.560us         4.26%      99.382us      33.127us       0.000us         0.00%      25.694us       8.565us             3  
-                                aten::_conv_depthwise2d         0.91%      21.251us         2.67%      62.331us      20.777us      25.694us        61.86%      25.694us       8.565us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      25.694us        61.86%      25.694us       8.565us             3  
-                                               aten::to         0.26%       6.051us        87.64%       2.046ms     341.007us       0.000us         0.00%      18.464us       3.077us             6  
-                                         aten::_to_copy         0.99%      23.033us        87.38%       2.040ms     339.999us       0.000us         0.00%      18.464us       3.077us             6  
-                                            aten::copy_         2.09%      48.709us        85.05%       1.985ms     330.910us      15.840us        38.14%      18.464us       3.077us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.448us        20.34%       8.448us       2.816us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.392us        17.80%       7.392us       2.464us             3  
-                                Activity Buffer Request        74.80%       1.746ms        74.80%       1.746ms       1.746ms       2.624us         6.32%       2.624us       2.624us             1  
-                                    aten::empty_strided         1.35%      31.498us         1.35%      31.498us       5.250us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         9.10%     212.334us         9.10%     212.334us      23.593us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.70%      16.311us         0.92%      21.550us       2.394us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.38%       8.780us         0.38%       8.780us       0.585us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.39%       9.170us         0.39%       9.170us       3.057us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.44%      10.170us         0.44%      10.170us       3.390us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.24%       5.530us         0.30%       6.891us       2.297us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     314.049us       763.16%     314.049us     314.049us             1  
+                                            torch_eager         6.27%     140.693us        99.78%       2.240ms       2.240ms       0.000us         0.00%      43.775us      43.775us             1  
+                                           aten::conv1d         0.27%       6.020us         5.00%     112.161us      37.387us       0.000us         0.00%      25.408us       8.469us             3  
+                                      aten::convolution         0.39%       8.730us         4.73%     106.141us      35.380us       0.000us         0.00%      25.408us       8.469us             3  
+                                     aten::_convolution         0.96%      21.462us         4.34%      97.411us      32.470us       0.000us         0.00%      25.408us       8.469us             3  
+                                aten::_conv_depthwise2d         0.92%      20.739us         2.73%      61.250us      20.417us      25.408us        61.74%      25.408us       8.469us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      25.408us        61.74%      25.408us       8.469us             3  
+                                               aten::to         0.26%       5.842us        87.39%       1.962ms     327.034us       0.000us         0.00%      18.367us       3.061us             6  
+                                         aten::_to_copy         0.95%      21.318us        87.13%       1.956ms     326.060us       0.000us         0.00%      18.367us       3.061us             6  
+                                            aten::copy_         2.12%      47.620us        84.95%       1.908ms     317.917us      15.743us        38.26%      18.367us       3.061us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.384us        20.37%       8.384us       2.795us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.359us        17.88%       7.359us       2.453us             3  
+                                Activity Buffer Request        74.98%       1.684ms        74.98%       1.684ms       1.684ms       2.624us         6.38%       2.624us       2.624us             1  
+                                    aten::empty_strided         1.23%      27.541us         1.23%      27.541us       4.590us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         8.81%     197.723us         8.81%     197.723us      21.969us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.72%      16.172us         0.94%      21.010us       2.334us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.37%       8.238us         0.37%       8.238us       0.549us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.46%      10.261us         0.46%      10.261us       3.420us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.39%       8.800us         0.39%       8.800us       2.933us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.24%       5.440us         0.30%       6.680us       2.227us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.335ms
-Self CUDA time total: 41.534us
+Self CPU time total: 2.245ms
+Self CUDA time total: 41.151us
 
 
 
@@ -4255,29 +4255,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     319.038us       307.34%     319.038us     319.038us             1  
-                                            torch_eager         4.95%     115.620us        99.75%       2.329ms       2.329ms       0.000us         0.00%     109.886us     109.886us             1  
-                                           aten::conv1d         0.24%       5.500us         4.79%     111.722us      37.241us       0.000us         0.00%      71.360us      23.787us             3  
-                                      aten::convolution         0.38%       8.820us         4.55%     106.222us      35.407us       0.000us         0.00%      71.360us      23.787us             3  
-                                     aten::_convolution         0.86%      20.169us         4.17%      97.402us      32.467us       0.000us         0.00%      71.360us      23.787us             3  
-                                aten::_conv_depthwise2d         0.88%      20.499us         2.70%      62.992us      20.997us      71.360us        68.74%      71.360us      23.787us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      71.360us        68.74%      71.360us      23.787us             3  
-                                               aten::to         0.25%       5.942us        88.99%       2.078ms     346.257us       0.000us         0.00%      38.526us       6.421us             6  
-                                         aten::_to_copy         0.97%      22.531us        88.74%       2.072ms     345.267us       0.000us         0.00%      38.526us       6.421us             6  
-                                            aten::copy_         1.95%      45.459us        86.50%       2.019ms     336.557us      32.447us        31.26%      38.526us       6.421us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      17.791us        17.14%      17.791us       5.930us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      14.656us        14.12%      14.656us       4.885us             3  
-                                Activity Buffer Request        76.44%       1.784ms        76.44%       1.784ms       1.784ms       6.079us         5.86%       6.079us       6.079us             1  
-                                    aten::empty_strided         1.27%      29.730us         1.27%      29.730us       4.955us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         9.13%     213.066us         9.13%     213.066us      23.674us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.66%      15.410us         0.85%      19.870us       2.208us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.33%       7.790us         0.33%       7.790us       0.519us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.44%      10.351us         0.44%      10.351us       3.450us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.36%       8.461us         0.36%       8.461us       2.820us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.23%       5.401us         0.29%       6.691us       2.230us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     321.281us       312.48%     321.281us     321.281us             1  
+                                            torch_eager         5.86%     138.423us        99.79%       2.355ms       2.355ms       0.000us         0.00%     108.833us     108.833us             1  
+                                           aten::conv1d         0.25%       5.850us         4.76%     112.341us      37.447us       0.000us         0.00%      70.720us      23.573us             3  
+                                      aten::convolution         0.36%       8.520us         4.51%     106.491us      35.497us       0.000us         0.00%      70.720us      23.573us             3  
+                                     aten::_convolution         0.88%      20.802us         4.15%      97.971us      32.657us       0.000us         0.00%      70.720us      23.573us             3  
+                                aten::_conv_depthwise2d         0.89%      20.951us         2.63%      62.101us      20.700us      70.720us        68.78%      70.720us      23.573us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      70.720us        68.78%      70.720us      23.573us             3  
+                                               aten::to         0.25%       6.012us        88.03%       2.078ms     346.293us       0.000us         0.00%      38.113us       6.352us             6  
+                                         aten::_to_copy         0.93%      22.001us        87.78%       2.072ms     345.291us       0.000us         0.00%      38.113us       6.352us             6  
+                                            aten::copy_         1.93%      45.601us        85.70%       2.023ms     337.137us      32.097us        31.22%      38.113us       6.352us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      17.601us        17.12%      17.601us       5.867us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      14.496us        14.10%      14.496us       4.832us             3  
+                                Activity Buffer Request        76.10%       1.796ms        76.10%       1.796ms       1.796ms       6.016us         5.85%       6.016us       6.016us             1  
+                                    aten::empty_strided         1.14%      26.919us         1.14%      26.919us       4.487us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         8.62%     203.563us         8.62%     203.563us      22.618us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.74%      17.469us         0.97%      22.809us       2.534us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.37%       8.760us         0.37%       8.760us       0.584us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.40%       9.490us         0.40%       9.490us       3.163us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.39%       9.140us         0.39%       9.140us       3.047us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.23%       5.370us         0.28%       6.650us       2.217us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.335ms
-Self CUDA time total: 103.807us
+Self CPU time total: 2.360ms
+Self CUDA time total: 102.817us
 
 
 
@@ -4287,29 +4287,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     320.032us       281.56%     320.032us     320.032us             1  
-                                            torch_eager         4.89%     112.502us        99.77%       2.297ms       2.297ms       0.000us         0.00%     119.649us     119.649us             1  
-                                           aten::conv1d         0.24%       5.540us         4.86%     111.980us      37.327us       0.000us         0.00%      81.407us      27.136us             3  
-                                      aten::convolution         0.38%       8.839us         4.62%     106.440us      35.480us       0.000us         0.00%      81.407us      27.136us             3  
-                                     aten::_convolution         0.90%      20.821us         4.24%      97.601us      32.534us       0.000us         0.00%      81.407us      27.136us             3  
-                                aten::_conv_depthwise2d         0.94%      21.639us         2.69%      61.990us      20.663us      81.407us        71.62%      81.407us      27.136us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      81.407us        71.62%      81.407us      27.136us             3  
-                                               aten::to         0.26%       5.912us        88.93%       2.047ms     341.211us       0.000us         0.00%      38.242us       6.374us             6  
-                                         aten::_to_copy         0.96%      22.099us        88.68%       2.041ms     340.225us       0.000us         0.00%      38.242us       6.374us             6  
-                                            aten::copy_         2.13%      49.062us        86.51%       1.991ms     331.902us      32.257us        28.38%      38.242us       6.374us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      17.665us        15.54%      17.665us       5.888us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      14.592us        12.84%      14.592us       4.864us             3  
-                                Activity Buffer Request        76.05%       1.751ms        76.05%       1.751ms       1.751ms       5.985us         5.27%       5.985us       5.985us             1  
-                                    aten::empty_strided         1.21%      27.841us         1.21%      27.841us       4.640us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         9.26%     213.213us         9.26%     213.213us      23.690us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.70%      16.150us         0.91%      21.061us       2.340us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.36%       8.381us         0.36%       8.381us       0.559us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.40%       9.130us         0.40%       9.130us       3.043us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.42%       9.600us         0.42%       9.600us       3.200us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.24%       5.419us         0.29%       6.669us       2.223us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     314.234us       278.42%     314.234us     314.234us             1  
+                                            torch_eager        14.56%     110.034us        99.36%     750.943us     750.943us       0.000us         0.00%     118.847us     118.847us             1  
+                                           aten::conv1d         0.76%       5.759us        14.83%     112.060us      37.353us       0.000us         0.00%      80.768us      26.923us             3  
+                                      aten::convolution         1.14%       8.591us        14.06%     106.301us      35.434us       0.000us         0.00%      80.768us      26.923us             3  
+                                     aten::_convolution         2.73%      20.639us        12.93%      97.710us      32.570us       0.000us         0.00%      80.768us      26.923us             3  
+                                aten::_conv_depthwise2d         2.77%      20.950us         8.22%      62.160us      20.720us      80.768us        71.56%      80.768us      26.923us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      80.768us        71.56%      80.768us      26.923us             3  
+                                               aten::to         0.75%       5.692us        66.62%     503.480us      83.913us       0.000us         0.00%      38.079us       6.346us             6  
+                                         aten::_to_copy         2.84%      21.440us        65.86%     497.788us      82.965us       0.000us         0.00%      38.079us       6.346us             6  
+                                            aten::copy_         6.27%      47.370us        59.47%     449.478us      74.913us      32.095us        28.44%      38.079us       6.346us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      17.567us        15.56%      17.567us       5.856us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      14.528us        12.87%      14.528us       4.843us             3  
+                                Activity Buffer Request        28.71%     217.014us        28.71%     217.014us     217.014us       5.984us         5.30%       5.984us       5.984us             1  
+                                    aten::empty_strided         3.56%      26.870us         3.56%      26.870us       4.478us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        27.58%     208.434us        27.58%     208.434us      23.159us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.15%      16.260us         2.82%      21.342us       2.371us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.12%       8.483us         1.12%       8.483us       0.566us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.19%       8.980us         1.19%       8.980us       2.993us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.18%       8.890us         1.18%       8.890us       2.963us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.73%       5.530us         0.90%       6.840us       2.280us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.302ms
-Self CUDA time total: 113.664us
+Self CPU time total: 755.803us
+Self CUDA time total: 112.863us
 
 
 
@@ -4319,29 +4319,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         4.70%     113.641us        96.03%       2.320ms       2.320ms       0.000us         0.00%     464.763us     464.763us             1  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     453.786us       106.62%     453.786us     453.786us             1  
-                                           aten::conv1d         0.23%       5.630us         4.62%     111.673us      37.224us       0.000us         0.00%     278.940us      92.980us             3  
-                                      aten::convolution         0.36%       8.651us         4.39%     106.043us      35.348us       0.000us         0.00%     278.940us      92.980us             3  
-                                     aten::_convolution         0.86%      20.739us         4.03%      97.392us      32.464us       0.000us         0.00%     278.940us      92.980us             3  
-                                aten::_conv_depthwise2d         0.90%      21.710us         2.57%      62.062us      20.687us     278.940us        65.54%     278.940us      92.980us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     278.940us        65.54%     278.940us      92.980us             3  
-                                               aten::to         0.24%       5.880us        85.69%       2.071ms     345.102us       0.000us         0.00%     185.823us      30.970us             6  
-                                         aten::_to_copy         0.90%      21.820us        85.45%       2.065ms     344.122us       0.000us         0.00%     185.823us      30.970us             6  
-                                            aten::copy_         1.99%      48.071us        83.40%       2.015ms     335.882us     146.655us        34.46%     185.823us      30.970us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     105.919us        24.89%     105.919us      35.306us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.736us         9.57%      40.736us      13.579us             3  
-                                Activity Buffer Request        72.26%       1.746ms        72.26%       1.746ms       1.746ms      39.168us         9.20%      39.168us      39.168us             1  
-                                    aten::empty_strided         1.14%      27.621us         1.14%      27.621us       4.604us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        10.07%     243.344us        10.07%     243.344us      27.038us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.66%      15.908us         0.86%      20.760us       2.307us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.34%       8.262us         0.34%       8.262us       0.551us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.37%       8.921us         0.37%       8.921us       2.974us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.38%       9.260us         0.38%       9.260us       3.087us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.22%       5.361us         0.27%       6.641us       2.214us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager        13.44%     111.290us        87.96%     728.462us     728.462us       0.000us         0.00%     465.598us     465.598us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     451.198us       105.76%     451.198us     451.198us             1  
+                                           aten::conv1d         0.71%       5.852us        13.50%     111.822us      37.274us       0.000us         0.00%     279.070us      93.023us             3  
+                                      aten::convolution         1.02%       8.470us        12.80%     105.970us      35.323us       0.000us         0.00%     279.070us      93.023us             3  
+                                     aten::_convolution         2.46%      20.349us        11.77%      97.500us      32.500us       0.000us         0.00%     279.070us      93.023us             3  
+                                aten::_conv_depthwise2d         2.50%      20.731us         7.53%      62.371us      20.790us     279.070us        65.41%     279.070us      93.023us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     279.070us        65.41%     279.070us      93.023us             3  
+                                               aten::to         0.69%       5.693us        58.10%     481.149us      80.192us       0.000us         0.00%     186.528us      31.088us             6  
+                                         aten::_to_copy         2.52%      20.868us        57.41%     475.456us      79.243us       0.000us         0.00%     186.528us      31.088us             6  
+                                            aten::copy_         5.92%      49.010us        51.66%     427.857us      71.310us     147.552us        34.59%     186.528us      31.088us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     107.296us        25.15%     107.296us      35.765us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.256us         9.44%      40.256us      13.419us             3  
+                                Activity Buffer Request        24.18%     200.274us        24.18%     200.274us     200.274us      38.976us         9.14%      38.976us      38.976us             1  
+                                    aten::empty_strided         3.23%      26.731us         3.23%      26.731us       4.455us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        24.27%     200.993us        24.27%     200.993us      22.333us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         1.86%      15.430us         2.41%      19.970us       2.219us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.00%       8.282us         1.00%       8.282us       0.552us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.22%      10.070us         1.22%      10.070us       3.357us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.10%       9.150us         1.10%       9.150us       3.050us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.65%       5.408us         0.83%       6.890us       2.297us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.416ms
-Self CUDA time total: 425.595us
+Self CPU time total: 828.193us
+Self CUDA time total: 426.622us
 
 
 
@@ -4351,29 +4351,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         4.81%     115.230us        95.51%       2.289ms       2.289ms       0.000us         0.00%     473.560us     473.560us             1  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     466.268us       106.59%     466.268us     466.268us             1  
-                                           aten::conv1d         0.23%       5.540us         4.63%     111.002us      37.001us       0.000us         0.00%     298.430us      99.477us             3  
-                                      aten::convolution         0.37%       8.900us         4.40%     105.462us      35.154us       0.000us         0.00%     298.430us      99.477us             3  
-                                     aten::_convolution         0.85%      20.430us         4.03%      96.562us      32.187us       0.000us         0.00%     298.430us      99.477us             3  
-                                aten::_conv_depthwise2d         0.86%      20.562us         2.57%      61.592us      20.531us     298.430us        68.22%     298.430us      99.477us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     298.430us        68.22%     298.430us      99.477us             3  
-                                               aten::to         0.24%       5.669us        85.05%       2.039ms     339.802us       0.000us         0.00%     175.130us      29.188us             6  
-                                         aten::_to_copy         0.96%      22.942us        84.82%       2.033ms     338.857us       0.000us         0.00%     175.130us      29.188us             6  
-                                            aten::copy_         2.01%      48.190us        82.64%       1.981ms     330.170us     139.003us        31.78%     175.130us      29.188us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      98.430us        22.50%      98.430us      32.810us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.573us         9.28%      40.573us      13.524us             3  
-                                Activity Buffer Request        72.81%       1.745ms        72.81%       1.745ms       1.745ms      36.127us         8.26%      36.127us      36.127us             1  
-                                    aten::empty_strided         1.22%      29.180us         1.22%      29.180us       4.863us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         8.73%     209.224us         8.73%     209.224us      23.247us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.66%      15.770us         0.87%      20.750us       2.306us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.35%       8.340us         0.35%       8.340us       0.556us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.43%      10.290us         0.43%      10.290us       3.430us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.37%       8.960us         0.37%       8.960us       2.987us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.22%       5.340us         0.28%       6.610us       2.203us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager        12.75%     112.781us        87.94%     777.793us     777.793us       0.000us         0.00%     472.535us     472.535us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     465.081us       106.66%     465.081us     465.081us             1  
+                                           aten::conv1d         0.64%       5.670us        12.88%     113.921us      37.974us       0.000us         0.00%     298.235us      99.412us             3  
+                                      aten::convolution         0.99%       8.800us        12.24%     108.251us      36.084us       0.000us         0.00%     298.235us      99.412us             3  
+                                     aten::_convolution         2.35%      20.829us        11.24%      99.451us      33.150us       0.000us         0.00%     298.235us      99.412us             3  
+                                aten::_conv_depthwise2d         2.33%      20.611us         7.13%      63.071us      21.024us     298.235us        68.40%     298.235us      99.412us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     298.235us        68.40%     298.235us      99.412us             3  
+                                               aten::to         0.69%       6.129us        59.45%     525.809us      87.635us       0.000us         0.00%     174.300us      29.050us             6  
+                                         aten::_to_copy         2.38%      21.040us        58.76%     519.680us      86.613us       0.000us         0.00%     174.300us      29.050us             6  
+                                            aten::copy_         5.38%      47.582us        53.27%     471.169us      78.528us     137.789us        31.60%     174.300us      29.050us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      97.534us        22.37%      97.534us      32.511us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.255us         9.23%      40.255us      13.418us             3  
+                                Activity Buffer Request        27.57%     243.834us        27.57%     243.834us     243.834us      36.511us         8.37%      36.511us      36.511us             1  
+                                    aten::empty_strided         3.11%      27.471us         3.11%      27.471us       4.579us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        23.03%     203.723us        23.03%     203.723us      22.636us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         1.93%      17.041us         2.45%      21.672us       2.408us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.93%       8.218us         0.93%       8.218us       0.548us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.01%       8.950us         1.01%       8.950us       2.983us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.08%       9.540us         1.08%       9.540us       3.180us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.64%       5.653us         0.81%       7.121us       2.374us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.397ms
-Self CUDA time total: 437.433us
+Self CPU time total: 884.464us
+Self CUDA time total: 436.024us
 
 
 
@@ -4383,29 +4383,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     325.149us      1725.02%     325.149us     325.149us             1  
-                                            torch_eager         4.86%     112.628us        99.78%       2.311ms       2.311ms       0.000us         0.00%      20.769us      20.769us             1  
-                                               aten::to         0.26%       5.932us        88.67%       2.054ms     342.251us       0.000us         0.00%      13.536us       2.256us             6  
-                                         aten::_to_copy         1.00%      23.270us        88.41%       2.048ms     341.262us       0.000us         0.00%      13.536us       2.256us             6  
-                                            aten::copy_         2.14%      49.511us        86.15%       1.995ms     332.552us      11.616us        61.63%      13.536us       2.256us             6  
-                                           aten::conv1d         0.24%       5.480us         5.19%     120.221us      40.074us       0.000us         0.00%       7.233us       2.411us             3  
-                                      aten::convolution         0.37%       8.641us         4.95%     114.741us      38.247us       0.000us         0.00%       7.233us       2.411us             3  
-                                     aten::_convolution         0.88%      20.361us         4.58%     106.100us      35.367us       0.000us         0.00%       7.233us       2.411us             3  
-                                aten::_conv_depthwise2d         0.96%      22.180us         3.05%      70.680us      23.560us       7.233us        38.37%       7.233us       2.411us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.233us        38.37%       7.233us       2.411us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       5.920us        31.41%       5.920us       1.973us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.696us        30.22%       5.696us       1.899us             3  
-                                Activity Buffer Request        75.90%       1.758ms        75.90%       1.758ms       1.758ms       1.920us        10.19%       1.920us       1.920us             1  
-                                    aten::empty_strided         1.25%      28.990us         1.25%      28.990us       4.832us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         9.42%     218.162us         9.42%     218.162us      24.240us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.68%      15.833us         0.90%      20.731us       2.303us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.37%       8.468us         0.37%       8.468us       0.565us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.40%       9.220us         0.40%       9.220us       3.073us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.39%       8.980us         0.39%       8.980us       2.993us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.24%       5.550us         0.30%       7.000us       2.333us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     322.331us      1707.26%     322.331us     322.331us             1  
+                                            torch_eager        13.79%     111.240us        99.37%     801.573us     801.573us       0.000us         0.00%      20.864us      20.864us             1  
+                                               aten::to         0.71%       5.762us        67.35%     543.311us      90.552us       0.000us         0.00%      13.696us       2.283us             6  
+                                         aten::_to_copy         2.83%      22.852us        66.64%     537.549us      89.592us       0.000us         0.00%      13.696us       2.283us             6  
+                                            aten::copy_         5.86%      47.269us        60.36%     486.896us      81.149us      11.712us        62.03%      13.696us       2.283us             6  
+                                           aten::conv1d         0.71%       5.710us        14.24%     114.861us      38.287us       0.000us         0.00%       7.168us       2.389us             3  
+                                      aten::convolution         1.09%       8.781us        13.53%     109.151us      36.384us       0.000us         0.00%       7.168us       2.389us             3  
+                                     aten::_convolution         2.85%      22.969us        12.44%     100.370us      33.457us       0.000us         0.00%       7.168us       2.389us             3  
+                                aten::_conv_depthwise2d         2.60%      20.991us         7.73%      62.361us      20.787us       7.168us        37.97%       7.168us       2.389us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.168us        37.97%       7.168us       2.389us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.016us        31.86%       6.016us       2.005us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.696us        30.17%       5.696us       1.899us             3  
+                                Activity Buffer Request        32.81%     264.695us        32.81%     264.695us     264.695us       1.984us        10.51%       1.984us       1.984us             1  
+                                    aten::empty_strided         3.45%      27.801us         3.45%      27.801us       4.633us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        24.60%     198.402us        24.60%     198.402us      22.045us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.08%      16.740us         3.51%      28.341us       3.149us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.86%      15.030us         1.86%      15.030us       1.002us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.16%       9.330us         1.16%       9.330us       3.110us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.06%       8.570us         1.06%       8.570us       2.857us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.67%       5.391us         0.83%       6.660us       2.220us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.316ms
-Self CUDA time total: 18.849us
+Self CPU time total: 806.653us
+Self CUDA time total: 18.880us
 
 
 
@@ -4415,29 +4415,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     320.511us      1636.76%     320.511us     320.511us             1  
-                                            torch_eager         5.91%     139.372us        99.79%       2.353ms       2.353ms       0.000us         0.00%      21.598us      21.598us             1  
-                                               aten::to         0.25%       6.010us        87.93%       2.073ms     345.496us       0.000us         0.00%      13.663us       2.277us             6  
-                                         aten::_to_copy         0.96%      22.549us        87.67%       2.067ms     344.494us       0.000us         0.00%      13.663us       2.277us             6  
-                                            aten::copy_         2.09%      49.251us        85.51%       2.016ms     335.977us      11.647us        59.48%      13.663us       2.277us             6  
-                                           aten::conv1d         0.26%       6.081us         4.89%     115.321us      38.440us       0.000us         0.00%       7.935us       2.645us             3  
-                                      aten::convolution         0.40%       9.450us         4.63%     109.240us      36.413us       0.000us         0.00%       7.935us       2.645us             3  
-                                     aten::_convolution         0.90%      21.168us         4.23%      99.790us      33.263us       0.000us         0.00%       7.935us       2.645us             3  
-                                aten::_conv_depthwise2d         0.87%      20.610us         2.67%      62.871us      20.957us       7.935us        40.52%       7.935us       2.645us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.935us        40.52%       7.935us       2.645us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       5.983us        30.55%       5.983us       1.994us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.664us        28.92%       5.664us       1.888us             3  
-                                Activity Buffer Request        75.47%       1.779ms        75.47%       1.779ms       1.779ms       2.016us        10.30%       2.016us       2.016us             1  
-                                    aten::empty_strided         1.21%      28.551us         1.21%      28.551us       4.759us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         8.91%     210.105us         8.91%     210.105us      23.345us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.72%      16.961us         0.93%      21.872us       2.430us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.36%       8.422us         0.36%       8.422us       0.561us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.46%      10.910us         0.46%      10.910us       3.637us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.37%       8.650us         0.37%       8.650us       2.883us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.24%       5.579us         0.30%       6.970us       2.323us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     310.011us      1593.40%     310.011us     310.011us             1  
+                                            torch_eager        14.19%     114.063us        99.41%     798.944us     798.944us       0.000us         0.00%      21.440us      21.440us             1  
+                                               aten::to         0.69%       5.570us        68.28%     548.719us      91.453us       0.000us         0.00%      13.535us       2.256us             6  
+                                         aten::_to_copy         2.58%      20.749us        67.58%     543.149us      90.525us       0.000us         0.00%      13.535us       2.256us             6  
+                                            aten::copy_         5.87%      47.160us        61.68%     495.718us      82.620us      11.551us        59.37%      13.535us       2.256us             6  
+                                           aten::conv1d         0.70%       5.640us        13.72%     110.252us      36.751us       0.000us         0.00%       7.905us       2.635us             3  
+                                      aten::convolution         1.10%       8.879us        13.02%     104.612us      34.871us       0.000us         0.00%       7.905us       2.635us             3  
+                                     aten::_convolution         2.60%      20.881us        11.91%      95.733us      31.911us       0.000us         0.00%       7.905us       2.635us             3  
+                                aten::_conv_depthwise2d         2.57%      20.619us         7.48%      60.111us      20.037us       7.905us        40.63%       7.905us       2.635us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.905us        40.63%       7.905us       2.635us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       5.888us        30.26%       5.888us       1.963us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.663us        29.11%       5.663us       1.888us             3  
+                                Activity Buffer Request        34.15%     274.474us        34.15%     274.474us     274.474us       1.984us        10.20%       1.984us       1.984us             1  
+                                    aten::empty_strided         3.32%      26.682us         3.32%      26.682us       4.447us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        24.32%     195.444us        24.32%     195.444us      21.716us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.12%      17.050us         2.77%      22.260us       2.473us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.06%       8.520us         1.06%       8.520us       0.568us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.13%       9.091us         1.13%       9.091us       3.030us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.12%       9.041us         1.12%       9.041us       3.014us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.64%       5.131us         0.80%       6.391us       2.130us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.358ms
-Self CUDA time total: 19.582us
+Self CPU time total: 803.684us
+Self CUDA time total: 19.456us
 
 
 
@@ -4447,29 +4447,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     310.009us      1591.01%     310.009us     310.009us             1  
-                                            torch_eager        14.85%     113.881us        99.35%     762.102us     762.102us       0.000us         0.00%      21.693us      21.693us             1  
-                                               aten::to         0.75%       5.742us        67.36%     516.710us      86.118us       0.000us         0.00%      14.398us       2.400us             6  
-                                         aten::_to_copy         2.84%      21.798us        66.61%     510.968us      85.161us       0.000us         0.00%      14.398us       2.400us             6  
-                                            aten::copy_         6.26%      48.021us        59.81%     458.808us      76.468us      12.190us        62.56%      14.398us       2.400us             6  
-                                           aten::conv1d         0.69%       5.290us        14.07%     107.951us      35.984us       0.000us         0.00%       7.295us       2.432us             3  
-                                      aten::convolution         1.14%       8.770us        13.38%     102.661us      34.220us       0.000us         0.00%       7.295us       2.432us             3  
-                                     aten::_convolution         2.56%      19.629us        12.24%      93.891us      31.297us       0.000us         0.00%       7.295us       2.432us             3  
-                                aten::_conv_depthwise2d         2.72%      20.851us         7.84%      60.152us      20.051us       7.295us        37.44%       7.295us       2.432us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.295us        37.44%       7.295us       2.432us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.271us        32.18%       6.271us       2.090us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.919us        30.38%       5.919us       1.973us             3  
-                                Activity Buffer Request        29.70%     227.833us        29.70%     227.833us     227.833us       2.208us        11.33%       2.208us       2.208us             1  
-                                    aten::empty_strided         3.96%      30.362us         3.96%      30.362us       5.060us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        26.62%     204.185us        26.62%     204.185us      22.687us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.01%      15.431us         2.57%      19.700us       2.189us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.98%       7.520us         0.98%       7.520us       0.501us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.29%       9.930us         1.29%       9.930us       3.310us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.06%       8.140us         1.06%       8.140us       2.713us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.67%       5.119us         0.83%       6.400us       2.133us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     314.048us      1627.61%     314.048us     314.048us             1  
+                                            torch_eager         6.09%     141.071us        99.78%       2.312ms       2.312ms       0.000us         0.00%      21.471us      21.471us             1  
+                                               aten::to         0.25%       5.868us        87.74%       2.033ms     338.847us       0.000us         0.00%      14.239us       2.373us             6  
+                                         aten::_to_copy         0.91%      21.009us        87.49%       2.027ms     337.869us       0.000us         0.00%      14.239us       2.373us             6  
+                                            aten::copy_         2.05%      47.413us        85.30%       1.977ms     329.427us      12.063us        62.52%      14.239us       2.373us             6  
+                                           aten::conv1d         0.24%       5.651us         4.86%     112.612us      37.537us       0.000us         0.00%       7.232us       2.411us             3  
+                                      aten::convolution         0.38%       8.789us         4.62%     106.961us      35.654us       0.000us         0.00%       7.232us       2.411us             3  
+                                     aten::_convolution         0.89%      20.661us         4.24%      98.172us      32.724us       0.000us         0.00%       7.232us       2.411us             3  
+                                aten::_conv_depthwise2d         0.90%      20.862us         2.69%      62.371us      20.790us       7.232us        37.48%       7.232us       2.411us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.232us        37.48%       7.232us       2.411us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.208us        32.17%       6.208us       2.069us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.855us        30.34%       5.855us       1.952us             3  
+                                Activity Buffer Request        75.73%       1.755ms        75.73%       1.755ms       1.755ms       2.176us        11.28%       2.176us       2.176us             1  
+                                    aten::empty_strided         1.28%      29.642us         1.28%      29.642us       4.940us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         8.54%     197.852us         8.54%     197.852us      21.984us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.71%      16.502us         0.92%      21.412us       2.379us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.36%       8.400us         0.36%       8.400us       0.560us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.40%       9.250us         0.40%       9.250us       3.083us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.37%       8.629us         0.37%       8.629us       2.876us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.24%       5.530us         0.30%       6.920us       2.307us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 767.122us
-Self CUDA time total: 19.485us
+Self CPU time total: 2.317ms
+Self CUDA time total: 19.295us
 
 
 
@@ -4479,29 +4479,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     312.058us      1547.83%     312.058us     312.058us             1  
-                                            torch_eager        19.84%     167.701us        99.34%     839.603us     839.603us       0.000us         0.00%      22.369us      22.369us             1  
-                                               aten::to         0.69%       5.791us        63.55%     537.169us      89.528us       0.000us         0.00%      14.400us       2.400us             6  
-                                         aten::_to_copy         2.59%      21.910us        62.87%     531.378us      88.563us       0.000us         0.00%      14.400us       2.400us             6  
-                                            aten::copy_         5.79%      48.970us        56.91%     481.028us      80.171us      12.192us        60.47%      14.400us       2.400us             6  
-                                           aten::conv1d         0.65%       5.520us        13.10%     110.752us      36.917us       0.000us         0.00%       7.969us       2.656us             3  
-                                      aten::convolution         1.03%       8.700us        12.45%     105.232us      35.077us       0.000us         0.00%       7.969us       2.656us             3  
-                                     aten::_convolution         2.40%      20.311us        11.42%      96.532us      32.177us       0.000us         0.00%       7.969us       2.656us             3  
-                                aten::_conv_depthwise2d         2.39%      20.240us         7.28%      61.521us      20.507us       7.969us        39.53%       7.969us       2.656us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.969us        39.53%       7.969us       2.656us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.272us        31.11%       6.272us       2.091us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.920us        29.36%       5.920us       1.973us             3  
-                                Activity Buffer Request        29.19%     246.714us        29.19%     246.714us     246.714us       2.208us        10.95%       2.208us       2.208us             1  
-                                    aten::empty_strided         3.36%      28.440us         3.36%      28.440us       4.740us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        24.70%     208.775us        24.70%     208.775us      23.197us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         1.84%      15.580us         2.41%      20.350us       2.261us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.95%       8.049us         0.95%       8.049us       0.537us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.07%       9.050us         1.07%       9.050us       3.017us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.04%       8.800us         1.04%       8.800us       2.933us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.63%       5.361us         0.79%       6.650us       2.217us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     312.955us      1552.43%     312.955us     312.955us             1  
+                                            torch_eager        14.84%     110.223us        99.32%     737.732us     737.732us       0.000us         0.00%      22.399us      22.399us             1  
+                                               aten::to         0.81%       6.020us        65.77%     488.537us      81.423us       0.000us         0.00%      14.464us       2.411us             6  
+                                         aten::_to_copy         2.95%      21.919us        64.96%     482.517us      80.420us       0.000us         0.00%      14.464us       2.411us             6  
+                                            aten::copy_         6.27%      46.582us        58.16%     432.008us      72.001us      12.224us        60.64%      14.464us       2.411us             6  
+                                           aten::conv1d         0.73%       5.430us        15.33%     113.861us      37.954us       0.000us         0.00%       7.935us       2.645us             3  
+                                      aten::convolution         1.23%       9.170us        14.60%     108.431us      36.144us       0.000us         0.00%       7.935us       2.645us             3  
+                                     aten::_convolution         2.83%      21.009us        13.36%      99.261us      33.087us       0.000us         0.00%       7.935us       2.645us             3  
+                                aten::_conv_depthwise2d         2.83%      21.021us         8.51%      63.211us      21.070us       7.935us        39.36%       7.935us       2.645us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.935us        39.36%       7.935us       2.645us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.304us        31.27%       6.304us       2.101us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.920us        29.37%       5.920us       1.973us             3  
+                                Activity Buffer Request        28.04%     208.284us        28.04%     208.284us     208.284us       2.240us        11.11%       2.240us       2.240us             1  
+                                    aten::empty_strided         3.85%      28.590us         3.85%      28.590us       4.765us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        26.94%     200.092us        26.94%     200.092us      22.232us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.19%      16.299us         2.86%      21.260us       2.362us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.14%       8.481us         1.14%       8.481us       0.565us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.35%      10.000us         1.35%      10.000us       3.333us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.24%       9.240us         1.24%       9.240us       3.080us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.74%       5.520us         0.94%       6.960us       2.320us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 845.213us
-Self CUDA time total: 20.161us
+Self CPU time total: 742.812us
+Self CUDA time total: 20.159us
 
 
 
@@ -4511,29 +4511,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     312.867us       859.95%     312.867us     312.867us             1  
-                                            torch_eager        14.44%     112.752us        99.36%     776.042us     776.042us       0.000us         0.00%      39.006us      39.006us             1  
-                                           aten::conv1d         0.71%       5.580us        13.99%     109.252us      36.417us       0.000us         0.00%      20.512us       6.837us             3  
-                                      aten::convolution         1.09%       8.531us        13.27%     103.672us      34.557us       0.000us         0.00%      20.512us       6.837us             3  
-                                     aten::_convolution         2.62%      20.459us        12.18%      95.141us      31.714us       0.000us         0.00%      20.512us       6.837us             3  
-                                aten::_conv_depthwise2d         2.59%      20.222us         7.70%      60.162us      20.054us      20.512us        56.38%      20.512us       6.837us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      20.512us        56.38%      20.512us       6.837us             3  
-                                               aten::to         0.75%       5.821us        67.81%     529.608us      88.268us       0.000us         0.00%      18.494us       3.082us             6  
-                                         aten::_to_copy         2.86%      22.338us        67.06%     523.787us      87.298us       0.000us         0.00%      18.494us       3.082us             6  
-                                            aten::copy_         6.02%      47.020us        60.45%     472.148us      78.691us      15.870us        43.62%      18.494us       3.082us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.447us        23.22%       8.447us       2.816us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.423us        20.40%       7.423us       2.474us             3  
-                                Activity Buffer Request        30.80%     240.594us        30.80%     240.594us     240.594us       2.624us         7.21%       2.624us       2.624us             1  
-                                    aten::empty_strided         3.75%      29.301us         3.75%      29.301us       4.884us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        26.46%     206.633us        26.46%     206.633us      22.959us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.01%      15.720us         2.61%      20.410us       2.268us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.02%       7.981us         1.02%       7.981us       0.532us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.13%       8.841us         1.13%       8.841us       2.947us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.15%       9.000us         1.15%       9.000us       3.000us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.68%       5.329us         0.84%       6.560us       2.187us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     313.245us       871.70%     313.245us     313.245us             1  
+                                            torch_eager        14.61%     110.582us        99.35%     752.253us     752.253us       0.000us         0.00%      38.527us      38.527us             1  
+                                           aten::conv1d         0.75%       5.651us        14.90%     112.841us      37.614us       0.000us         0.00%      20.160us       6.720us             3  
+                                      aten::convolution         1.16%       8.759us        14.16%     107.190us      35.730us       0.000us         0.00%      20.160us       6.720us             3  
+                                     aten::_convolution         2.72%      20.612us        13.00%      98.431us      32.810us       0.000us         0.00%      20.160us       6.720us             3  
+                                aten::_conv_depthwise2d         2.88%      21.780us         8.26%      62.540us      20.847us      20.160us        56.10%      20.160us       6.720us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      20.160us        56.10%      20.160us       6.720us             3  
+                                               aten::to         0.75%       5.701us        66.54%     503.830us      83.972us       0.000us         0.00%      18.367us       3.061us             6  
+                                         aten::_to_copy         2.88%      21.780us        65.79%     498.129us      83.021us       0.000us         0.00%      18.367us       3.061us             6  
+                                            aten::copy_         6.12%      46.334us        59.23%     448.449us      74.742us      15.775us        43.90%      18.367us       3.061us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.415us        23.42%       8.415us       2.805us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.360us        20.48%       7.360us       2.453us             3  
+                                Activity Buffer Request        29.82%     225.793us        29.82%     225.793us     225.793us       2.592us         7.21%       2.592us       2.592us             1  
+                                    aten::empty_strided         3.68%      27.900us         3.68%      27.900us       4.650us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        26.28%     198.992us        26.28%     198.992us      22.110us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.15%      16.291us         2.79%      21.110us       2.346us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.09%       8.258us         1.09%       8.258us       0.551us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.19%       9.040us         1.19%       9.040us       3.013us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.20%       9.050us         1.20%       9.050us       3.017us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.73%       5.530us         0.90%       6.810us       2.270us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 781.073us
-Self CUDA time total: 36.382us
+Self CPU time total: 757.144us
+Self CUDA time total: 35.935us
 
 
 
@@ -4543,29 +4543,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     353.311us       916.31%     353.311us     353.311us             1  
-                                            torch_eager        17.31%     144.171us        99.40%     827.943us     827.943us       0.000us         0.00%      41.150us      41.150us             1  
-                                           aten::conv1d         0.66%       5.470us        14.12%     117.601us      39.200us       0.000us         0.00%      22.624us       7.541us             3  
-                                      aten::convolution         1.09%       9.120us        13.46%     112.131us      37.377us       0.000us         0.00%      22.624us       7.541us             3  
-                                     aten::_convolution         2.77%      23.100us        12.37%     103.011us      34.337us       0.000us         0.00%      22.624us       7.541us             3  
-                                aten::_conv_depthwise2d         2.63%      21.901us         7.78%      64.791us      21.597us      22.624us        58.68%      22.624us       7.541us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      22.624us        58.68%      22.624us       7.541us             3  
-                                               aten::to         0.71%       5.920us        64.88%     540.450us      90.075us       0.000us         0.00%      18.526us       3.088us             6  
-                                         aten::_to_copy         2.59%      21.613us        64.17%     534.530us      89.088us       0.000us         0.00%      18.526us       3.088us             6  
-                                            aten::copy_         5.88%      48.990us        58.06%     483.646us      80.608us      15.934us        41.32%      18.526us       3.088us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.575us        22.24%       8.575us       2.858us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.359us        19.09%       7.359us       2.453us             3  
-                                Activity Buffer Request        29.91%     249.164us        29.91%     249.164us     249.164us       2.592us         6.72%       2.592us       2.592us             1  
-                                    aten::empty_strided         3.51%      29.271us         3.51%      29.271us       4.879us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        25.18%     209.712us        25.18%     209.712us      23.301us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         1.99%      16.542us         2.59%      21.611us       2.401us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.04%       8.638us         1.04%       8.638us       0.576us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.16%       9.650us         1.16%       9.650us       3.217us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.08%       9.020us         1.08%       9.020us       3.007us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.68%       5.681us         0.85%       7.060us       2.353us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     311.265us       814.68%     311.265us     311.265us             1  
+                                            torch_eager        15.57%     112.202us        99.29%     715.482us     715.482us       0.000us         0.00%      40.798us      40.798us             1  
+                                           aten::conv1d         0.79%       5.710us        15.58%     112.241us      37.414us       0.000us         0.00%      22.336us       7.445us             3  
+                                      aten::convolution         1.20%       8.631us        14.78%     106.531us      35.510us       0.000us         0.00%      22.336us       7.445us             3  
+                                     aten::_convolution         2.96%      21.310us        13.59%      97.900us      32.633us       0.000us         0.00%      22.336us       7.445us             3  
+                                aten::_conv_depthwise2d         2.84%      20.470us         8.57%      61.720us      20.573us      22.336us        58.46%      22.336us       7.445us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      22.336us        58.46%      22.336us       7.445us             3  
+                                               aten::to         0.77%       5.551us        64.63%     465.749us      77.625us       0.000us         0.00%      18.462us       3.077us             6  
+                                         aten::_to_copy         2.88%      20.749us        63.86%     460.198us      76.700us       0.000us         0.00%      18.462us       3.077us             6  
+                                            aten::copy_         6.44%      46.412us        57.18%     412.048us      68.675us      15.871us        41.54%      18.462us       3.077us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.511us        22.28%       8.511us       2.837us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.360us        19.26%       7.360us       2.453us             3  
+                                Activity Buffer Request        26.39%     190.134us        26.39%     190.134us     190.134us       2.591us         6.78%       2.591us       2.591us             1  
+                                    aten::empty_strided         3.80%      27.401us         3.80%      27.401us       4.567us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        27.38%     197.312us        27.38%     197.312us      21.924us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.26%      16.250us         2.97%      21.400us       2.378us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.19%       8.579us         1.19%       8.579us       0.572us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.47%      10.600us         1.47%      10.600us       3.533us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.23%       8.840us         1.23%       8.840us       2.947us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.77%       5.530us         0.95%       6.840us       2.280us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 832.973us
-Self CUDA time total: 38.558us
+Self CPU time total: 720.602us
+Self CUDA time total: 38.207us
 
 
 
@@ -4575,29 +4575,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     316.829us       488.45%     316.829us     316.829us             1  
-                                            torch_eager        14.19%     114.002us        99.33%     798.183us     798.183us       0.000us         0.00%      68.991us      68.991us             1  
-                                           aten::conv1d         0.68%       5.460us        13.80%     110.892us      36.964us       0.000us         0.00%      42.304us      14.101us             3  
-                                      aten::convolution         1.10%       8.859us        13.12%     105.432us      35.144us       0.000us         0.00%      42.304us      14.101us             3  
-                                     aten::_convolution         2.59%      20.821us        12.02%      96.573us      32.191us       0.000us         0.00%      42.304us      14.101us             3  
-                                aten::_conv_depthwise2d         2.64%      21.190us         7.50%      60.251us      20.084us      42.304us        65.22%      42.304us      14.101us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      42.304us        65.22%      42.304us      14.101us             3  
-                                               aten::to         0.75%       6.059us        68.35%     549.177us      91.530us       0.000us         0.00%      26.687us       4.448us             6  
-                                         aten::_to_copy         2.76%      22.169us        67.59%     543.118us      90.520us       0.000us         0.00%      26.687us       4.448us             6  
-                                            aten::copy_         6.74%      54.161us        61.27%     492.308us      82.051us      22.560us        34.78%      26.687us       4.448us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      12.095us        18.65%      12.095us       4.032us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      10.465us        16.13%      10.465us       3.488us             3  
-                                Activity Buffer Request        31.75%     255.134us        31.75%     255.134us     255.134us       4.127us         6.36%       4.127us       4.127us             1  
-                                    aten::empty_strided         3.56%      28.641us         3.56%      28.641us       4.773us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        25.49%     204.843us        25.49%     204.843us      22.760us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.06%      16.521us         2.65%      21.322us       2.369us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.02%       8.171us         1.02%       8.171us       0.545us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.14%       9.170us         1.14%       9.170us       3.057us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.00%       8.061us         1.00%       8.061us       2.687us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.66%       5.330us         0.81%       6.520us       2.173us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     314.752us       491.08%     314.752us     314.752us             1  
+                                            torch_eager        14.41%     112.890us        99.37%     778.613us     778.613us       0.000us         0.00%      68.189us      68.189us             1  
+                                           aten::conv1d         0.75%       5.840us        14.26%     111.761us      37.254us       0.000us         0.00%      41.759us      13.920us             3  
+                                      aten::convolution         1.10%       8.641us        13.52%     105.921us      35.307us       0.000us         0.00%      41.759us      13.920us             3  
+                                     aten::_convolution         2.66%      20.830us        12.41%      97.280us      32.427us       0.000us         0.00%      41.759us      13.920us             3  
+                                aten::_conv_depthwise2d         2.71%      21.201us         7.84%      61.431us      20.477us      41.759us        65.15%      41.759us      13.920us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      41.759us        65.15%      41.759us      13.920us             3  
+                                               aten::to         0.73%       5.701us        67.43%     528.381us      88.063us       0.000us         0.00%      26.430us       4.405us             6  
+                                         aten::_to_copy         2.70%      21.142us        66.70%     522.680us      87.113us       0.000us         0.00%      26.430us       4.405us             6  
+                                            aten::copy_         6.06%      47.459us        59.50%     466.248us      77.708us      22.335us        34.85%      26.430us       4.405us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      11.903us        18.57%      11.903us       3.968us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      10.432us        16.28%      10.432us       3.477us             3  
+                                Activity Buffer Request        30.62%     239.904us        30.62%     239.904us     239.904us       4.095us         6.39%       4.095us       4.095us             1  
+                                    aten::empty_strided         4.50%      35.290us         4.50%      35.290us       5.882us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        25.58%     200.475us        25.58%     200.475us      22.275us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.10%      16.441us         2.75%      21.561us       2.396us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.10%       8.620us         1.10%       8.620us       0.575us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.18%       9.220us         1.18%       9.220us       3.073us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.20%       9.420us         1.20%       9.420us       3.140us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.71%       5.569us         0.87%       6.849us       2.283us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 803.533us
-Self CUDA time total: 64.864us
+Self CPU time total: 783.583us
+Self CUDA time total: 64.094us
 
 
 
@@ -4607,29 +4607,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     328.383us       466.25%     328.383us     328.383us             1  
-                                            torch_eager         5.82%     138.672us        99.78%       2.376ms       2.376ms       0.000us         0.00%      74.527us      74.527us             1  
-                                           aten::conv1d         0.24%       5.689us         4.87%     115.970us      38.657us       0.000us         0.00%      47.969us      15.990us             3  
-                                      aten::convolution         0.43%      10.191us         4.63%     110.281us      36.760us       0.000us         0.00%      47.969us      15.990us             3  
-                                     aten::_convolution         0.91%      21.579us         4.20%     100.090us      33.363us       0.000us         0.00%      47.969us      15.990us             3  
-                                aten::_conv_depthwise2d         0.87%      20.670us         2.63%      62.670us      20.890us      47.969us        68.11%      47.969us      15.990us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      47.969us        68.11%      47.969us      15.990us             3  
-                                               aten::to         0.27%       6.430us        88.04%       2.097ms     349.464us       0.000us         0.00%      26.558us       4.426us             6  
-                                         aten::_to_copy         0.99%      23.642us        87.77%       2.090ms     348.392us       0.000us         0.00%      26.558us       4.426us             6  
-                                            aten::copy_         2.06%      49.120us        85.54%       2.037ms     339.525us      22.462us        31.89%      26.558us       4.426us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      11.999us        17.04%      11.999us       4.000us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      10.463us        14.86%      10.463us       3.488us             3  
-                                Activity Buffer Request        75.66%       1.802ms        75.66%       1.802ms       1.802ms       4.096us         5.82%       4.096us       4.096us             1  
-                                    aten::empty_strided         1.24%      29.560us         1.24%      29.560us       4.927us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         8.75%     208.373us         8.75%     208.373us      23.153us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.70%      16.782us         0.92%      21.972us       2.441us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.36%       8.520us         0.36%       8.520us       0.568us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.38%       9.160us         0.38%       9.160us       3.053us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.44%      10.580us         0.44%      10.580us       3.527us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.24%       5.730us         0.29%       7.020us       2.340us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     341.050us       488.00%     341.050us     341.050us             1  
+                                            torch_eager        14.90%     121.512us        99.29%     809.864us     809.864us       0.000us         0.00%      73.952us      73.952us             1  
+                                           aten::conv1d         0.74%       6.030us        14.63%     119.332us      39.777us       0.000us         0.00%      47.456us      15.819us             3  
+                                      aten::convolution         1.12%       9.160us        13.89%     113.302us      37.767us       0.000us         0.00%      47.456us      15.819us             3  
+                                     aten::_convolution         2.63%      21.441us        12.77%     104.142us      34.714us       0.000us         0.00%      47.456us      15.819us             3  
+                                aten::_conv_depthwise2d         2.71%      22.133us         8.21%      66.923us      22.308us      47.456us        67.90%      47.456us      15.819us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      47.456us        67.90%      47.456us      15.819us             3  
+                                               aten::to         0.75%       6.151us        66.55%     542.789us      90.465us       0.000us         0.00%      26.496us       4.416us             6  
+                                         aten::_to_copy         2.90%      23.661us        65.79%     536.638us      89.440us       0.000us         0.00%      26.496us       4.416us             6  
+                                            aten::copy_         6.09%      49.711us        59.32%     483.818us      80.636us      22.432us        32.10%      26.496us       4.416us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      11.904us        17.03%      11.904us       3.968us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      10.528us        15.06%      10.528us       3.509us             3  
+                                Activity Buffer Request        30.67%     250.175us        30.67%     250.175us     250.175us       4.064us         5.82%       4.064us       4.064us             1  
+                                    aten::empty_strided         3.58%      29.159us         3.58%      29.159us       4.860us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        25.38%     207.002us        25.38%     207.002us      23.000us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.12%      17.271us         2.72%      22.171us       2.463us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.06%       8.630us         1.06%       8.630us       0.575us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.48%      12.040us         1.48%      12.040us       4.013us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.19%       9.680us         1.19%       9.680us       3.227us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.69%       5.638us         0.87%       7.098us       2.366us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.382ms
-Self CUDA time total: 70.431us
+Self CPU time total: 815.634us
+Self CUDA time total: 69.888us
 
 
 
@@ -4639,29 +4639,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     336.351us       179.68%     336.351us     336.351us             1  
-                                            torch_eager         5.85%     142.571us        99.79%       2.430ms       2.430ms       0.000us         0.00%     197.311us     197.311us             1  
-                                           aten::conv1d         0.28%       6.741us         4.71%     114.731us      38.244us       0.000us         0.00%     134.368us      44.789us             3  
-                                      aten::convolution         0.38%       9.350us         4.43%     107.990us      35.997us       0.000us         0.00%     134.368us      44.789us             3  
-                                     aten::_convolution         0.88%      21.488us         4.05%      98.640us      32.880us       0.000us         0.00%     134.368us      44.789us             3  
-                                aten::_conv_depthwise2d         0.83%      20.301us         2.51%      61.091us      20.364us     134.368us        71.78%     134.368us      44.789us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     134.368us        71.78%     134.368us      44.789us             3  
-                                               aten::to         0.26%       6.379us        88.22%       2.148ms     358.072us       0.000us         0.00%      62.943us      10.491us             6  
-                                         aten::_to_copy         0.93%      22.632us        87.96%       2.142ms     357.009us       0.000us         0.00%      62.943us      10.491us             6  
-                                            aten::copy_         2.03%      49.489us        85.76%       2.089ms     348.110us      52.831us        28.22%      62.943us      10.491us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      29.727us        15.88%      29.727us       9.909us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      23.104us        12.34%      23.104us       7.701us             3  
-                                Activity Buffer Request        76.11%       1.853ms        76.11%       1.853ms       1.853ms      10.112us         5.40%      10.112us      10.112us             1  
-                                    aten::empty_strided         1.26%      30.760us         1.26%      30.760us       5.127us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         8.55%     208.274us         8.55%     208.274us      23.142us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.71%      17.184us         0.91%      22.223us       2.469us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.34%       8.338us         0.34%       8.338us       0.556us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.38%       9.180us         0.38%       9.180us       3.060us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.37%       9.020us         0.37%       9.020us       3.007us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.22%       5.460us         0.27%       6.690us       2.230us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     333.665us       179.68%     333.665us     333.665us             1  
+                                            torch_eager         6.31%     142.852us        99.78%       2.260ms       2.260ms       0.000us         0.00%     195.711us     195.711us             1  
+                                           aten::conv1d         0.27%       6.061us         5.13%     116.202us      38.734us       0.000us         0.00%     133.407us      44.469us             3  
+                                      aten::convolution         0.39%       8.731us         4.86%     110.141us      36.714us       0.000us         0.00%     133.407us      44.469us             3  
+                                     aten::_convolution         0.93%      21.019us         4.48%     101.410us      33.803us       0.000us         0.00%     133.407us      44.469us             3  
+                                aten::_conv_depthwise2d         0.95%      21.630us         2.86%      64.671us      21.557us     133.407us        71.84%     133.407us      44.469us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     133.407us        71.84%     133.407us      44.469us             3  
+                                               aten::to         0.26%       5.950us        87.17%       1.974ms     329.070us       0.000us         0.00%      62.304us      10.384us             6  
+                                         aten::_to_copy         0.93%      21.080us        86.91%       1.968ms     328.079us       0.000us         0.00%      62.304us      10.384us             6  
+                                            aten::copy_         2.06%      46.600us        84.72%       1.919ms     319.815us      52.288us        28.16%      62.304us      10.384us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      29.503us        15.89%      29.503us       9.834us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.785us        12.27%      22.785us       7.595us             3  
+                                Activity Buffer Request        74.89%       1.696ms        74.89%       1.696ms       1.696ms      10.016us         5.39%      10.016us      10.016us             1  
+                                    aten::empty_strided         1.26%      28.502us         1.26%      28.502us       4.750us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         8.73%     197.643us         8.73%     197.643us      21.960us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.77%      17.390us         1.01%      22.821us       2.536us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.41%       9.201us         0.41%       9.201us       0.613us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.51%      11.561us         0.51%      11.561us       3.854us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.43%       9.850us         0.43%       9.850us       3.283us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.25%       5.550us         0.31%       7.130us       2.377us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.435ms
-Self CUDA time total: 187.199us
+Self CPU time total: 2.265ms
+Self CUDA time total: 185.695us
 
 
 
@@ -4671,29 +4671,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     335.323us       159.21%     335.323us     335.323us             1  
-                                            torch_eager        14.44%     115.471us        99.40%     794.842us     794.842us       0.000us         0.00%     223.709us     223.709us             1  
-                                           aten::conv1d         0.70%       5.561us        13.80%     110.362us      36.787us       0.000us         0.00%     154.845us      51.615us             3  
-                                      aten::convolution         1.15%       9.189us        13.11%     104.801us      34.934us       0.000us         0.00%     154.845us      51.615us             3  
-                                     aten::_convolution         2.52%      20.182us        11.96%      95.612us      31.871us       0.000us         0.00%     154.845us      51.615us             3  
-                                aten::_conv_depthwise2d         2.51%      20.101us         7.60%      60.741us      20.247us     154.845us        73.52%     154.845us      51.615us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     154.845us        73.52%     154.845us      51.615us             3  
-                                               aten::to         0.72%       5.750us        68.18%     545.179us      90.863us       0.000us         0.00%      68.864us      11.477us             6  
-                                         aten::_to_copy         2.77%      22.130us        67.46%     539.429us      89.905us       0.000us         0.00%      68.864us      11.477us             6  
-                                            aten::copy_         5.86%      46.830us        60.79%     486.078us      81.013us      55.776us        26.48%      68.864us      11.477us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      32.416us        15.39%      32.416us      10.805us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      23.360us        11.09%      23.360us       7.787us             3  
-                                Activity Buffer Request        31.66%     253.204us        31.66%     253.204us     253.204us      13.088us         6.21%      13.088us      13.088us             1  
-                                    aten::empty_strided         3.90%      31.221us         3.90%      31.221us       5.203us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        26.02%     208.054us        26.02%     208.054us      23.117us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         1.93%      15.399us         2.47%      19.760us       2.196us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.98%       7.800us         0.98%       7.800us       0.520us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.23%       9.810us         1.23%       9.810us       3.270us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.10%       8.820us         1.10%       8.820us       2.940us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.69%       5.519us         0.86%       6.899us       2.300us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     336.127us       160.20%     336.127us     336.127us             1  
+                                            torch_eager        15.02%     112.950us        99.30%     746.873us     746.873us       0.000us         0.00%     223.453us     223.453us             1  
+                                           aten::conv1d         0.82%       6.200us        15.13%     113.793us      37.931us       0.000us         0.00%     154.078us      51.359us             3  
+                                      aten::convolution         1.21%       9.111us        14.30%     107.593us      35.864us       0.000us         0.00%     154.078us      51.359us             3  
+                                     aten::_convolution         2.76%      20.789us        13.09%      98.482us      32.827us       0.000us         0.00%     154.078us      51.359us             3  
+                                aten::_conv_depthwise2d         2.85%      21.433us         8.36%      62.872us      20.957us     154.078us        73.43%     154.078us      51.359us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     154.078us        73.43%     154.078us      51.359us             3  
+                                               aten::to         0.77%       5.760us        65.90%     495.650us      82.608us       0.000us         0.00%      69.375us      11.562us             6  
+                                         aten::_to_copy         2.76%      20.790us        65.13%     489.890us      81.648us       0.000us         0.00%      69.375us      11.562us             6  
+                                            aten::copy_         6.23%      46.852us        58.63%     441.010us      73.502us      55.743us        26.57%      69.375us      11.562us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      32.736us        15.60%      32.736us      10.912us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      23.007us        10.97%      23.007us       7.669us             3  
+                                Activity Buffer Request        28.60%     215.113us        28.60%     215.113us     215.113us      13.632us         6.50%      13.632us      13.632us             1  
+                                    aten::empty_strided         3.73%      28.090us         3.73%      28.090us       4.682us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        26.86%     202.035us        26.86%     202.035us      22.448us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.10%      15.770us         2.68%      20.191us       2.243us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.08%       8.131us         1.08%       8.131us       0.542us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.19%       8.950us         1.19%       8.950us       2.983us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.26%       9.499us         1.26%       9.499us       3.166us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.71%       5.360us         0.95%       7.120us       2.373us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 799.662us
-Self CUDA time total: 210.621us
+Self CPU time total: 752.153us
+Self CUDA time total: 209.821us
 
 
 
@@ -4703,28 +4703,28 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         6.62%     120.362us        52.56%     956.135us     956.135us       0.000us         0.00%       1.509ms       1.509ms             1  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.411ms       100.41%       1.411ms       1.411ms             1  
-                                               aten::to         0.34%       6.140us        38.13%     693.750us     115.625us       0.000us         0.00%     815.515us     135.919us             6  
-                                         aten::_to_copy         1.53%      27.810us        37.80%     687.610us     114.602us       0.000us         0.00%     815.515us     135.919us             6  
-                                            aten::copy_         2.83%      51.570us        25.68%     467.247us      77.874us     711.740us        50.66%     815.515us     135.919us             6  
-                                           aten::conv1d         0.32%       5.781us         6.36%     115.702us      38.567us       0.000us         0.00%     693.278us     231.093us             3  
-                                      aten::convolution         0.51%       9.289us         6.04%     109.921us      36.640us       0.000us         0.00%     693.278us     231.093us             3  
-                                     aten::_convolution         1.19%      21.630us         5.53%     100.632us      33.544us       0.000us         0.00%     693.278us     231.093us             3  
-                                aten::_conv_depthwise2d         1.16%      21.108us         3.52%      63.951us      21.317us     693.278us        49.34%     693.278us     231.093us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     693.278us        49.34%     693.278us     231.093us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     405.439us        28.86%     405.439us     135.146us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     306.301us        21.80%     306.301us     102.100us             3  
-                                Activity Buffer Request        12.14%     220.924us        12.14%     220.924us     220.924us     103.775us         7.39%     103.775us     103.775us             1  
-                                    aten::empty_strided         1.98%      36.051us        10.58%     192.553us      32.092us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        12.05%     219.204us        12.05%     219.204us      24.356us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.93%      16.940us         1.22%      22.200us       2.467us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.48%       8.651us         0.48%       8.651us       0.577us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.51%       9.201us         0.51%       9.201us       3.067us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.51%       9.191us         0.51%       9.191us       3.064us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.31%       5.621us         0.38%       6.871us       2.290us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         6.57%     117.922us        51.18%     918.465us     918.465us       0.000us         0.00%       1.509ms       1.509ms             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.411ms       100.40%       1.411ms       1.411ms             1  
+                                               aten::to         0.34%       6.101us        36.65%     657.591us     109.598us       0.000us         0.00%     815.747us     135.958us             6  
+                                         aten::_to_copy         1.52%      27.229us        36.31%     651.490us     108.582us       0.000us         0.00%     815.747us     135.958us             6  
+                                            aten::copy_         2.73%      48.941us        24.59%     441.187us      73.531us     712.610us        50.71%     815.747us     135.958us             6  
+                                           aten::conv1d         0.34%       6.120us         6.48%     116.312us      38.771us       0.000us         0.00%     692.768us     230.923us             3  
+                                      aten::convolution         0.51%       9.129us         6.14%     110.192us      36.731us       0.000us         0.00%     692.768us     230.923us             3  
+                                     aten::_convolution         1.19%      21.333us         5.63%     101.063us      33.688us       0.000us         0.00%     692.768us     230.923us             3  
+                                aten::_conv_depthwise2d         1.24%      22.308us         3.59%      64.481us      21.494us     692.768us        49.29%     692.768us     230.923us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     692.768us        49.29%     692.768us     230.923us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     407.041us        28.96%     407.041us     135.680us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     305.569us        21.74%     305.569us     101.856us             3  
+                                Activity Buffer Request        11.66%     209.253us        11.66%     209.253us     209.253us     103.137us         7.34%     103.137us     103.137us             1  
+                                    aten::empty_strided         1.92%      34.411us        10.20%     183.074us      30.512us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        11.50%     206.424us        11.50%     206.424us      22.936us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.94%      16.930us         1.24%      22.220us       2.469us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.50%       8.941us         0.50%       8.941us       0.596us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.52%       9.381us         0.52%       9.381us       3.127us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.52%       9.361us         0.52%       9.361us       3.120us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.31%       5.508us         0.38%       6.869us       2.290us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.819ms
+Self CPU time total: 1.794ms
 Self CUDA time total: 1.405ms
 
 
@@ -4735,29 +4735,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         6.07%     112.213us        42.26%     781.792us     781.792us       0.000us         0.00%       1.498ms       1.498ms             1  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.428ms       100.39%       1.428ms       1.428ms             1  
-                                               aten::to         0.33%       6.130us        28.74%     531.749us      88.625us       0.000us         0.00%     757.569us     126.261us             6  
-                                         aten::_to_copy         1.23%      22.780us        28.41%     525.619us      87.603us       0.000us         0.00%     757.569us     126.261us             6  
-                                            aten::copy_         2.64%      48.852us        25.56%     472.969us      78.828us     682.049us        47.95%     757.569us     126.261us             6  
-                                           aten::conv1d         0.33%       6.130us         6.13%     113.361us      37.787us       0.000us         0.00%     740.449us     246.816us             3  
-                                      aten::convolution         0.48%       8.889us         5.80%     107.231us      35.744us       0.000us         0.00%     740.449us     246.816us             3  
-                                     aten::_convolution         1.13%      20.931us         5.32%      98.342us      32.781us       0.000us         0.00%     740.449us     246.816us             3  
-                                aten::_conv_depthwise2d         1.15%      21.330us         3.38%      62.491us      20.830us     740.449us        52.05%     740.449us     246.816us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     740.449us        52.05%     740.449us     246.816us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     397.857us        27.97%     397.857us     132.619us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     284.192us        19.98%     284.192us      94.731us             3  
-                                Activity Buffer Request        12.95%     239.644us        12.95%     239.644us     239.644us      75.520us         5.31%      75.520us      75.520us             1  
-                                    aten::empty_strided         1.61%      29.870us         1.61%      29.870us       4.978us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        11.17%     206.574us        11.17%     206.574us      22.953us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.85%      15.779us         1.12%      20.809us       2.312us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.45%       8.409us         0.45%       8.409us       0.561us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.49%       9.120us         0.49%       9.120us       3.040us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.54%       9.940us         0.54%       9.940us       3.313us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.29%       5.381us         0.36%       6.700us       2.233us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         6.19%     116.332us        43.22%     812.174us     812.174us       0.000us         0.00%       1.498ms       1.498ms             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.429ms       100.40%       1.429ms       1.429ms             1  
+                                               aten::to         0.31%       5.839us        29.49%     554.079us      92.346us       0.000us         0.00%     756.667us     126.111us             6  
+                                         aten::_to_copy         1.13%      21.183us        29.18%     548.240us      91.373us       0.000us         0.00%     756.667us     126.111us             6  
+                                            aten::copy_         2.52%      47.321us        26.61%     500.037us      83.340us     681.756us        47.91%     756.667us     126.111us             6  
+                                           aten::conv1d         0.30%       5.691us         6.12%     114.963us      38.321us       0.000us         0.00%     741.276us     247.092us             3  
+                                      aten::convolution         0.46%       8.681us         5.82%     109.272us      36.424us       0.000us         0.00%     741.276us     247.092us             3  
+                                     aten::_convolution         1.12%      21.121us         5.35%     100.591us      33.530us       0.000us         0.00%     741.276us     247.092us             3  
+                                aten::_conv_depthwise2d         1.12%      21.039us         3.43%      64.490us      21.497us     741.276us        52.09%     741.276us     247.092us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     741.276us        52.09%     741.276us     247.092us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     397.598us        27.94%     397.598us     132.533us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     284.158us        19.97%     284.158us      94.719us             3  
+                                Activity Buffer Request        11.11%     208.773us        11.11%     208.773us     208.773us      74.911us         5.26%      74.911us      74.911us             1  
+                                    aten::empty_strided         1.44%      27.020us         1.44%      27.020us       4.503us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        14.26%     268.023us        14.26%     268.023us      29.780us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.93%      17.452us         1.22%      23.001us       2.556us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.48%       8.988us         0.48%       8.988us       0.599us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.52%       9.701us         0.52%       9.701us       3.234us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.51%       9.670us         0.51%       9.670us       3.223us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.29%       5.400us         0.35%       6.640us       2.213us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.850ms
-Self CUDA time total: 1.422ms
+Self CPU time total: 1.879ms
+Self CUDA time total: 1.423ms
 
 
 impl                     wl                  p50(ms)  ok