diff --git "a/causal_conv1d/impls/torch_causal_conv1d.html" "b/causal_conv1d/impls/torch_causal_conv1d.html" --- "a/causal_conv1d/impls/torch_causal_conv1d.html" +++ "b/causal_conv1d/impls/torch_causal_conv1d.html" @@ -3904,7 +3904,7 @@ Cell: nv | 0.25s
-
Fri Dec 19 18:56:39 2025       
+
Fri Dec 19 19:54:47 2025       
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 580.105.08             Driver Version: 580.105.08     CUDA Version: 13.0     |
 +-----------------------------------------+------------------------+----------------------+
@@ -3913,7 +3913,7 @@ Cell: nv | 0.25s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   33C    P0             78W /  350W |       0MiB /  46068MiB |     11%      Default |
+| N/A   37C    P0             82W /  350W |       0MiB /  46068MiB |     14%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -3935,9 +3935,9 @@ Cell: nv | 0.25s
 
 ▼ code 
 ▼ output
- ▶ uv-logs
+ ▶ uv-logs
  | 
-Cell: benchmark | 3.84s
+Cell: benchmark | 7.79s
  | 
 
 Raw
@@ -3999,29 +3999,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     420.863us      2177.59%     420.863us     420.863us             1  
-                                            torch_eager         8.44%     198.071us        99.39%       2.333ms       2.333ms       0.000us         0.00%      21.695us      21.695us             1  
-                                               aten::to         0.42%       9.821us        82.68%       1.941ms     323.474us       0.000us         0.00%      14.367us       2.395us             6  
-                                         aten::_to_copy         1.47%      34.509us        82.26%       1.931ms     321.837us       0.000us         0.00%      14.367us       2.395us             6  
-                                            aten::copy_         2.58%      60.614us        78.68%       1.847ms     307.834us      11.999us        62.08%      14.367us       2.395us             6  
-                                           aten::conv1d         0.31%       7.291us         6.62%     155.424us      51.808us       0.000us         0.00%       7.328us       2.443us             3  
-                                      aten::convolution         0.56%      13.171us         6.31%     148.133us      49.378us       0.000us         0.00%       7.328us       2.443us             3  
-                                     aten::_convolution         1.45%      33.972us         5.75%     134.962us      44.987us       0.000us         0.00%       7.328us       2.443us             3  
-                                aten::_conv_depthwise2d         1.44%      33.841us         3.61%      84.652us      28.217us       7.328us        37.92%       7.328us       2.443us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.328us        37.92%       7.328us       2.443us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.335us        32.78%       6.335us       2.112us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.664us        29.31%       5.664us       1.888us             3  
-                                Activity Buffer Request        73.26%       1.720ms        73.26%       1.720ms       1.720ms       2.368us        12.25%       2.368us       2.368us             1  
-                                    aten::empty_strided         2.11%      49.511us         2.11%      49.511us       8.252us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         3.97%      93.272us         3.97%      93.272us      10.364us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         1.01%      23.716us         1.33%      31.137us       3.460us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.50%      11.722us         0.50%      11.722us       0.781us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.50%      11.630us         0.50%      11.630us       3.877us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.53%      12.550us         0.53%      12.550us       4.183us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.29%       6.740us         0.34%       8.071us       2.690us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     436.163us      2260.26%     436.163us     436.163us             1  
+                                            torch_eager         9.04%     219.426us        99.40%       2.414ms       2.414ms       0.000us         0.00%      21.633us      21.633us             1  
+                                               aten::to         0.42%      10.088us        82.02%       1.992ms     331.955us       0.000us         0.00%      14.273us       2.379us             6  
+                                         aten::_to_copy         1.72%      41.805us        81.60%       1.982ms     330.274us       0.000us         0.00%      14.273us       2.379us             6  
+                                            aten::copy_         2.55%      61.862us        77.44%       1.881ms     313.438us      11.937us        61.86%      14.273us       2.379us             6  
+                                           aten::conv1d         0.37%       9.100us         6.66%     161.626us      53.875us       0.000us         0.00%       7.360us       2.453us             3  
+                                      aten::convolution         0.65%      15.771us         6.28%     152.526us      50.842us       0.000us         0.00%       7.360us       2.453us             3  
+                                     aten::_convolution         1.37%      33.342us         5.63%     136.755us      45.585us       0.000us         0.00%       7.360us       2.453us             3  
+                                aten::_conv_depthwise2d         1.40%      34.091us         3.35%      81.282us      27.094us       7.360us        38.14%       7.360us       2.453us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.360us        38.14%       7.360us       2.453us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.304us        32.67%       6.304us       2.101us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.633us        29.19%       5.633us       1.878us             3  
+                                Activity Buffer Request        72.13%       1.752ms        72.13%       1.752ms       1.752ms       2.336us        12.11%       2.336us       2.336us             1  
+                                    aten::empty_strided         2.44%      59.211us         2.44%      59.211us       9.868us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         3.74%      90.852us         3.74%      90.852us      10.095us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         1.11%      27.021us         1.43%      34.781us       3.865us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.54%      13.120us         0.54%      13.120us       0.875us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.50%      12.090us         0.50%      12.090us       4.030us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.47%      11.450us         0.47%      11.450us       3.817us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.39%       9.560us         0.46%      11.070us       3.690us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.348ms
-Self CUDA time total: 19.327us
+Self CPU time total: 2.428ms
+Self CUDA time total: 19.297us
 
 
 
@@ -4031,29 +4031,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     313.250us      1596.75%     313.250us     313.250us             1  
-                                            torch_eager         6.72%     145.881us        99.75%       2.164ms       2.164ms       0.000us         0.00%      21.826us      21.826us             1  
-                                               aten::to         0.27%       5.881us        86.54%       1.878ms     312.937us       0.000us         0.00%      13.921us       2.320us             6  
-                                         aten::_to_copy         1.01%      21.932us        86.27%       1.872ms     311.957us       0.000us         0.00%      13.921us       2.320us             6  
-                                            aten::copy_         2.19%      47.440us        83.99%       1.822ms     303.712us      11.713us        59.71%      13.921us       2.320us             6  
-                                           aten::conv1d         0.28%       6.160us         5.31%     115.143us      38.381us       0.000us         0.00%       7.905us       2.635us             3  
-                                      aten::convolution         0.41%       8.901us         5.02%     108.983us      36.328us       0.000us         0.00%       7.905us       2.635us             3  
-                                     aten::_convolution         1.01%      21.821us         4.61%     100.082us      33.361us       0.000us         0.00%       7.905us       2.635us             3  
-                                aten::_conv_depthwise2d         0.97%      21.100us         2.86%      62.101us      20.700us       7.905us        40.29%       7.905us       2.635us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.905us        40.29%       7.905us       2.635us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.113us        31.16%       6.113us       2.038us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.600us        28.55%       5.600us       1.867us             3  
-                                Activity Buffer Request        79.65%       1.728ms        79.65%       1.728ms       1.728ms       2.208us        11.25%       2.208us       2.208us             1  
-                                    aten::empty_strided         1.27%      27.539us         1.27%      27.539us       4.590us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         3.21%      69.681us         3.21%      69.681us       7.742us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.78%      16.828us         1.04%      22.610us       2.512us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.44%       9.512us         0.44%       9.512us       0.634us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.43%       9.331us         0.43%       9.331us       3.110us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.41%       8.800us         0.41%       8.800us       2.933us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.26%       5.580us         0.33%       7.220us       2.407us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     323.230us      1639.85%     323.230us     323.230us             1  
+                                            torch_eager         5.66%     125.716us        99.73%       2.215ms       2.215ms       0.000us         0.00%      21.919us      21.919us             1  
+                                               aten::to         0.28%       6.120us        87.62%       1.946ms     324.308us       0.000us         0.00%      13.951us       2.325us             6  
+                                         aten::_to_copy         1.09%      24.242us        87.34%       1.940ms     323.288us       0.000us         0.00%      13.951us       2.325us             6  
+                                            aten::copy_         2.08%      46.140us        84.89%       1.885ms     314.223us      11.743us        59.58%      13.951us       2.325us             6  
+                                           aten::conv1d         0.26%       5.800us         5.28%     117.333us      39.111us       0.000us         0.00%       7.968us       2.656us             3  
+                                      aten::convolution         0.51%      11.340us         5.02%     111.533us      37.178us       0.000us         0.00%       7.968us       2.656us             3  
+                                     aten::_convolution         1.02%      22.760us         4.51%     100.193us      33.398us       0.000us         0.00%       7.968us       2.656us             3  
+                                aten::_conv_depthwise2d         0.99%      21.901us         2.76%      61.233us      20.411us       7.968us        40.42%       7.968us       2.656us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.968us        40.42%       7.968us       2.656us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.176us        31.33%       6.176us       2.059us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.567us        28.24%       5.567us       1.856us             3  
+                                Activity Buffer Request        80.76%       1.794ms        80.76%       1.794ms       1.794ms       2.208us        11.20%       2.208us       2.208us             1  
+                                    aten::empty_strided         1.36%      30.150us         1.36%      30.150us       5.025us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         2.98%      66.273us         2.98%      66.273us       7.364us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.76%      16.860us         0.98%      21.769us       2.419us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.39%       8.679us         0.39%       8.679us       0.579us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.42%       9.331us         0.42%       9.331us       3.110us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.42%       9.410us         0.42%       9.410us       3.137us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.27%       6.040us         0.33%       7.320us       2.440us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.170ms
-Self CUDA time total: 19.618us
+Self CPU time total: 2.221ms
+Self CUDA time total: 19.711us
 
 
 
@@ -4063,29 +4063,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     311.776us      1662.54%     311.776us     311.776us             1  
-                                            torch_eager         6.35%     136.063us        99.73%       2.136ms       2.136ms       0.000us         0.00%      20.769us      20.769us             1  
-                                               aten::to         0.26%       5.629us        86.85%       1.860ms     310.039us       0.000us         0.00%      13.823us       2.304us             6  
-                                         aten::_to_copy         0.98%      21.032us        86.59%       1.855ms     309.100us       0.000us         0.00%      13.823us       2.304us             6  
-                                            aten::copy_         2.10%      44.987us        84.33%       1.806ms     301.043us      11.807us        62.96%      13.823us       2.304us             6  
-                                           aten::conv1d         0.27%       5.831us         5.33%     114.232us      38.077us       0.000us         0.00%       6.946us       2.315us             3  
-                                      aten::convolution         0.40%       8.671us         5.06%     108.401us      36.134us       0.000us         0.00%       6.946us       2.315us             3  
-                                     aten::_convolution         0.98%      20.999us         4.66%      99.730us      33.243us       0.000us         0.00%       6.946us       2.315us             3  
-                                aten::_conv_depthwise2d         0.97%      20.860us         2.94%      63.051us      21.017us       6.946us        37.04%       6.946us       2.315us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       6.946us        37.04%       6.946us       2.315us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.047us        32.25%       6.047us       2.016us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.760us        30.72%       5.760us       1.920us             3  
-                                Activity Buffer Request        79.94%       1.712ms        79.94%       1.712ms       1.712ms       2.016us        10.75%       2.016us       2.016us             1  
-                                    aten::empty_strided         1.28%      27.310us         1.28%      27.310us       4.552us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         3.44%      73.575us         3.44%      73.575us       8.175us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.79%      16.901us         1.05%      22.540us       2.504us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.42%       9.019us         0.42%       9.019us       0.601us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.43%       9.230us         0.43%       9.230us       3.077us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.40%       8.580us         0.40%       8.580us       2.860us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.25%       5.250us         0.31%       6.610us       2.203us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     318.337us      1694.72%     318.337us     318.337us             1  
+                                            torch_eager         5.96%     125.666us        99.74%       2.103ms       2.103ms       0.000us         0.00%      20.799us      20.799us             1  
+                                               aten::to         0.29%       6.100us        87.07%       1.836ms     305.953us       0.000us         0.00%      13.822us       2.304us             6  
+                                         aten::_to_copy         1.12%      23.630us        86.78%       1.830ms     304.936us       0.000us         0.00%      13.822us       2.304us             6  
+                                            aten::copy_         2.25%      47.463us        84.23%       1.776ms     295.964us      11.807us        62.86%      13.822us       2.304us             6  
+                                           aten::conv1d         0.29%       6.140us         5.41%     114.033us      38.011us       0.000us         0.00%       6.977us       2.326us             3  
+                                      aten::convolution         0.47%       9.960us         5.12%     107.893us      35.964us       0.000us         0.00%       6.977us       2.326us             3  
+                                     aten::_convolution         1.11%      23.480us         4.65%      97.933us      32.644us       0.000us         0.00%       6.977us       2.326us             3  
+                                aten::_conv_depthwise2d         1.00%      21.010us         2.82%      59.553us      19.851us       6.977us        37.14%       6.977us       2.326us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       6.977us        37.14%       6.977us       2.326us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.047us        32.19%       6.047us       2.016us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.760us        30.66%       5.760us       1.920us             3  
+                                Activity Buffer Request        79.84%       1.683ms        79.84%       1.683ms       1.683ms       2.015us        10.73%       2.015us       2.015us             1  
+                                    aten::empty_strided         1.43%      30.201us         1.43%      30.201us       5.033us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         3.12%      65.751us         3.12%      65.751us       7.306us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.82%      17.281us         1.06%      22.310us       2.479us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.41%       8.589us         0.41%       8.589us       0.573us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.43%       9.162us         0.43%       9.162us       3.054us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.41%       8.730us         0.41%       8.730us       2.910us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.25%       5.310us         0.31%       6.480us       2.160us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.142ms
-Self CUDA time total: 18.753us
+Self CPU time total: 2.108ms
+Self CUDA time total: 18.784us
 
 
 
@@ -4095,29 +4095,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     315.647us      1619.78%     315.647us     315.647us             1  
-                                            torch_eager         6.63%     154.700us        99.79%       2.328ms       2.328ms       0.000us         0.00%      21.599us      21.599us             1  
-                                               aten::to         0.26%       6.031us        87.19%       2.034ms     338.971us       0.000us         0.00%      13.983us       2.331us             6  
-                                         aten::_to_copy         0.93%      21.742us        86.93%       2.028ms     337.966us       0.000us         0.00%      13.983us       2.331us             6  
-                                            aten::copy_         2.02%      47.211us        84.82%       1.979ms     329.757us      11.871us        60.92%      13.983us       2.331us             6  
-                                           aten::conv1d         0.25%       5.920us         4.90%     114.312us      38.104us       0.000us         0.00%       7.616us       2.539us             3  
-                                      aten::convolution         0.39%       8.981us         4.65%     108.392us      36.131us       0.000us         0.00%       7.616us       2.539us             3  
-                                     aten::_convolution         0.89%      20.821us         4.26%      99.411us      33.137us       0.000us         0.00%       7.616us       2.539us             3  
-                                aten::_conv_depthwise2d         0.89%      20.790us         2.71%      63.250us      21.083us       7.616us        39.08%       7.616us       2.539us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.616us        39.08%       7.616us       2.539us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.143us        31.52%       6.143us       2.048us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.728us        29.39%       5.728us       1.909us             3  
-                                Activity Buffer Request        71.63%       1.671ms        71.63%       1.671ms       1.671ms       2.112us        10.84%       2.112us       2.112us             1  
-                                    aten::empty_strided         1.18%      27.509us         1.18%      27.509us       4.585us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        12.15%     283.514us        12.15%     283.514us      31.502us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.70%      16.421us         0.91%      21.332us       2.370us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.37%       8.571us         0.37%       8.571us       0.571us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.43%      10.140us         0.43%      10.140us       3.380us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.39%       9.150us         0.39%       9.150us       3.050us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.23%       5.450us         0.30%       7.100us       2.367us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     331.012us      1687.37%     331.012us     331.012us             1  
+                                            torch_eager         5.07%     123.976us        99.78%       2.439ms       2.439ms       0.000us         0.00%      21.729us      21.729us             1  
+                                               aten::to         0.26%       6.330us        88.80%       2.171ms     361.846us       0.000us         0.00%      14.016us       2.336us             6  
+                                         aten::_to_copy         0.96%      23.551us        88.54%       2.165ms     360.791us       0.000us         0.00%      14.016us       2.336us             6  
+                                            aten::copy_         1.90%      46.440us        86.33%       2.111ms     351.776us      11.904us        60.68%      14.016us       2.336us             6  
+                                           aten::conv1d         0.24%       5.859us         4.84%     118.253us      39.418us       0.000us         0.00%       7.713us       2.571us             3  
+                                      aten::convolution         0.37%       9.110us         4.60%     112.394us      37.465us       0.000us         0.00%       7.713us       2.571us             3  
+                                     aten::_convolution         0.99%      24.110us         4.22%     103.284us      34.428us       0.000us         0.00%       7.713us       2.571us             3  
+                                aten::_conv_depthwise2d         0.86%      20.950us         2.48%      60.661us      20.220us       7.713us        39.32%       7.713us       2.571us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.713us        39.32%       7.713us       2.571us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.176us        31.48%       6.176us       2.059us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.728us        29.20%       5.728us       1.909us             3  
+                                Activity Buffer Request        74.74%       1.827ms        74.74%       1.827ms       1.827ms       2.112us        10.77%       2.112us       2.112us             1  
+                                    aten::empty_strided         1.25%      30.541us         1.25%      30.541us       5.090us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        10.54%     257.579us        10.54%     257.579us      28.620us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.76%      18.570us         0.98%      23.841us       2.649us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.35%       8.661us         0.35%       8.661us       0.577us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.39%       9.560us         0.39%       9.560us       3.187us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.39%       9.470us         0.39%       9.470us       3.157us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.30%       7.292us         0.35%       8.562us       2.854us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.333ms
-Self CUDA time total: 19.487us
+Self CPU time total: 2.445ms
+Self CUDA time total: 19.617us
 
 
 
@@ -4127,29 +4127,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     316.162us      1286.41%     316.162us     316.162us             1  
-                                            torch_eager         6.08%     142.813us        99.78%       2.344ms       2.344ms       0.000us         0.00%      26.881us      26.881us             1  
-                                               aten::to         0.26%       6.099us        87.82%       2.063ms     343.816us       0.000us         0.00%      15.329us       2.555us             6  
-                                         aten::_to_copy         0.96%      22.501us        87.56%       2.057ms     342.799us       0.000us         0.00%      15.329us       2.555us             6  
-                                            aten::copy_         2.00%      46.941us        85.41%       2.006ms     334.376us      13.025us        53.00%      15.329us       2.555us             6  
-                                           aten::conv1d         0.27%       6.279us         4.78%     112.281us      37.427us       0.000us         0.00%      11.552us       3.851us             3  
-                                      aten::convolution         0.37%       8.701us         4.51%     106.002us      35.334us       0.000us         0.00%      11.552us       3.851us             3  
-                                     aten::_convolution         0.88%      20.659us         4.14%      97.301us      32.434us       0.000us         0.00%      11.552us       3.851us             3  
-                                aten::_conv_depthwise2d         0.89%      20.940us         2.63%      61.791us      20.597us      11.552us        47.00%      11.552us       3.851us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      11.552us        47.00%      11.552us       3.851us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.689us        27.22%       6.689us       2.230us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.336us        25.78%       6.336us       2.112us             3  
-                                Activity Buffer Request        74.92%       1.760ms        74.92%       1.760ms       1.760ms       2.304us         9.37%       2.304us       2.304us             1  
-                                    aten::empty_strided         1.19%      28.040us         1.19%      28.040us       4.673us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         9.45%     222.075us         9.45%     222.075us      24.675us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.70%      16.419us         0.91%      21.470us       2.386us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.36%       8.480us         0.36%       8.480us       0.565us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.40%       9.310us         0.40%       9.310us       3.103us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.38%       8.880us         0.38%       8.880us       2.960us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.24%       5.612us         0.29%       6.911us       2.304us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     323.714us      1318.86%     323.714us     323.714us             1  
+                                            torch_eager         5.46%     123.145us        99.75%       2.249ms       2.249ms       0.000us         0.00%      26.849us      26.849us             1  
+                                               aten::to         0.25%       5.731us        88.05%       1.985ms     330.837us       0.000us         0.00%      15.297us       2.550us             6  
+                                         aten::_to_copy         1.06%      23.869us        87.80%       1.979ms     329.882us       0.000us         0.00%      15.297us       2.550us             6  
+                                            aten::copy_         2.13%      47.950us        85.40%       1.925ms     320.895us      12.993us        52.94%      15.297us       2.550us             6  
+                                           aten::conv1d         0.25%       5.619us         5.08%     114.483us      38.161us       0.000us         0.00%      11.552us       3.851us             3  
+                                      aten::convolution         0.41%       9.331us         4.83%     108.864us      36.288us       0.000us         0.00%      11.552us       3.851us             3  
+                                     aten::_convolution         0.98%      22.020us         4.42%      99.533us      33.178us       0.000us         0.00%      11.552us       3.851us             3  
+                                aten::_conv_depthwise2d         0.92%      20.831us         2.69%      60.562us      20.187us      11.552us        47.06%      11.552us       3.851us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      11.552us        47.06%      11.552us       3.851us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.688us        27.25%       6.688us       2.229us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.305us        25.69%       6.305us       2.102us             3  
+                                Activity Buffer Request        74.33%       1.676ms        74.33%       1.676ms       1.676ms       2.304us         9.39%       2.304us       2.304us             1  
+                                    aten::empty_strided         1.33%      30.051us         1.33%      30.051us       5.008us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         9.91%     223.307us         9.91%     223.307us      24.812us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.79%      17.850us         1.04%      23.451us       2.606us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.40%       8.951us         0.40%       8.951us       0.597us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.40%       9.121us         0.40%       9.121us       3.040us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.40%       8.980us         0.40%       8.980us       2.993us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.27%       6.160us         0.32%       7.280us       2.427us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.349ms
-Self CUDA time total: 24.577us
+Self CPU time total: 2.254ms
+Self CUDA time total: 24.545us
 
 
 
@@ -4159,29 +4159,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     315.548us      1212.95%     315.548us     315.548us             1  
-                                            torch_eager         6.12%     137.996us        99.74%       2.248ms       2.248ms       0.000us         0.00%      28.287us      28.287us             1  
-                                               aten::to         0.25%       5.659us        87.43%       1.970ms     328.377us       0.000us         0.00%      15.295us       2.549us             6  
-                                         aten::_to_copy         0.93%      21.030us        87.17%       1.965ms     327.434us       0.000us         0.00%      15.295us       2.549us             6  
-                                            aten::copy_         2.11%      47.618us        85.03%       1.916ms     319.389us      13.023us        50.06%      15.295us       2.549us             6  
-                                           aten::conv1d         0.25%       5.679us         5.07%     114.160us      38.053us       0.000us         0.00%      12.992us       4.331us             3  
-                                      aten::convolution         0.38%       8.670us         4.81%     108.481us      36.160us       0.000us         0.00%      12.992us       4.331us             3  
-                                     aten::_convolution         0.97%      21.770us         4.43%      99.811us      33.270us       0.000us         0.00%      12.992us       4.331us             3  
-                                aten::_conv_depthwise2d         0.95%      21.450us         2.80%      63.011us      21.004us      12.992us        49.94%      12.992us       4.331us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      12.992us        49.94%      12.992us       4.331us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.687us        25.70%       6.687us       2.229us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.336us        24.36%       6.336us       2.112us             3  
-                                Activity Buffer Request        74.56%       1.680ms        74.56%       1.680ms       1.680ms       2.272us         8.73%       2.272us       2.272us             1  
-                                    aten::empty_strided         1.21%      27.241us         1.21%      27.241us       4.540us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         9.39%     211.596us         9.39%     211.596us      23.511us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.73%      16.539us         0.97%      21.840us       2.427us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.39%       8.730us         0.39%       8.730us       0.582us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.41%       9.190us         0.41%       9.190us       3.063us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.40%       9.090us         0.40%       9.090us       3.030us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.24%       5.411us         0.30%       6.770us       2.257us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     332.293us      1275.69%     332.293us     332.293us             1  
+                                            torch_eager         4.94%     125.275us        99.79%       2.531ms       2.531ms       0.000us         0.00%      28.288us      28.288us             1  
+                                               aten::to         0.24%       6.110us        89.17%       2.261ms     376.887us       0.000us         0.00%      15.199us       2.533us             6  
+                                         aten::_to_copy         0.97%      24.638us        88.93%       2.255ms     375.868us       0.000us         0.00%      15.199us       2.533us             6  
+                                            aten::copy_         1.89%      47.902us        86.77%       2.200ms     366.727us      12.959us        49.75%      15.199us       2.533us             6  
+                                           aten::conv1d         0.23%       5.820us         4.59%     116.433us      38.811us       0.000us         0.00%      13.089us       4.363us             3  
+                                      aten::convolution         0.40%      10.180us         4.36%     110.613us      36.871us       0.000us         0.00%      13.089us       4.363us             3  
+                                     aten::_convolution         0.89%      22.520us         3.96%     100.433us      33.478us       0.000us         0.00%      13.089us       4.363us             3  
+                                aten::_conv_depthwise2d         0.83%      21.002us         2.40%      60.753us      20.251us      13.089us        50.25%      13.089us       4.363us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      13.089us        50.25%      13.089us       4.363us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.591us        25.30%       6.591us       2.197us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.368us        24.45%       6.368us       2.123us             3  
+                                Activity Buffer Request        77.38%       1.962ms        77.38%       1.962ms       1.962ms       2.240us         8.60%       2.240us       2.240us             1  
+                                    aten::empty_strided         1.19%      30.211us         1.19%      30.211us       5.035us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         8.30%     210.437us         8.30%     210.437us      23.382us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.72%      18.342us         0.94%      23.791us       2.643us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.36%       9.059us         0.36%       9.059us       0.604us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.39%       9.940us         0.39%       9.940us       3.313us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.37%       9.490us         0.37%       9.490us       3.163us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.28%       7.000us         0.33%       8.470us       2.823us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.254ms
-Self CUDA time total: 26.015us
+Self CPU time total: 2.536ms
+Self CUDA time total: 26.048us
 
 
 
@@ -4191,29 +4191,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     309.088us       806.93%     309.088us     309.088us             1  
-                                            torch_eager         6.25%     144.584us        99.79%       2.308ms       2.308ms       0.000us         0.00%      40.896us      40.896us             1  
-                                           aten::conv1d         0.25%       5.670us         4.80%     111.051us      37.017us       0.000us         0.00%      22.560us       7.520us             3  
-                                      aten::convolution         0.37%       8.509us         4.56%     105.381us      35.127us       0.000us         0.00%      22.560us       7.520us             3  
-                                     aten::_convolution         0.91%      21.140us         4.19%      96.872us      32.291us       0.000us         0.00%      22.560us       7.520us             3  
-                                aten::_conv_depthwise2d         0.90%      20.721us         2.62%      60.611us      20.204us      22.560us        58.90%      22.560us       7.520us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      22.560us        58.90%      22.560us       7.520us             3  
-                                               aten::to         0.25%       5.820us        87.66%       2.027ms     337.907us       0.000us         0.00%      18.336us       3.056us             6  
-                                         aten::_to_copy         0.93%      21.482us        87.41%       2.022ms     336.937us       0.000us         0.00%      18.336us       3.056us             6  
-                                            aten::copy_         1.96%      45.321us        85.30%       1.973ms     328.809us      15.744us        41.10%      18.336us       3.056us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.416us        21.97%       8.416us       2.805us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.328us        19.13%       7.328us       2.443us             3  
-                                Activity Buffer Request        75.52%       1.747ms        75.52%       1.747ms       1.747ms       2.592us         6.77%       2.592us       2.592us             1  
-                                    aten::empty_strided         1.18%      27.288us         1.18%      27.288us       4.548us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         8.77%     202.733us         8.77%     202.733us      22.526us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.70%      16.129us         0.91%      20.960us       2.329us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.36%       8.361us         0.36%       8.361us       0.557us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.40%       9.150us         0.40%       9.150us       3.050us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.39%       8.930us         0.39%       8.930us       2.977us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.24%       5.551us         0.30%       6.831us       2.277us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     342.974us       895.45%     342.974us     342.974us             1  
+                                            torch_eager         6.28%     155.441us        99.76%       2.468ms       2.468ms       0.000us         0.00%      40.893us      40.893us             1  
+                                           aten::conv1d         0.26%       6.400us         4.96%     122.693us      40.898us       0.000us         0.00%      22.559us       7.520us             3  
+                                      aten::convolution         0.44%      10.880us         4.70%     116.293us      38.764us       0.000us         0.00%      22.559us       7.520us             3  
+                                     aten::_convolution         1.02%      25.290us         4.26%     105.413us      35.138us       0.000us         0.00%      22.559us       7.520us             3  
+                                aten::_conv_depthwise2d         0.93%      22.901us         2.53%      62.512us      20.837us      22.559us        58.90%      22.559us       7.520us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      22.559us        58.90%      22.559us       7.520us             3  
+                                               aten::to         0.27%       6.673us        87.38%       2.161ms     360.194us       0.000us         0.00%      18.334us       3.056us             6  
+                                         aten::_to_copy         1.14%      28.279us        87.11%       2.154ms     359.082us       0.000us         0.00%      18.334us       3.056us             6  
+                                            aten::copy_         2.07%      51.091us        84.63%       2.093ms     348.865us      15.743us        41.10%      18.334us       3.056us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.352us        21.81%       8.352us       2.784us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.391us        19.30%       7.391us       2.464us             3  
+                                Activity Buffer Request        70.52%       1.744ms        70.52%       1.744ms       1.744ms       2.591us         6.76%       2.591us       2.591us             1  
+                                    aten::empty_strided         1.34%      33.023us         1.34%      33.023us       5.504us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        12.90%     319.187us        12.90%     319.187us      35.465us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.77%      18.952us         0.98%      24.192us       2.688us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.37%       9.200us         0.37%       9.200us       0.613us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.37%       9.140us         0.37%       9.140us       3.047us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.37%       9.211us         0.37%       9.211us       3.070us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.27%       6.591us         0.33%       8.191us       2.730us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.313ms
-Self CUDA time total: 38.304us
+Self CPU time total: 2.473ms
+Self CUDA time total: 38.302us
 
 
 
@@ -4223,29 +4223,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     314.049us       763.16%     314.049us     314.049us             1  
-                                            torch_eager         6.27%     140.693us        99.78%       2.240ms       2.240ms       0.000us         0.00%      43.775us      43.775us             1  
-                                           aten::conv1d         0.27%       6.020us         5.00%     112.161us      37.387us       0.000us         0.00%      25.408us       8.469us             3  
-                                      aten::convolution         0.39%       8.730us         4.73%     106.141us      35.380us       0.000us         0.00%      25.408us       8.469us             3  
-                                     aten::_convolution         0.96%      21.462us         4.34%      97.411us      32.470us       0.000us         0.00%      25.408us       8.469us             3  
-                                aten::_conv_depthwise2d         0.92%      20.739us         2.73%      61.250us      20.417us      25.408us        61.74%      25.408us       8.469us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      25.408us        61.74%      25.408us       8.469us             3  
-                                               aten::to         0.26%       5.842us        87.39%       1.962ms     327.034us       0.000us         0.00%      18.367us       3.061us             6  
-                                         aten::_to_copy         0.95%      21.318us        87.13%       1.956ms     326.060us       0.000us         0.00%      18.367us       3.061us             6  
-                                            aten::copy_         2.12%      47.620us        84.95%       1.908ms     317.917us      15.743us        38.26%      18.367us       3.061us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.384us        20.37%       8.384us       2.795us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.359us        17.88%       7.359us       2.453us             3  
-                                Activity Buffer Request        74.98%       1.684ms        74.98%       1.684ms       1.684ms       2.624us         6.38%       2.624us       2.624us             1  
-                                    aten::empty_strided         1.23%      27.541us         1.23%      27.541us       4.590us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         8.81%     197.723us         8.81%     197.723us      21.969us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.72%      16.172us         0.94%      21.010us       2.334us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.37%       8.238us         0.37%       8.238us       0.549us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.46%      10.261us         0.46%      10.261us       3.420us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.39%       8.800us         0.39%       8.800us       2.933us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.24%       5.440us         0.30%       6.680us       2.227us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     320.766us       781.90%     320.766us     320.766us             1  
+                                            torch_eager         5.17%     120.612us        99.77%       2.326ms       2.326ms       0.000us         0.00%      43.616us      43.616us             1  
+                                           aten::conv1d         0.25%       5.870us         4.90%     114.223us      38.074us       0.000us         0.00%      25.312us       8.437us             3  
+                                      aten::convolution         0.44%      10.332us         4.65%     108.353us      36.118us       0.000us         0.00%      25.312us       8.437us             3  
+                                     aten::_convolution         0.97%      22.638us         4.20%      98.021us      32.674us       0.000us         0.00%      25.312us       8.437us             3  
+                                aten::_conv_depthwise2d         0.87%      20.270us         2.54%      59.242us      19.747us      25.312us        61.70%      25.312us       8.437us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      25.312us        61.70%      25.312us       8.437us             3  
+                                               aten::to         0.24%       5.591us        88.58%       2.066ms     344.257us       0.000us         0.00%      18.304us       3.051us             6  
+                                         aten::_to_copy         1.00%      23.430us        88.34%       2.060ms     343.325us       0.000us         0.00%      18.304us       3.051us             6  
+                                            aten::copy_         2.09%      48.750us        86.05%       2.006ms     334.417us      15.712us        38.30%      18.304us       3.051us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.384us        20.44%       8.384us       2.795us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.328us        17.86%       7.328us       2.443us             3  
+                                Activity Buffer Request        74.74%       1.743ms        74.74%       1.743ms       1.743ms       2.592us         6.32%       2.592us       2.592us             1  
+                                    aten::empty_strided         1.29%      30.021us         1.29%      30.021us       5.004us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        10.10%     235.617us        10.10%     235.617us      26.180us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.74%      17.250us         0.95%      22.130us       2.459us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.36%       8.290us         0.36%       8.290us       0.553us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.39%       9.191us         0.39%       9.191us       3.064us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.39%       8.990us         0.39%       8.990us       2.997us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.27%       6.321us         0.32%       7.551us       2.517us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.245ms
-Self CUDA time total: 41.151us
+Self CPU time total: 2.332ms
+Self CUDA time total: 41.024us
 
 
 
@@ -4255,29 +4255,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     321.281us       312.48%     321.281us     321.281us             1  
-                                            torch_eager         5.86%     138.423us        99.79%       2.355ms       2.355ms       0.000us         0.00%     108.833us     108.833us             1  
-                                           aten::conv1d         0.25%       5.850us         4.76%     112.341us      37.447us       0.000us         0.00%      70.720us      23.573us             3  
-                                      aten::convolution         0.36%       8.520us         4.51%     106.491us      35.497us       0.000us         0.00%      70.720us      23.573us             3  
-                                     aten::_convolution         0.88%      20.802us         4.15%      97.971us      32.657us       0.000us         0.00%      70.720us      23.573us             3  
-                                aten::_conv_depthwise2d         0.89%      20.951us         2.63%      62.101us      20.700us      70.720us        68.78%      70.720us      23.573us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      70.720us        68.78%      70.720us      23.573us             3  
-                                               aten::to         0.25%       6.012us        88.03%       2.078ms     346.293us       0.000us         0.00%      38.113us       6.352us             6  
-                                         aten::_to_copy         0.93%      22.001us        87.78%       2.072ms     345.291us       0.000us         0.00%      38.113us       6.352us             6  
-                                            aten::copy_         1.93%      45.601us        85.70%       2.023ms     337.137us      32.097us        31.22%      38.113us       6.352us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      17.601us        17.12%      17.601us       5.867us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      14.496us        14.10%      14.496us       4.832us             3  
-                                Activity Buffer Request        76.10%       1.796ms        76.10%       1.796ms       1.796ms       6.016us         5.85%       6.016us       6.016us             1  
-                                    aten::empty_strided         1.14%      26.919us         1.14%      26.919us       4.487us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         8.62%     203.563us         8.62%     203.563us      22.618us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.74%      17.469us         0.97%      22.809us       2.534us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.37%       8.760us         0.37%       8.760us       0.584us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.40%       9.490us         0.40%       9.490us       3.163us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.39%       9.140us         0.39%       9.140us       3.047us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.23%       5.370us         0.28%       6.650us       2.217us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     327.997us       319.51%     327.997us     327.997us             1  
+                                            torch_eager         5.09%     118.124us        99.77%       2.314ms       2.314ms       0.000us         0.00%     108.574us     108.574us             1  
+                                           aten::conv1d         0.23%       5.441us         4.97%     115.193us      38.398us       0.000us         0.00%      70.464us      23.488us             3  
+                                      aten::convolution         0.40%       9.290us         4.73%     109.752us      36.584us       0.000us         0.00%      70.464us      23.488us             3  
+                                     aten::_convolution         1.03%      23.820us         4.33%     100.462us      33.487us       0.000us         0.00%      70.464us      23.488us             3  
+                                aten::_conv_depthwise2d         0.94%      21.771us         2.65%      61.512us      20.504us      70.464us        68.64%      70.464us      23.488us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      70.464us        68.64%      70.464us      23.488us             3  
+                                               aten::to         0.25%       5.729us        88.48%       2.052ms     341.956us       0.000us         0.00%      38.110us       6.352us             6  
+                                         aten::_to_copy         1.03%      23.980us        88.23%       2.046ms     341.002us       0.000us         0.00%      38.110us       6.352us             6  
+                                            aten::copy_         2.02%      46.950us        85.91%       1.992ms     332.012us      32.191us        31.36%      38.110us       6.352us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      17.567us        17.11%      17.567us       5.856us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      14.624us        14.25%      14.624us       4.875us             3  
+                                Activity Buffer Request        76.17%       1.766ms        76.17%       1.766ms       1.766ms       5.919us         5.77%       5.919us       5.919us             1  
+                                    aten::empty_strided         1.29%      29.961us         1.29%      29.961us       4.993us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         8.62%     199.926us         8.62%     199.926us      22.214us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.81%      18.769us         1.04%      24.230us       2.692us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.38%       8.852us         0.38%       8.852us       0.590us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.41%       9.440us         0.41%       9.440us       3.147us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.40%       9.240us         0.40%       9.240us       3.080us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.24%       5.490us         0.29%       6.730us       2.243us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.360ms
-Self CUDA time total: 102.817us
+Self CPU time total: 2.319ms
+Self CUDA time total: 102.655us
 
 
 
@@ -4287,29 +4287,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     314.234us       278.42%     314.234us     314.234us             1  
-                                            torch_eager        14.56%     110.034us        99.36%     750.943us     750.943us       0.000us         0.00%     118.847us     118.847us             1  
-                                           aten::conv1d         0.76%       5.759us        14.83%     112.060us      37.353us       0.000us         0.00%      80.768us      26.923us             3  
-                                      aten::convolution         1.14%       8.591us        14.06%     106.301us      35.434us       0.000us         0.00%      80.768us      26.923us             3  
-                                     aten::_convolution         2.73%      20.639us        12.93%      97.710us      32.570us       0.000us         0.00%      80.768us      26.923us             3  
-                                aten::_conv_depthwise2d         2.77%      20.950us         8.22%      62.160us      20.720us      80.768us        71.56%      80.768us      26.923us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      80.768us        71.56%      80.768us      26.923us             3  
-                                               aten::to         0.75%       5.692us        66.62%     503.480us      83.913us       0.000us         0.00%      38.079us       6.346us             6  
-                                         aten::_to_copy         2.84%      21.440us        65.86%     497.788us      82.965us       0.000us         0.00%      38.079us       6.346us             6  
-                                            aten::copy_         6.27%      47.370us        59.47%     449.478us      74.913us      32.095us        28.44%      38.079us       6.346us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      17.567us        15.56%      17.567us       5.856us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      14.528us        12.87%      14.528us       4.843us             3  
-                                Activity Buffer Request        28.71%     217.014us        28.71%     217.014us     217.014us       5.984us         5.30%       5.984us       5.984us             1  
-                                    aten::empty_strided         3.56%      26.870us         3.56%      26.870us       4.478us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        27.58%     208.434us        27.58%     208.434us      23.159us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.15%      16.260us         2.82%      21.342us       2.371us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.12%       8.483us         1.12%       8.483us       0.566us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.19%       8.980us         1.19%       8.980us       2.993us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.18%       8.890us         1.18%       8.890us       2.963us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.73%       5.530us         0.90%       6.840us       2.280us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     324.863us       288.98%     324.863us     324.863us             1  
+                                            torch_eager         5.27%     120.370us        99.77%       2.281ms       2.281ms       0.000us         0.00%     118.337us     118.337us             1  
+                                           aten::conv1d         0.25%       5.800us         4.98%     113.813us      37.938us       0.000us         0.00%      80.416us      26.805us             3  
+                                      aten::convolution         0.39%       8.920us         4.72%     108.013us      36.004us       0.000us         0.00%      80.416us      26.805us             3  
+                                     aten::_convolution         0.94%      21.602us         4.33%      99.093us      33.031us       0.000us         0.00%      80.416us      26.805us             3  
+                                aten::_conv_depthwise2d         0.98%      22.480us         2.74%      62.561us      20.854us      80.416us        71.53%      80.416us      26.805us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      80.416us        71.53%      80.416us      26.805us             3  
+                                               aten::to         0.26%       6.002us        88.33%       2.019ms     336.570us       0.000us         0.00%      37.921us       6.320us             6  
+                                         aten::_to_copy         0.97%      22.148us        88.07%       2.013ms     335.570us       0.000us         0.00%      37.921us       6.320us             6  
+                                            aten::copy_         2.05%      46.852us        85.75%       1.960ms     326.725us      32.001us        28.47%      37.921us       6.320us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      17.472us        15.54%      17.472us       5.824us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      14.529us        12.92%      14.529us       4.843us             3  
+                                Activity Buffer Request        76.07%       1.739ms        76.07%       1.739ms       1.739ms       5.920us         5.27%       5.920us       5.920us             1  
+                                    aten::empty_strided         1.35%      30.921us         1.35%      30.921us       5.153us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         8.60%     196.555us         8.60%     196.555us      21.839us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.81%      18.430us         1.02%      23.240us       2.582us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.36%       8.150us         0.36%       8.150us       0.543us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.40%       9.159us         0.40%       9.159us       3.053us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.39%       8.901us         0.39%       8.901us       2.967us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.24%       5.410us         0.29%       6.670us       2.223us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 755.803us
-Self CUDA time total: 112.863us
+Self CPU time total: 2.286ms
+Self CUDA time total: 112.417us
 
 
 
@@ -4319,29 +4319,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager        13.44%     111.290us        87.96%     728.462us     728.462us       0.000us         0.00%     465.598us     465.598us             1  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     451.198us       105.76%     451.198us     451.198us             1  
-                                           aten::conv1d         0.71%       5.852us        13.50%     111.822us      37.274us       0.000us         0.00%     279.070us      93.023us             3  
-                                      aten::convolution         1.02%       8.470us        12.80%     105.970us      35.323us       0.000us         0.00%     279.070us      93.023us             3  
-                                     aten::_convolution         2.46%      20.349us        11.77%      97.500us      32.500us       0.000us         0.00%     279.070us      93.023us             3  
-                                aten::_conv_depthwise2d         2.50%      20.731us         7.53%      62.371us      20.790us     279.070us        65.41%     279.070us      93.023us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     279.070us        65.41%     279.070us      93.023us             3  
-                                               aten::to         0.69%       5.693us        58.10%     481.149us      80.192us       0.000us         0.00%     186.528us      31.088us             6  
-                                         aten::_to_copy         2.52%      20.868us        57.41%     475.456us      79.243us       0.000us         0.00%     186.528us      31.088us             6  
-                                            aten::copy_         5.92%      49.010us        51.66%     427.857us      71.310us     147.552us        34.59%     186.528us      31.088us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     107.296us        25.15%     107.296us      35.765us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.256us         9.44%      40.256us      13.419us             3  
-                                Activity Buffer Request        24.18%     200.274us        24.18%     200.274us     200.274us      38.976us         9.14%      38.976us      38.976us             1  
-                                    aten::empty_strided         3.23%      26.731us         3.23%      26.731us       4.455us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        24.27%     200.993us        24.27%     200.993us      22.333us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         1.86%      15.430us         2.41%      19.970us       2.219us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.00%       8.282us         1.00%       8.282us       0.552us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.22%      10.070us         1.22%      10.070us       3.357us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.10%       9.150us         1.10%       9.150us       3.050us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.65%       5.408us         0.83%       6.890us       2.297us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         4.92%     118.303us        96.45%       2.317ms       2.317ms       0.000us         0.00%     464.513us     464.513us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     454.847us       106.94%     454.847us     454.847us             1  
+                                           aten::conv1d         0.23%       5.570us         4.84%     116.393us      38.798us       0.000us         0.00%     280.159us      93.386us             3  
+                                      aten::convolution         0.41%       9.781us         4.61%     110.823us      36.941us       0.000us         0.00%     280.159us      93.386us             3  
+                                     aten::_convolution         0.97%      23.190us         4.21%     101.042us      33.681us       0.000us         0.00%     280.159us      93.386us             3  
+                                aten::_conv_depthwise2d         0.92%      22.172us         2.56%      61.572us      20.524us     280.159us        65.87%     280.159us      93.386us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     280.159us        65.87%     280.159us      93.386us             3  
+                                               aten::to         0.25%       6.010us        85.54%       2.055ms     342.552us       0.000us         0.00%     184.354us      30.726us             6  
+                                         aten::_to_copy         1.07%      25.702us        85.29%       2.049ms     341.550us       0.000us         0.00%     184.354us      30.726us             6  
+                                            aten::copy_         1.90%      45.633us        82.97%       1.993ms     332.247us     145.185us        34.13%     184.354us      30.726us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     104.993us        24.68%     104.993us      34.998us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.192us         9.45%      40.192us      13.397us             3  
+                                Activity Buffer Request        73.74%       1.772ms        73.74%       1.772ms       1.772ms      39.169us         9.21%      39.169us      39.169us             1  
+                                    aten::empty_strided         1.25%      30.119us         1.25%      30.119us       5.020us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         8.18%     196.512us         8.18%     196.512us      21.835us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.74%      17.761us         0.97%      23.251us       2.583us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.39%       9.350us         0.39%       9.350us       0.623us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.39%       9.400us         0.39%       9.400us       3.133us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.40%       9.580us         0.40%       9.580us       3.193us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.27%       6.480us         0.34%       8.150us       2.717us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 828.193us
-Self CUDA time total: 426.622us
+Self CPU time total: 2.403ms
+Self CUDA time total: 425.344us
 
 
 
@@ -4351,29 +4351,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager        12.75%     112.781us        87.94%     777.793us     777.793us       0.000us         0.00%     472.535us     472.535us             1  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     465.081us       106.66%     465.081us     465.081us             1  
-                                           aten::conv1d         0.64%       5.670us        12.88%     113.921us      37.974us       0.000us         0.00%     298.235us      99.412us             3  
-                                      aten::convolution         0.99%       8.800us        12.24%     108.251us      36.084us       0.000us         0.00%     298.235us      99.412us             3  
-                                     aten::_convolution         2.35%      20.829us        11.24%      99.451us      33.150us       0.000us         0.00%     298.235us      99.412us             3  
-                                aten::_conv_depthwise2d         2.33%      20.611us         7.13%      63.071us      21.024us     298.235us        68.40%     298.235us      99.412us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     298.235us        68.40%     298.235us      99.412us             3  
-                                               aten::to         0.69%       6.129us        59.45%     525.809us      87.635us       0.000us         0.00%     174.300us      29.050us             6  
-                                         aten::_to_copy         2.38%      21.040us        58.76%     519.680us      86.613us       0.000us         0.00%     174.300us      29.050us             6  
-                                            aten::copy_         5.38%      47.582us        53.27%     471.169us      78.528us     137.789us        31.60%     174.300us      29.050us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      97.534us        22.37%      97.534us      32.511us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.255us         9.23%      40.255us      13.418us             3  
-                                Activity Buffer Request        27.57%     243.834us        27.57%     243.834us     243.834us      36.511us         8.37%      36.511us      36.511us             1  
-                                    aten::empty_strided         3.11%      27.471us         3.11%      27.471us       4.579us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        23.03%     203.723us        23.03%     203.723us      22.636us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         1.93%      17.041us         2.45%      21.672us       2.408us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.93%       8.218us         0.93%       8.218us       0.548us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.01%       8.950us         1.01%       8.950us       2.983us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.08%       9.540us         1.08%       9.540us       3.180us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.64%       5.653us         0.81%       7.121us       2.374us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         5.04%     119.094us        95.77%       2.263ms       2.263ms       0.000us         0.00%     471.612us     471.612us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     465.149us       106.82%     465.149us     465.149us             1  
+                                           aten::conv1d         0.25%       5.900us         4.82%     113.812us      37.937us       0.000us         0.00%     297.789us      99.263us             3  
+                                      aten::convolution         0.42%       9.940us         4.57%     107.912us      35.971us       0.000us         0.00%     297.789us      99.263us             3  
+                                     aten::_convolution         0.91%      21.591us         4.15%      97.972us      32.657us       0.000us         0.00%     297.789us      99.263us             3  
+                                aten::_conv_depthwise2d         0.88%      20.792us         2.56%      60.392us      20.131us     297.789us        68.39%     297.789us      99.263us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     297.789us        68.39%     297.789us      99.263us             3  
+                                               aten::to         0.25%       5.882us        84.81%       2.004ms     334.050us       0.000us         0.00%     173.823us      28.970us             6  
+                                         aten::_to_copy         1.03%      24.309us        84.56%       1.998ms     333.069us       0.000us         0.00%     173.823us      28.970us             6  
+                                            aten::copy_         1.99%      46.992us        82.30%       1.945ms     324.149us     137.663us        31.61%     173.823us      28.970us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      97.280us        22.34%      97.280us      32.427us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.383us         9.27%      40.383us      13.461us             3  
+                                Activity Buffer Request        72.83%       1.721ms        72.83%       1.721ms       1.721ms      36.160us         8.30%      36.160us      36.160us             1  
+                                    aten::empty_strided         1.24%      29.211us         1.24%      29.211us       4.869us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         8.35%     197.423us         8.35%     197.423us      21.936us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.69%      16.359us         0.90%      21.340us       2.371us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.37%       8.812us         0.37%       8.812us       0.587us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.41%       9.620us         0.41%       9.620us       3.207us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.40%       9.410us         0.40%       9.410us       3.137us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.27%       6.299us         0.32%       7.670us       2.557us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 884.464us
-Self CUDA time total: 436.024us
+Self CPU time total: 2.363ms
+Self CUDA time total: 435.452us
 
 
 
@@ -4383,29 +4383,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     322.331us      1707.26%     322.331us     322.331us             1  
-                                            torch_eager        13.79%     111.240us        99.37%     801.573us     801.573us       0.000us         0.00%      20.864us      20.864us             1  
-                                               aten::to         0.71%       5.762us        67.35%     543.311us      90.552us       0.000us         0.00%      13.696us       2.283us             6  
-                                         aten::_to_copy         2.83%      22.852us        66.64%     537.549us      89.592us       0.000us         0.00%      13.696us       2.283us             6  
-                                            aten::copy_         5.86%      47.269us        60.36%     486.896us      81.149us      11.712us        62.03%      13.696us       2.283us             6  
-                                           aten::conv1d         0.71%       5.710us        14.24%     114.861us      38.287us       0.000us         0.00%       7.168us       2.389us             3  
-                                      aten::convolution         1.09%       8.781us        13.53%     109.151us      36.384us       0.000us         0.00%       7.168us       2.389us             3  
-                                     aten::_convolution         2.85%      22.969us        12.44%     100.370us      33.457us       0.000us         0.00%       7.168us       2.389us             3  
-                                aten::_conv_depthwise2d         2.60%      20.991us         7.73%      62.361us      20.787us       7.168us        37.97%       7.168us       2.389us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.168us        37.97%       7.168us       2.389us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.016us        31.86%       6.016us       2.005us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.696us        30.17%       5.696us       1.899us             3  
-                                Activity Buffer Request        32.81%     264.695us        32.81%     264.695us     264.695us       1.984us        10.51%       1.984us       1.984us             1  
-                                    aten::empty_strided         3.45%      27.801us         3.45%      27.801us       4.633us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        24.60%     198.402us        24.60%     198.402us      22.045us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.08%      16.740us         3.51%      28.341us       3.149us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.86%      15.030us         1.86%      15.030us       1.002us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.16%       9.330us         1.16%       9.330us       3.110us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.06%       8.570us         1.06%       8.570us       2.857us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.67%       5.391us         0.83%       6.660us       2.220us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     313.695us      1667.17%     313.695us     313.695us             1  
+                                            torch_eager        14.13%     113.614us        99.32%     798.339us     798.339us       0.000us         0.00%      20.800us      20.800us             1  
+                                               aten::to         0.70%       5.659us        68.10%     547.391us      91.232us       0.000us         0.00%      13.664us       2.277us             6  
+                                         aten::_to_copy         3.00%      24.130us        67.40%     541.732us      90.289us       0.000us         0.00%      13.664us       2.277us             6  
+                                            aten::copy_         5.94%      47.780us        60.67%     487.691us      81.282us      11.680us        62.07%      13.664us       2.277us             6  
+                                           aten::conv1d         0.70%       5.600us        14.11%     113.433us      37.811us       0.000us         0.00%       7.136us       2.379us             3  
+                                      aten::convolution         1.23%       9.922us        13.42%     107.833us      35.944us       0.000us         0.00%       7.136us       2.379us             3  
+                                     aten::_convolution         2.79%      22.410us        12.18%      97.911us      32.637us       0.000us         0.00%       7.136us       2.379us             3  
+                                aten::_conv_depthwise2d         2.59%      20.852us         7.49%      60.242us      20.081us       7.136us        37.93%       7.136us       2.379us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.136us        37.93%       7.136us       2.379us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       5.984us        31.80%       5.984us       1.995us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.696us        30.27%       5.696us       1.899us             3  
+                                Activity Buffer Request        32.74%     263.147us        32.74%     263.147us     263.147us       1.984us        10.54%       1.984us       1.984us             1  
+                                    aten::empty_strided         3.72%      29.911us         3.72%      29.911us       4.985us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        24.67%     198.304us        24.67%     198.304us      22.034us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         1.90%      15.298us         2.42%      19.460us       2.162us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.96%       7.694us         0.96%       7.694us       0.513us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.11%       8.950us         1.11%       8.950us       2.983us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.11%       8.900us         1.11%       8.900us       2.967us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.80%       6.399us         0.97%       7.810us       2.603us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 806.653us
-Self CUDA time total: 18.880us
+Self CPU time total: 803.779us
+Self CUDA time total: 18.816us
 
 
 
@@ -4415,29 +4415,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     310.011us      1593.40%     310.011us     310.011us             1  
-                                            torch_eager        14.19%     114.063us        99.41%     798.944us     798.944us       0.000us         0.00%      21.440us      21.440us             1  
-                                               aten::to         0.69%       5.570us        68.28%     548.719us      91.453us       0.000us         0.00%      13.535us       2.256us             6  
-                                         aten::_to_copy         2.58%      20.749us        67.58%     543.149us      90.525us       0.000us         0.00%      13.535us       2.256us             6  
-                                            aten::copy_         5.87%      47.160us        61.68%     495.718us      82.620us      11.551us        59.37%      13.535us       2.256us             6  
-                                           aten::conv1d         0.70%       5.640us        13.72%     110.252us      36.751us       0.000us         0.00%       7.905us       2.635us             3  
-                                      aten::convolution         1.10%       8.879us        13.02%     104.612us      34.871us       0.000us         0.00%       7.905us       2.635us             3  
-                                     aten::_convolution         2.60%      20.881us        11.91%      95.733us      31.911us       0.000us         0.00%       7.905us       2.635us             3  
-                                aten::_conv_depthwise2d         2.57%      20.619us         7.48%      60.111us      20.037us       7.905us        40.63%       7.905us       2.635us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.905us        40.63%       7.905us       2.635us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       5.888us        30.26%       5.888us       1.963us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.663us        29.11%       5.663us       1.888us             3  
-                                Activity Buffer Request        34.15%     274.474us        34.15%     274.474us     274.474us       1.984us        10.20%       1.984us       1.984us             1  
-                                    aten::empty_strided         3.32%      26.682us         3.32%      26.682us       4.447us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        24.32%     195.444us        24.32%     195.444us      21.716us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.12%      17.050us         2.77%      22.260us       2.473us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.06%       8.520us         1.06%       8.520us       0.568us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.13%       9.091us         1.13%       9.091us       3.030us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.12%       9.041us         1.12%       9.041us       3.014us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.64%       5.131us         0.80%       6.391us       2.130us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     315.484us      1613.56%     315.484us     315.484us             1  
+                                            torch_eager        14.44%     114.641us        99.35%     788.539us     788.539us       0.000us         0.00%      21.568us      21.568us             1  
+                                               aten::to         0.74%       5.859us        67.65%     536.963us      89.494us       0.000us         0.00%      13.632us       2.272us             6  
+                                         aten::_to_copy         3.05%      24.201us        66.91%     531.104us      88.517us       0.000us         0.00%      13.632us       2.272us             6  
+                                            aten::copy_         6.01%      47.671us        60.17%     477.592us      79.599us      11.616us        59.41%      13.632us       2.272us             6  
+                                           aten::conv1d         0.73%       5.820us        14.10%     111.923us      37.308us       0.000us         0.00%       7.936us       2.645us             3  
+                                      aten::convolution         1.13%       8.991us        13.37%     106.103us      35.368us       0.000us         0.00%       7.936us       2.645us             3  
+                                     aten::_convolution         2.72%      21.550us        12.24%      97.112us      32.371us       0.000us         0.00%       7.936us       2.645us             3  
+                                aten::_conv_depthwise2d         2.66%      21.080us         7.67%      60.892us      20.297us       7.936us        40.59%       7.936us       2.645us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.936us        40.59%       7.936us       2.645us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       5.920us        30.28%       5.920us       1.973us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.696us        29.13%       5.696us       1.899us             3  
+                                Activity Buffer Request        32.17%     255.357us        32.17%     255.357us     255.357us       2.016us        10.31%       2.016us       2.016us             1  
+                                    aten::empty_strided         3.69%      29.311us         3.69%      29.311us       4.885us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        24.68%     195.894us        24.68%     195.894us      21.766us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.03%      16.101us         2.61%      20.721us       2.302us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.02%       8.070us         1.02%       8.070us       0.538us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.14%       9.082us         1.14%       9.082us       3.027us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.18%       9.400us         1.18%       9.400us       3.133us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.66%       5.230us         0.82%       6.480us       2.160us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 803.684us
-Self CUDA time total: 19.456us
+Self CPU time total: 793.700us
+Self CUDA time total: 19.552us
 
 
 
@@ -4447,29 +4447,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     314.048us      1627.61%     314.048us     314.048us             1  
-                                            torch_eager         6.09%     141.071us        99.78%       2.312ms       2.312ms       0.000us         0.00%      21.471us      21.471us             1  
-                                               aten::to         0.25%       5.868us        87.74%       2.033ms     338.847us       0.000us         0.00%      14.239us       2.373us             6  
-                                         aten::_to_copy         0.91%      21.009us        87.49%       2.027ms     337.869us       0.000us         0.00%      14.239us       2.373us             6  
-                                            aten::copy_         2.05%      47.413us        85.30%       1.977ms     329.427us      12.063us        62.52%      14.239us       2.373us             6  
-                                           aten::conv1d         0.24%       5.651us         4.86%     112.612us      37.537us       0.000us         0.00%       7.232us       2.411us             3  
-                                      aten::convolution         0.38%       8.789us         4.62%     106.961us      35.654us       0.000us         0.00%       7.232us       2.411us             3  
-                                     aten::_convolution         0.89%      20.661us         4.24%      98.172us      32.724us       0.000us         0.00%       7.232us       2.411us             3  
-                                aten::_conv_depthwise2d         0.90%      20.862us         2.69%      62.371us      20.790us       7.232us        37.48%       7.232us       2.411us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.232us        37.48%       7.232us       2.411us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.208us        32.17%       6.208us       2.069us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.855us        30.34%       5.855us       1.952us             3  
-                                Activity Buffer Request        75.73%       1.755ms        75.73%       1.755ms       1.755ms       2.176us        11.28%       2.176us       2.176us             1  
-                                    aten::empty_strided         1.28%      29.642us         1.28%      29.642us       4.940us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         8.54%     197.852us         8.54%     197.852us      21.984us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.71%      16.502us         0.92%      21.412us       2.379us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.36%       8.400us         0.36%       8.400us       0.560us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.40%       9.250us         0.40%       9.250us       3.083us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.37%       8.629us         0.37%       8.629us       2.876us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.24%       5.530us         0.30%       6.920us       2.307us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     319.227us      1646.26%     319.227us     319.227us             1  
+                                            torch_eager        14.35%     115.202us        99.35%     797.750us     797.750us       0.000us         0.00%      21.567us      21.567us             1  
+                                               aten::to         0.73%       5.879us        67.70%     543.593us      90.599us       0.000us         0.00%      14.367us       2.394us             6  
+                                         aten::_to_copy         2.99%      23.982us        66.96%     537.714us      89.619us       0.000us         0.00%      14.367us       2.394us             6  
+                                            aten::copy_         5.85%      46.982us        60.24%     483.712us      80.619us      12.191us        62.87%      14.367us       2.394us             6  
+                                           aten::conv1d         0.69%       5.560us        14.06%     112.933us      37.644us       0.000us         0.00%       7.200us       2.400us             3  
+                                      aten::convolution         1.13%       9.060us        13.37%     107.373us      35.791us       0.000us         0.00%       7.200us       2.400us             3  
+                                     aten::_convolution         2.73%      21.891us        12.24%      98.313us      32.771us       0.000us         0.00%       7.200us       2.400us             3  
+                                aten::_conv_depthwise2d         2.58%      20.719us         7.42%      59.602us      19.867us       7.200us        37.13%       7.200us       2.400us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.200us        37.13%       7.200us       2.400us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.272us        32.34%       6.272us       2.091us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.919us        30.52%       5.919us       1.973us             3  
+                                Activity Buffer Request        32.74%     262.897us        32.74%     262.897us     262.897us       2.176us        11.22%       2.176us       2.176us             1  
+                                    aten::empty_strided         3.74%      30.020us         3.74%      30.020us       5.003us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        24.16%     194.015us        24.16%     194.015us      21.557us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.13%      17.072us         2.73%      21.901us       2.433us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.03%       8.259us         1.03%       8.259us       0.551us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.15%       9.210us         1.15%       9.210us       3.070us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.18%       9.491us         1.18%       9.491us       3.164us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.90%       7.200us         1.06%       8.540us       2.847us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.317ms
-Self CUDA time total: 19.295us
+Self CPU time total: 802.990us
+Self CUDA time total: 19.391us
 
 
 
@@ -4479,29 +4479,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     312.955us      1552.43%     312.955us     312.955us             1  
-                                            torch_eager        14.84%     110.223us        99.32%     737.732us     737.732us       0.000us         0.00%      22.399us      22.399us             1  
-                                               aten::to         0.81%       6.020us        65.77%     488.537us      81.423us       0.000us         0.00%      14.464us       2.411us             6  
-                                         aten::_to_copy         2.95%      21.919us        64.96%     482.517us      80.420us       0.000us         0.00%      14.464us       2.411us             6  
-                                            aten::copy_         6.27%      46.582us        58.16%     432.008us      72.001us      12.224us        60.64%      14.464us       2.411us             6  
-                                           aten::conv1d         0.73%       5.430us        15.33%     113.861us      37.954us       0.000us         0.00%       7.935us       2.645us             3  
-                                      aten::convolution         1.23%       9.170us        14.60%     108.431us      36.144us       0.000us         0.00%       7.935us       2.645us             3  
-                                     aten::_convolution         2.83%      21.009us        13.36%      99.261us      33.087us       0.000us         0.00%       7.935us       2.645us             3  
-                                aten::_conv_depthwise2d         2.83%      21.021us         8.51%      63.211us      21.070us       7.935us        39.36%       7.935us       2.645us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.935us        39.36%       7.935us       2.645us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.304us        31.27%       6.304us       2.101us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.920us        29.37%       5.920us       1.973us             3  
-                                Activity Buffer Request        28.04%     208.284us        28.04%     208.284us     208.284us       2.240us        11.11%       2.240us       2.240us             1  
-                                    aten::empty_strided         3.85%      28.590us         3.85%      28.590us       4.765us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        26.94%     200.092us        26.94%     200.092us      22.232us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.19%      16.299us         2.86%      21.260us       2.362us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.14%       8.481us         1.14%       8.481us       0.565us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.35%      10.000us         1.35%      10.000us       3.333us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.24%       9.240us         1.24%       9.240us       3.080us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.74%       5.520us         0.94%       6.960us       2.320us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     318.585us      1587.92%     318.585us     318.585us             1  
+                                            torch_eager        14.32%     114.402us        99.36%     793.540us     793.540us       0.000us         0.00%      22.271us      22.271us             1  
+                                               aten::to         0.74%       5.889us        67.47%     538.893us      89.816us       0.000us         0.00%      14.368us       2.395us             6  
+                                         aten::_to_copy         2.86%      22.853us        66.74%     533.004us      88.834us       0.000us         0.00%      14.368us       2.395us             6  
+                                            aten::copy_         5.91%      47.181us        60.05%     479.601us      79.933us      12.160us        60.61%      14.368us       2.395us             6  
+                                           aten::conv1d         0.72%       5.730us        14.38%     114.863us      38.288us       0.000us         0.00%       7.903us       2.634us             3  
+                                      aten::convolution         1.15%       9.210us        13.66%     109.133us      36.378us       0.000us         0.00%       7.903us       2.634us             3  
+                                     aten::_convolution         2.90%      23.191us        12.51%      99.923us      33.308us       0.000us         0.00%       7.903us       2.634us             3  
+                                aten::_conv_depthwise2d         2.58%      20.640us         7.49%      59.852us      19.951us       7.903us        39.39%       7.903us       2.634us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.903us        39.39%       7.903us       2.634us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.304us        31.42%       6.304us       2.101us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.856us        29.19%       5.856us       1.952us             3  
+                                Activity Buffer Request        32.57%     260.117us        32.57%     260.117us     260.117us       2.208us        11.01%       2.208us       2.208us             1  
+                                    aten::empty_strided         3.83%      30.550us         3.83%      30.550us       5.092us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        24.14%     192.815us        24.14%     192.815us      21.424us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.21%      17.653us         2.81%      22.412us       2.490us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.03%       8.198us         1.03%       8.198us       0.547us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.14%       9.130us         1.14%       9.130us       3.043us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.20%       9.570us         1.20%       9.570us       3.190us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.77%       6.130us         0.93%       7.460us       2.487us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 742.812us
-Self CUDA time total: 20.159us
+Self CPU time total: 798.670us
+Self CUDA time total: 20.063us
 
 
 
@@ -4511,29 +4511,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     313.245us       871.70%     313.245us     313.245us             1  
-                                            torch_eager        14.61%     110.582us        99.35%     752.253us     752.253us       0.000us         0.00%      38.527us      38.527us             1  
-                                           aten::conv1d         0.75%       5.651us        14.90%     112.841us      37.614us       0.000us         0.00%      20.160us       6.720us             3  
-                                      aten::convolution         1.16%       8.759us        14.16%     107.190us      35.730us       0.000us         0.00%      20.160us       6.720us             3  
-                                     aten::_convolution         2.72%      20.612us        13.00%      98.431us      32.810us       0.000us         0.00%      20.160us       6.720us             3  
-                                aten::_conv_depthwise2d         2.88%      21.780us         8.26%      62.540us      20.847us      20.160us        56.10%      20.160us       6.720us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      20.160us        56.10%      20.160us       6.720us             3  
-                                               aten::to         0.75%       5.701us        66.54%     503.830us      83.972us       0.000us         0.00%      18.367us       3.061us             6  
-                                         aten::_to_copy         2.88%      21.780us        65.79%     498.129us      83.021us       0.000us         0.00%      18.367us       3.061us             6  
-                                            aten::copy_         6.12%      46.334us        59.23%     448.449us      74.742us      15.775us        43.90%      18.367us       3.061us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.415us        23.42%       8.415us       2.805us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.360us        20.48%       7.360us       2.453us             3  
-                                Activity Buffer Request        29.82%     225.793us        29.82%     225.793us     225.793us       2.592us         7.21%       2.592us       2.592us             1  
-                                    aten::empty_strided         3.68%      27.900us         3.68%      27.900us       4.650us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        26.28%     198.992us        26.28%     198.992us      22.110us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.15%      16.291us         2.79%      21.110us       2.346us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.09%       8.258us         1.09%       8.258us       0.551us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.19%       9.040us         1.19%       9.040us       3.013us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.20%       9.050us         1.20%       9.050us       3.017us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.73%       5.530us         0.90%       6.810us       2.270us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     325.920us       904.53%     325.920us     325.920us             1  
+                                            torch_eager        14.34%     116.794us        99.35%     809.360us     809.360us       0.000us         0.00%      38.624us      38.624us             1  
+                                           aten::conv1d         0.72%       5.870us        14.19%     115.602us      38.534us       0.000us         0.00%      20.192us       6.731us             3  
+                                      aten::convolution         1.16%       9.420us        13.47%     109.732us      36.577us       0.000us         0.00%      20.192us       6.731us             3  
+                                     aten::_convolution         2.80%      22.850us        12.31%     100.312us      33.437us       0.000us         0.00%      20.192us       6.731us             3  
+                                aten::_conv_depthwise2d         2.58%      21.022us         7.49%      61.052us      20.351us      20.192us        56.04%      20.192us       6.731us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      20.192us        56.04%      20.192us       6.731us             3  
+                                               aten::to         0.73%       5.980us        67.60%     550.703us      91.784us       0.000us         0.00%      18.432us       3.072us             6  
+                                         aten::_to_copy         2.92%      23.789us        66.86%     544.723us      90.787us       0.000us         0.00%      18.432us       3.072us             6  
+                                            aten::copy_         5.93%      48.281us        60.23%     490.683us      81.781us      15.840us        43.96%      18.432us       3.072us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.448us        23.45%       8.448us       2.816us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.392us        20.52%       7.392us       2.464us             3  
+                                Activity Buffer Request        30.82%     251.076us        30.82%     251.076us     251.076us       2.592us         7.19%       2.592us       2.592us             1  
+                                    aten::empty_strided         3.71%      30.251us         3.71%      30.251us       5.042us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        26.06%     212.336us        26.06%     212.336us      23.593us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.07%      16.901us         2.67%      21.731us       2.415us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.01%       8.191us         1.01%       8.191us       0.546us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.17%       9.540us         1.17%       9.540us       3.180us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.16%       9.480us         1.16%       9.480us       3.160us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.88%       7.140us         1.02%       8.340us       2.780us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 757.144us
-Self CUDA time total: 35.935us
+Self CPU time total: 814.681us
+Self CUDA time total: 36.032us
 
 
 
@@ -4543,29 +4543,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     311.265us       814.68%     311.265us     311.265us             1  
-                                            torch_eager        15.57%     112.202us        99.29%     715.482us     715.482us       0.000us         0.00%      40.798us      40.798us             1  
-                                           aten::conv1d         0.79%       5.710us        15.58%     112.241us      37.414us       0.000us         0.00%      22.336us       7.445us             3  
-                                      aten::convolution         1.20%       8.631us        14.78%     106.531us      35.510us       0.000us         0.00%      22.336us       7.445us             3  
-                                     aten::_convolution         2.96%      21.310us        13.59%      97.900us      32.633us       0.000us         0.00%      22.336us       7.445us             3  
-                                aten::_conv_depthwise2d         2.84%      20.470us         8.57%      61.720us      20.573us      22.336us        58.46%      22.336us       7.445us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      22.336us        58.46%      22.336us       7.445us             3  
-                                               aten::to         0.77%       5.551us        64.63%     465.749us      77.625us       0.000us         0.00%      18.462us       3.077us             6  
-                                         aten::_to_copy         2.88%      20.749us        63.86%     460.198us      76.700us       0.000us         0.00%      18.462us       3.077us             6  
-                                            aten::copy_         6.44%      46.412us        57.18%     412.048us      68.675us      15.871us        41.54%      18.462us       3.077us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.511us        22.28%       8.511us       2.837us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.360us        19.26%       7.360us       2.453us             3  
-                                Activity Buffer Request        26.39%     190.134us        26.39%     190.134us     190.134us       2.591us         6.78%       2.591us       2.591us             1  
-                                    aten::empty_strided         3.80%      27.401us         3.80%      27.401us       4.567us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        27.38%     197.312us        27.38%     197.312us      21.924us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.26%      16.250us         2.97%      21.400us       2.378us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.19%       8.579us         1.19%       8.579us       0.572us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.47%      10.600us         1.47%      10.600us       3.533us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.23%       8.840us         1.23%       8.840us       2.947us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.77%       5.530us         0.95%       6.840us       2.280us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     322.527us       848.40%     322.527us     322.527us             1  
+                                            torch_eager         5.15%     118.831us        99.77%       2.301ms       2.301ms       0.000us         0.00%      40.608us      40.608us             1  
+                                           aten::conv1d         0.25%       5.790us         4.95%     114.162us      38.054us       0.000us         0.00%      22.272us       7.424us             3  
+                                      aten::convolution         0.40%       9.280us         4.70%     108.372us      36.124us       0.000us         0.00%      22.272us       7.424us             3  
+                                     aten::_convolution         0.97%      22.470us         4.30%      99.092us      33.031us       0.000us         0.00%      22.272us       7.424us             3  
+                                aten::_conv_depthwise2d         0.93%      21.479us         2.62%      60.532us      20.177us      22.272us        58.59%      22.272us       7.424us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      22.272us        58.59%      22.272us       7.424us             3  
+                                               aten::to         0.26%       6.081us        88.49%       2.041ms     340.127us       0.000us         0.00%      18.336us       3.056us             6  
+                                         aten::_to_copy         1.04%      23.880us        88.22%       2.035ms     339.113us       0.000us         0.00%      18.336us       3.056us             6  
+                                            aten::copy_         2.05%      47.372us        85.91%       1.981ms     330.228us      15.744us        41.41%      18.336us       3.056us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.416us        22.14%       8.416us       2.805us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.328us        19.28%       7.328us       2.443us             3  
+                                Activity Buffer Request        76.31%       1.760ms        76.31%       1.760ms       1.760ms       2.592us         6.82%       2.592us       2.592us             1  
+                                    aten::empty_strided         1.28%      29.431us         1.28%      29.431us       4.905us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         8.43%     194.465us         8.43%     194.465us      21.607us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.76%      17.531us         0.99%      22.821us       2.536us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.40%       9.119us         0.40%       9.119us       0.608us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.40%       9.181us         0.40%       9.181us       3.060us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.41%       9.511us         0.41%       9.511us       3.170us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.28%       6.360us         0.35%       8.040us       2.680us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 720.602us
-Self CUDA time total: 38.207us
+Self CPU time total: 2.306ms
+Self CUDA time total: 38.016us
 
 
 
@@ -4575,29 +4575,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     314.752us       491.08%     314.752us     314.752us             1  
-                                            torch_eager        14.41%     112.890us        99.37%     778.613us     778.613us       0.000us         0.00%      68.189us      68.189us             1  
-                                           aten::conv1d         0.75%       5.840us        14.26%     111.761us      37.254us       0.000us         0.00%      41.759us      13.920us             3  
-                                      aten::convolution         1.10%       8.641us        13.52%     105.921us      35.307us       0.000us         0.00%      41.759us      13.920us             3  
-                                     aten::_convolution         2.66%      20.830us        12.41%      97.280us      32.427us       0.000us         0.00%      41.759us      13.920us             3  
-                                aten::_conv_depthwise2d         2.71%      21.201us         7.84%      61.431us      20.477us      41.759us        65.15%      41.759us      13.920us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      41.759us        65.15%      41.759us      13.920us             3  
-                                               aten::to         0.73%       5.701us        67.43%     528.381us      88.063us       0.000us         0.00%      26.430us       4.405us             6  
-                                         aten::_to_copy         2.70%      21.142us        66.70%     522.680us      87.113us       0.000us         0.00%      26.430us       4.405us             6  
-                                            aten::copy_         6.06%      47.459us        59.50%     466.248us      77.708us      22.335us        34.85%      26.430us       4.405us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      11.903us        18.57%      11.903us       3.968us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      10.432us        16.28%      10.432us       3.477us             3  
-                                Activity Buffer Request        30.62%     239.904us        30.62%     239.904us     239.904us       4.095us         6.39%       4.095us       4.095us             1  
-                                    aten::empty_strided         4.50%      35.290us         4.50%      35.290us       5.882us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        25.58%     200.475us        25.58%     200.475us      22.275us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.10%      16.441us         2.75%      21.561us       2.396us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.10%       8.620us         1.10%       8.620us       0.575us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.18%       9.220us         1.18%       9.220us       3.073us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.20%       9.420us         1.20%       9.420us       3.140us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.71%       5.569us         0.87%       6.849us       2.283us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     334.615us       522.09%     334.615us     334.615us             1  
+                                            torch_eager        14.27%     115.903us        99.37%     807.269us     807.269us       0.000us         0.00%      68.188us      68.188us             1  
+                                           aten::conv1d         0.69%       5.580us        15.60%     126.753us      42.251us       0.000us         0.00%      41.662us      13.887us             3  
+                                      aten::convolution         1.31%      10.620us        14.92%     121.173us      40.391us       0.000us         0.00%      41.662us      13.887us             3  
+                                     aten::_convolution         3.95%      32.112us        13.61%     110.553us      36.851us       0.000us         0.00%      41.662us      13.887us             3  
+                                aten::_conv_depthwise2d         2.71%      21.990us         7.82%      63.492us      21.164us      41.662us        65.00%      41.662us      13.887us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      41.662us        65.00%      41.662us      13.887us             3  
+                                               aten::to         0.71%       5.728us        66.32%     538.813us      89.802us       0.000us         0.00%      26.526us       4.421us             6  
+                                         aten::_to_copy         2.82%      22.881us        65.62%     533.085us      88.848us       0.000us         0.00%      26.526us       4.421us             6  
+                                            aten::copy_         5.89%      47.814us        59.03%     479.514us      79.919us      22.430us        35.00%      26.526us       4.421us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      12.000us        18.72%      12.000us       4.000us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      10.430us        16.27%      10.430us       3.477us             3  
+                                Activity Buffer Request        31.09%     252.576us        31.09%     252.576us     252.576us       4.096us         6.39%       4.096us       4.096us             1  
+                                    aten::empty_strided         3.78%      30.690us         3.78%      30.690us       5.115us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        24.86%     201.964us        24.86%     201.964us      22.440us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.01%      16.320us         2.66%      21.600us       2.400us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.07%       8.700us         1.07%       8.700us       0.580us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.17%       9.541us         1.17%       9.541us       3.180us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.12%       9.121us         1.12%       9.121us       3.040us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.69%       5.569us         0.85%       6.929us       2.310us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 783.583us
-Self CUDA time total: 64.094us
+Self CPU time total: 812.390us
+Self CUDA time total: 64.092us
 
 
 
@@ -4607,29 +4607,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     341.050us       488.00%     341.050us     341.050us             1  
-                                            torch_eager        14.90%     121.512us        99.29%     809.864us     809.864us       0.000us         0.00%      73.952us      73.952us             1  
-                                           aten::conv1d         0.74%       6.030us        14.63%     119.332us      39.777us       0.000us         0.00%      47.456us      15.819us             3  
-                                      aten::convolution         1.12%       9.160us        13.89%     113.302us      37.767us       0.000us         0.00%      47.456us      15.819us             3  
-                                     aten::_convolution         2.63%      21.441us        12.77%     104.142us      34.714us       0.000us         0.00%      47.456us      15.819us             3  
-                                aten::_conv_depthwise2d         2.71%      22.133us         8.21%      66.923us      22.308us      47.456us        67.90%      47.456us      15.819us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      47.456us        67.90%      47.456us      15.819us             3  
-                                               aten::to         0.75%       6.151us        66.55%     542.789us      90.465us       0.000us         0.00%      26.496us       4.416us             6  
-                                         aten::_to_copy         2.90%      23.661us        65.79%     536.638us      89.440us       0.000us         0.00%      26.496us       4.416us             6  
-                                            aten::copy_         6.09%      49.711us        59.32%     483.818us      80.636us      22.432us        32.10%      26.496us       4.416us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      11.904us        17.03%      11.904us       3.968us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      10.528us        15.06%      10.528us       3.509us             3  
-                                Activity Buffer Request        30.67%     250.175us        30.67%     250.175us     250.175us       4.064us         5.82%       4.064us       4.064us             1  
-                                    aten::empty_strided         3.58%      29.159us         3.58%      29.159us       4.860us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        25.38%     207.002us        25.38%     207.002us      23.000us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.12%      17.271us         2.72%      22.171us       2.463us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.06%       8.630us         1.06%       8.630us       0.575us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.48%      12.040us         1.48%      12.040us       4.013us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.19%       9.680us         1.19%       9.680us       3.227us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.69%       5.638us         0.87%       7.098us       2.366us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     333.628us       479.14%     333.628us     333.628us             1  
+                                            torch_eager        20.40%     175.467us        99.34%     854.292us     854.292us       0.000us         0.00%      73.758us      73.758us             1  
+                                           aten::conv1d         0.66%       5.659us        14.42%     124.043us      41.348us       0.000us         0.00%      47.231us      15.744us             3  
+                                      aten::convolution         1.14%       9.780us        13.77%     118.384us      39.461us       0.000us         0.00%      47.231us      15.744us             3  
+                                     aten::_convolution         2.67%      22.962us        12.63%     108.604us      36.201us       0.000us         0.00%      47.231us      15.744us             3  
+                                aten::_conv_depthwise2d         2.49%      21.432us         8.04%      69.183us      23.061us      47.231us        67.83%      47.231us      15.744us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      47.231us        67.83%      47.231us      15.744us             3  
+                                               aten::to         0.69%       5.899us        61.45%     528.412us      88.069us       0.000us         0.00%      26.527us       4.421us             6  
+                                         aten::_to_copy         2.66%      22.900us        60.76%     522.513us      87.086us       0.000us         0.00%      26.527us       4.421us             6  
+                                            aten::copy_         5.49%      47.171us        54.63%     469.832us      78.305us      22.399us        32.17%      26.527us       4.421us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      11.968us        17.19%      11.968us       3.989us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      10.431us        14.98%      10.431us       3.477us             3  
+                                Activity Buffer Request        28.72%     246.977us        28.72%     246.977us     246.977us       4.128us         5.93%       4.128us       4.128us             1  
+                                    aten::empty_strided         3.46%      29.781us         3.46%      29.781us       4.964us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        22.90%     196.964us        22.90%     196.964us      21.885us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         1.98%      17.050us         2.57%      22.090us       2.454us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.01%       8.700us         1.01%       8.700us       0.580us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.95%      16.810us         1.95%      16.810us       5.603us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.12%       9.661us         1.12%       9.661us       3.220us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.74%       6.399us         0.92%       7.879us       2.626us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 815.634us
-Self CUDA time total: 69.888us
+Self CPU time total: 859.952us
+Self CUDA time total: 69.630us
 
 
 
@@ -4639,29 +4639,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     333.665us       179.68%     333.665us     333.665us             1  
-                                            torch_eager         6.31%     142.852us        99.78%       2.260ms       2.260ms       0.000us         0.00%     195.711us     195.711us             1  
-                                           aten::conv1d         0.27%       6.061us         5.13%     116.202us      38.734us       0.000us         0.00%     133.407us      44.469us             3  
-                                      aten::convolution         0.39%       8.731us         4.86%     110.141us      36.714us       0.000us         0.00%     133.407us      44.469us             3  
-                                     aten::_convolution         0.93%      21.019us         4.48%     101.410us      33.803us       0.000us         0.00%     133.407us      44.469us             3  
-                                aten::_conv_depthwise2d         0.95%      21.630us         2.86%      64.671us      21.557us     133.407us        71.84%     133.407us      44.469us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     133.407us        71.84%     133.407us      44.469us             3  
-                                               aten::to         0.26%       5.950us        87.17%       1.974ms     329.070us       0.000us         0.00%      62.304us      10.384us             6  
-                                         aten::_to_copy         0.93%      21.080us        86.91%       1.968ms     328.079us       0.000us         0.00%      62.304us      10.384us             6  
-                                            aten::copy_         2.06%      46.600us        84.72%       1.919ms     319.815us      52.288us        28.16%      62.304us      10.384us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      29.503us        15.89%      29.503us       9.834us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.785us        12.27%      22.785us       7.595us             3  
-                                Activity Buffer Request        74.89%       1.696ms        74.89%       1.696ms       1.696ms      10.016us         5.39%      10.016us      10.016us             1  
-                                    aten::empty_strided         1.26%      28.502us         1.26%      28.502us       4.750us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         8.73%     197.643us         8.73%     197.643us      21.960us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.77%      17.390us         1.01%      22.821us       2.536us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.41%       9.201us         0.41%       9.201us       0.613us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.51%      11.561us         0.51%      11.561us       3.854us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.43%       9.850us         0.43%       9.850us       3.283us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.25%       5.550us         0.31%       7.130us       2.377us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     346.806us       186.86%     346.806us     346.806us             1  
+                                            torch_eager        14.53%     117.325us        99.36%     802.090us     802.090us       0.000us         0.00%     195.516us     195.516us             1  
+                                           aten::conv1d         0.70%       5.679us        15.36%     123.982us      41.327us       0.000us         0.00%     133.212us      44.404us             3  
+                                      aten::convolution         1.23%       9.941us        14.65%     118.303us      39.434us       0.000us         0.00%     133.212us      44.404us             3  
+                                     aten::_convolution         2.76%      22.310us        13.42%     108.362us      36.121us       0.000us         0.00%     133.212us      44.404us             3  
+                                aten::_conv_depthwise2d         3.54%      28.551us         8.44%      68.171us      22.724us     133.212us        71.78%     133.212us      44.404us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     133.212us        71.78%     133.212us      44.404us             3  
+                                               aten::to         0.73%       5.930us        66.21%     534.532us      89.089us       0.000us         0.00%      62.304us      10.384us             6  
+                                         aten::_to_copy         2.86%      23.081us        65.48%     528.602us      88.100us       0.000us         0.00%      62.304us      10.384us             6  
+                                            aten::copy_         5.96%      48.102us        58.83%     474.941us      79.157us      52.384us        28.22%      62.304us      10.384us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      29.504us        15.90%      29.504us       9.835us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.880us        12.33%      22.880us       7.627us             3  
+                                Activity Buffer Request        31.11%     251.176us        31.11%     251.176us     251.176us       9.920us         5.34%       9.920us       9.920us             1  
+                                    aten::empty_strided         3.79%      30.580us         3.79%      30.580us       5.097us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        24.34%     196.463us        24.34%     196.463us      21.829us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.06%      16.611us         2.71%      21.892us       2.432us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.12%       9.041us         1.12%       9.041us       0.603us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.13%       9.090us         1.13%       9.090us       3.030us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.21%       9.730us         1.21%       9.730us       3.243us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.95%       7.689us         1.15%       9.269us       3.090us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.265ms
-Self CUDA time total: 185.695us
+Self CPU time total: 807.270us
+Self CUDA time total: 185.596us
 
 
 
@@ -4671,29 +4671,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     336.127us       160.20%     336.127us     336.127us             1  
-                                            torch_eager        15.02%     112.950us        99.30%     746.873us     746.873us       0.000us         0.00%     223.453us     223.453us             1  
-                                           aten::conv1d         0.82%       6.200us        15.13%     113.793us      37.931us       0.000us         0.00%     154.078us      51.359us             3  
-                                      aten::convolution         1.21%       9.111us        14.30%     107.593us      35.864us       0.000us         0.00%     154.078us      51.359us             3  
-                                     aten::_convolution         2.76%      20.789us        13.09%      98.482us      32.827us       0.000us         0.00%     154.078us      51.359us             3  
-                                aten::_conv_depthwise2d         2.85%      21.433us         8.36%      62.872us      20.957us     154.078us        73.43%     154.078us      51.359us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     154.078us        73.43%     154.078us      51.359us             3  
-                                               aten::to         0.77%       5.760us        65.90%     495.650us      82.608us       0.000us         0.00%      69.375us      11.562us             6  
-                                         aten::_to_copy         2.76%      20.790us        65.13%     489.890us      81.648us       0.000us         0.00%      69.375us      11.562us             6  
-                                            aten::copy_         6.23%      46.852us        58.63%     441.010us      73.502us      55.743us        26.57%      69.375us      11.562us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      32.736us        15.60%      32.736us      10.912us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      23.007us        10.97%      23.007us       7.669us             3  
-                                Activity Buffer Request        28.60%     215.113us        28.60%     215.113us     215.113us      13.632us         6.50%      13.632us      13.632us             1  
-                                    aten::empty_strided         3.73%      28.090us         3.73%      28.090us       4.682us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        26.86%     202.035us        26.86%     202.035us      22.448us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.10%      15.770us         2.68%      20.191us       2.243us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.08%       8.131us         1.08%       8.131us       0.542us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.19%       8.950us         1.19%       8.950us       2.983us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.26%       9.499us         1.26%       9.499us       3.166us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.71%       5.360us         0.95%       7.120us       2.373us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     339.579us       162.49%     339.579us     339.579us             1  
+                                            torch_eager        14.47%     115.372us        99.34%     792.230us     792.230us       0.000us         0.00%     222.202us     222.202us             1  
+                                           aten::conv1d         0.71%       5.650us        14.39%     114.783us      38.261us       0.000us         0.00%     153.629us      51.210us             3  
+                                      aten::convolution         1.12%       8.960us        13.68%     109.133us      36.378us       0.000us         0.00%     153.629us      51.210us             3  
+                                     aten::_convolution         2.81%      22.409us        12.56%     100.173us      33.391us       0.000us         0.00%     153.629us      51.210us             3  
+                                aten::_conv_depthwise2d         2.65%      21.140us         7.72%      61.571us      20.524us     153.629us        73.51%     153.629us      51.210us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     153.629us        73.51%     153.629us      51.210us             3  
+                                               aten::to         0.74%       5.880us        67.17%     535.703us      89.284us       0.000us         0.00%      68.573us      11.429us             6  
+                                         aten::_to_copy         2.96%      23.602us        66.44%     529.823us      88.304us       0.000us         0.00%      68.573us      11.429us             6  
+                                            aten::copy_         5.90%      47.080us        59.79%     476.780us      79.463us      55.357us        26.49%      68.573us      11.429us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      32.351us        15.48%      32.351us      10.784us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      23.006us        11.01%      23.006us       7.669us             3  
+                                Activity Buffer Request        31.17%     248.536us        31.17%     248.536us     248.536us      13.216us         6.32%      13.216us      13.216us             1  
+                                    aten::empty_strided         3.69%      29.441us         3.69%      29.441us       4.907us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        25.34%     202.095us        25.34%     202.095us      22.455us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.16%      17.223us         2.82%      22.464us       2.496us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.10%       8.783us         1.10%       8.783us       0.586us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.15%       9.160us         1.15%       9.160us       3.053us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.30%      10.340us         1.30%      10.340us       3.447us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.78%       6.220us         0.94%       7.501us       2.500us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 752.153us
-Self CUDA time total: 209.821us
+Self CPU time total: 797.480us
+Self CUDA time total: 208.986us
 
 
 
@@ -4703,29 +4703,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         6.57%     117.922us        51.18%     918.465us     918.465us       0.000us         0.00%       1.509ms       1.509ms             1  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.411ms       100.40%       1.411ms       1.411ms             1  
-                                               aten::to         0.34%       6.101us        36.65%     657.591us     109.598us       0.000us         0.00%     815.747us     135.958us             6  
-                                         aten::_to_copy         1.52%      27.229us        36.31%     651.490us     108.582us       0.000us         0.00%     815.747us     135.958us             6  
-                                            aten::copy_         2.73%      48.941us        24.59%     441.187us      73.531us     712.610us        50.71%     815.747us     135.958us             6  
-                                           aten::conv1d         0.34%       6.120us         6.48%     116.312us      38.771us       0.000us         0.00%     692.768us     230.923us             3  
-                                      aten::convolution         0.51%       9.129us         6.14%     110.192us      36.731us       0.000us         0.00%     692.768us     230.923us             3  
-                                     aten::_convolution         1.19%      21.333us         5.63%     101.063us      33.688us       0.000us         0.00%     692.768us     230.923us             3  
-                                aten::_conv_depthwise2d         1.24%      22.308us         3.59%      64.481us      21.494us     692.768us        49.29%     692.768us     230.923us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     692.768us        49.29%     692.768us     230.923us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     407.041us        28.96%     407.041us     135.680us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     305.569us        21.74%     305.569us     101.856us             3  
-                                Activity Buffer Request        11.66%     209.253us        11.66%     209.253us     209.253us     103.137us         7.34%     103.137us     103.137us             1  
-                                    aten::empty_strided         1.92%      34.411us        10.20%     183.074us      30.512us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        11.50%     206.424us        11.50%     206.424us      22.936us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.94%      16.930us         1.24%      22.220us       2.469us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.50%       8.941us         0.50%       8.941us       0.596us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.52%       9.381us         0.52%       9.381us       3.127us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.52%       9.361us         0.52%       9.361us       3.120us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.31%       5.508us         0.38%       6.869us       2.290us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         6.78%     123.653us        52.87%     964.884us     964.884us       0.000us         0.00%       1.509ms       1.509ms             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.412ms       100.39%       1.412ms       1.412ms             1  
+                                               aten::to         0.34%       6.170us        38.05%     694.387us     115.731us       0.000us         0.00%     816.698us     136.116us             6  
+                                         aten::_to_copy         1.57%      28.621us        37.71%     688.217us     114.703us       0.000us         0.00%     816.698us     136.116us             6  
+                                            aten::copy_         2.79%      50.981us        25.74%     469.702us      78.284us     713.755us        50.75%     816.698us     136.116us             6  
+                                           aten::conv1d         0.31%       5.748us         6.52%     119.002us      39.667us       0.000us         0.00%     692.571us     230.857us             3  
+                                      aten::convolution         0.58%      10.581us         6.21%     113.254us      37.751us       0.000us         0.00%     692.571us     230.857us             3  
+                                     aten::_convolution         1.23%      22.532us         5.63%     102.673us      34.224us       0.000us         0.00%     692.571us     230.857us             3  
+                                aten::_conv_depthwise2d         1.21%      22.101us         3.46%      63.221us      21.074us     692.571us        49.25%     692.571us     230.857us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     692.571us        49.25%     692.571us     230.857us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     406.878us        28.93%     406.878us     135.626us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     306.877us        21.82%     306.877us     102.292us             3  
+                                Activity Buffer Request        13.13%     239.566us        13.13%     239.566us     239.566us     102.943us         7.32%     102.943us     102.943us             1  
+                                    aten::empty_strided         2.01%      36.730us        10.40%     189.894us      31.649us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        11.02%     201.035us        11.02%     201.035us      22.337us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.99%      18.011us         1.28%      23.402us       2.600us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.50%       9.042us         0.50%       9.042us       0.603us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.52%       9.410us         0.52%       9.410us       3.137us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.54%       9.830us         0.54%       9.830us       3.277us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.39%       7.200us         0.47%       8.510us       2.837us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.794ms
-Self CUDA time total: 1.405ms
+Self CPU time total: 1.825ms
+Self CUDA time total: 1.406ms
 
 
 
@@ -4735,29 +4735,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         6.19%     116.332us        43.22%     812.174us     812.174us       0.000us         0.00%       1.498ms       1.498ms             1  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.429ms       100.40%       1.429ms       1.429ms             1  
-                                               aten::to         0.31%       5.839us        29.49%     554.079us      92.346us       0.000us         0.00%     756.667us     126.111us             6  
-                                         aten::_to_copy         1.13%      21.183us        29.18%     548.240us      91.373us       0.000us         0.00%     756.667us     126.111us             6  
-                                            aten::copy_         2.52%      47.321us        26.61%     500.037us      83.340us     681.756us        47.91%     756.667us     126.111us             6  
-                                           aten::conv1d         0.30%       5.691us         6.12%     114.963us      38.321us       0.000us         0.00%     741.276us     247.092us             3  
-                                      aten::convolution         0.46%       8.681us         5.82%     109.272us      36.424us       0.000us         0.00%     741.276us     247.092us             3  
-                                     aten::_convolution         1.12%      21.121us         5.35%     100.591us      33.530us       0.000us         0.00%     741.276us     247.092us             3  
-                                aten::_conv_depthwise2d         1.12%      21.039us         3.43%      64.490us      21.497us     741.276us        52.09%     741.276us     247.092us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     741.276us        52.09%     741.276us     247.092us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     397.598us        27.94%     397.598us     132.533us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     284.158us        19.97%     284.158us      94.719us             3  
-                                Activity Buffer Request        11.11%     208.773us        11.11%     208.773us     208.773us      74.911us         5.26%      74.911us      74.911us             1  
-                                    aten::empty_strided         1.44%      27.020us         1.44%      27.020us       4.503us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        14.26%     268.023us        14.26%     268.023us      29.780us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.93%      17.452us         1.22%      23.001us       2.556us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.48%       8.988us         0.48%       8.988us       0.599us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.52%       9.701us         0.52%       9.701us       3.234us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.51%       9.670us         0.51%       9.670us       3.223us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.29%       5.400us         0.35%       6.640us       2.213us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         3.67%     125.416us        69.04%       2.359ms       2.359ms       0.000us         0.00%       1.503ms       1.503ms             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.434ms       100.40%       1.434ms       1.434ms             1  
+                                               aten::to         0.17%       5.940us        61.15%       2.089ms     348.142us       0.000us         0.00%     760.702us     126.784us             6  
+                                         aten::_to_copy         0.72%      24.663us        60.97%       2.083ms     347.152us       0.000us         0.00%     760.702us     126.784us             6  
+                                            aten::copy_         1.35%      46.170us        59.36%       2.028ms     337.971us     686.110us        48.03%     760.702us     126.784us             6  
+                                           aten::conv1d         0.17%       5.960us         3.43%     117.002us      39.001us       0.000us         0.00%     742.490us     247.497us             3  
+                                      aten::convolution         0.28%       9.489us         3.25%     111.042us      37.014us       0.000us         0.00%     742.490us     247.497us             3  
+                                     aten::_convolution         0.69%      23.709us         2.97%     101.553us      33.851us       0.000us         0.00%     742.490us     247.497us             3  
+                                aten::_conv_depthwise2d         0.61%      20.921us         1.79%      61.223us      20.408us     742.490us        51.97%     742.490us     247.497us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     742.490us        51.97%     742.490us     247.497us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     400.959us        28.07%     400.959us     133.653us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     285.151us        19.96%     285.151us      95.050us             3  
+                                Activity Buffer Request        52.88%       1.806ms        52.88%       1.806ms       1.806ms      74.592us         5.22%      74.592us      74.592us             1  
+                                    aten::empty_strided         0.89%      30.420us         0.89%      30.420us       5.070us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         5.75%     196.514us         5.75%     196.514us      21.835us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.52%      17.690us         0.68%      23.179us       2.575us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.27%       9.290us         0.27%       9.290us       0.619us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.28%       9.521us         0.28%       9.521us       3.174us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.28%       9.690us         0.28%       9.690us       3.230us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.19%       6.410us         0.23%       7.991us       2.664us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.879ms
-Self CUDA time total: 1.423ms
+Self CPU time total: 3.416ms
+Self CUDA time total: 1.429ms
 
 
 impl                     wl                  p50(ms)  ok
@@ -4775,7 +4775,7 @@ torch_eager              cuda_B2_D64_S512_W2     0.08  True
 torch_eager              cuda_B2_D64_S512_W4     0.08  True
 torch_eager              cuda_B4_D2048_S128_W2     0.08  True
 torch_eager              cuda_B4_D2048_S128_W4     0.08  True
-torch_eager              cuda_B4_D2048_S2048_W2     0.48  True
+torch_eager              cuda_B4_D2048_S2048_W2     0.49  True
 torch_eager              cuda_B4_D2048_S2048_W4     0.50  True
 torch_eager              cuda_B4_D2048_S512_W2     0.09  True
 torch_eager              cuda_B4_D2048_S512_W4     0.10  True
@@ -4786,6 +4786,12 @@ torch_eager              cuda_B4_D64_S2048_W4     0.08  True
 torch_eager              cuda_B4_D64_S512_W2     0.08  True
 torch_eager              cuda_B4_D64_S512_W4     0.08  True
 
+
+
▶ UV Install Logs
+ +

Artifacts:

causal_conv1d.jsonl