diff --git "a/causal_conv1d/impls/torch_causal_conv1d.html" "b/causal_conv1d/impls/torch_causal_conv1d.html" --- "a/causal_conv1d/impls/torch_causal_conv1d.html" +++ "b/causal_conv1d/impls/torch_causal_conv1d.html" @@ -809,6 +809,14 @@ .artifact-preview svg { background: transparent; } + /* Invert SVG images in dark mode */ + :root[data-theme="dark"] .artifact-preview img[src$=".svg"] { + filter: invert(0.9) hue-rotate(180deg); + } + /* Keep SVG images readable in monocolor mode */ + :root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] { + filter: none; + } /* CSV table styling */ .artifact-csv { margin-top: 1rem; @@ -3871,7 +3879,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: ▼ output ▶ uv-logs | -Cell: nv | 0.24s +Cell: nv | 0.21s | Raw @@ -3887,7 +3895,7 @@ Cell: nv | 0.24s
-
Wed Oct 29 14:27:09 2025       
+
Wed Oct 29 15:50:16 2025       
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+
@@ -3896,7 +3904,7 @@ Cell: nv | 0.24s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   33C    P0            109W /  350W |       0MiB /  46068MiB |    100%      Default |
+| N/A   29C    P0             78W /  350W |       0MiB /  46068MiB |     18%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -3918,9 +3926,9 @@ Cell: nv | 0.24s
 
 ▼ code 
 ▼ output
- ▶ uv-logs
+ ▶ uv-logs
  | 
-Cell: benchmark | 7.23s
+Cell: benchmark | 3.67s
  | 
 
 Raw
@@ -3982,29 +3990,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     465.824us      2410.10%     465.824us     465.824us             1  
-                                            torch_eager        10.38%     221.098us        99.69%       2.123ms       2.123ms       0.000us         0.00%      21.632us      21.632us             1  
-                                               aten::to         0.54%      11.460us        78.80%       1.678ms     279.633us       0.000us         0.00%      14.304us       2.384us             6  
-                                         aten::_to_copy         2.14%      45.672us        78.26%       1.666ms     277.723us       0.000us         0.00%      14.304us       2.384us             6  
-                                            aten::copy_         2.97%      63.201us        73.51%       1.565ms     260.883us      12.000us        62.09%      14.304us       2.384us             6  
-                                           aten::conv1d         0.45%       9.560us         8.33%     177.314us      59.105us       0.000us         0.00%       7.328us       2.443us             3  
-                                      aten::convolution         0.76%      16.270us         7.88%     167.754us      55.918us       0.000us         0.00%       7.328us       2.443us             3  
-                                     aten::_convolution         1.63%      34.781us         7.11%     151.484us      50.495us       0.000us         0.00%       7.328us       2.443us             3  
-                                aten::_conv_depthwise2d         2.18%      46.460us         4.51%      96.001us      32.000us       7.328us        37.91%       7.328us       2.443us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.328us        37.91%       7.328us       2.443us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.272us        32.45%       6.272us       2.091us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.728us        29.64%       5.728us       1.909us             3  
-                                Activity Buffer Request        67.39%       1.435ms        67.39%       1.435ms       1.435ms       2.304us        11.92%       2.304us       2.304us             1  
-                                    aten::empty_strided         2.60%      55.371us         2.60%      55.371us       9.228us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         4.37%      93.031us         4.37%      93.031us      10.337us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         1.44%      30.589us         1.81%      38.620us       4.291us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.63%      13.371us         0.63%      13.371us       0.891us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.55%      11.811us         0.55%      11.811us       3.937us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.56%      11.940us         0.56%      11.940us       3.980us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.37%       7.972us         0.46%       9.712us       3.237us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     430.557us      2231.44%     430.557us     430.557us             1  
+                                            torch_eager        10.26%     219.304us        99.68%       2.131ms       2.131ms       0.000us         0.00%      21.630us      21.630us             1  
+                                               aten::to         0.52%      11.162us        80.31%       1.717ms     286.188us       0.000us         0.00%      14.269us       2.378us             6  
+                                         aten::_to_copy         1.64%      35.099us        79.78%       1.706ms     284.328us       0.000us         0.00%      14.269us       2.378us             6  
+                                            aten::copy_         2.90%      61.950us        75.45%       1.613ms     268.888us      11.934us        61.85%      14.269us       2.378us             6  
+                                           aten::conv1d         0.34%       7.309us         7.15%     152.793us      50.931us       0.000us         0.00%       7.361us       2.454us             3  
+                                      aten::convolution         0.74%      15.802us         6.80%     145.484us      48.495us       0.000us         0.00%       7.361us       2.454us             3  
+                                     aten::_convolution         1.46%      31.300us         6.06%     129.682us      43.227us       0.000us         0.00%       7.361us       2.454us             3  
+                                aten::_conv_depthwise2d         1.58%      33.771us         3.83%      81.842us      27.281us       7.361us        38.15%       7.361us       2.454us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.361us        38.15%       7.361us       2.454us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.271us        32.50%       6.271us       2.090us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.663us        29.35%       5.663us       1.888us             3  
+                                Activity Buffer Request        69.43%       1.484ms        69.43%       1.484ms       1.484ms       2.335us        12.10%       2.335us       2.335us             1  
+                                    aten::empty_strided         2.69%      57.542us         2.69%      57.542us       9.590us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         4.29%      91.753us         4.29%      91.753us      10.195us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         1.26%      26.879us         1.60%      34.300us       3.811us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.54%      11.581us         0.54%      11.581us       0.772us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.53%      11.370us         0.53%      11.370us       3.790us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.55%      11.861us         0.55%      11.861us       3.954us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.32%       6.790us         0.37%       8.000us       2.667us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.129ms
-Self CUDA time total: 19.328us
+Self CPU time total: 2.138ms
+Self CUDA time total: 19.295us
 
 
 
@@ -4014,29 +4022,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     332.863us      1691.38%     332.863us     332.863us             1  
-                                            torch_eager         6.60%     126.115us        99.71%       1.906ms       1.906ms       0.000us         0.00%      21.792us      21.792us             1  
-                                               aten::to         0.31%       5.930us        85.54%       1.635ms     272.467us       0.000us         0.00%      13.760us       2.293us             6  
-                                         aten::_to_copy         1.30%      24.791us        85.23%       1.629ms     271.478us       0.000us         0.00%      13.760us       2.293us             6  
-                                            aten::copy_         2.71%      51.809us        82.30%       1.573ms     262.158us      11.648us        59.19%      13.760us       2.293us             6  
-                                           aten::conv1d         0.31%       5.929us         6.17%     117.852us      39.284us       0.000us         0.00%       8.032us       2.677us             3  
-                                      aten::convolution         0.53%      10.111us         5.86%     111.923us      37.308us       0.000us         0.00%       8.032us       2.677us             3  
-                                     aten::_convolution         1.20%      22.951us         5.33%     101.812us      33.937us       0.000us         0.00%       8.032us       2.677us             3  
-                                aten::_conv_depthwise2d         1.20%      22.860us         3.35%      64.021us      21.340us       8.032us        40.81%       8.032us       2.677us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       8.032us        40.81%       8.032us       2.677us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.080us        30.89%       6.080us       2.027us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.568us        28.29%       5.568us       1.856us             3  
-                                Activity Buffer Request        77.00%       1.472ms        77.00%       1.472ms       1.472ms       2.112us        10.73%       2.112us       2.112us             1  
-                                    aten::empty_strided         1.63%      31.132us         1.63%      31.132us       5.189us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         3.70%      70.762us         3.70%      70.762us       7.862us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.87%      16.659us         1.16%      22.190us       2.466us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.46%       8.781us         0.46%       8.781us       0.585us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.55%      10.521us         0.55%      10.521us       3.507us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.49%       9.390us         0.49%       9.390us       3.130us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.29%       5.540us         0.35%       6.670us       2.223us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     335.581us      1716.35%     335.581us     335.581us             1  
+                                            torch_eager         7.79%     148.313us        99.72%       1.898ms       1.898ms       0.000us         0.00%      21.664us      21.664us             1  
+                                               aten::to         0.37%       7.132us        84.25%       1.604ms     267.308us       0.000us         0.00%      13.760us       2.293us             6  
+                                         aten::_to_copy         1.28%      24.280us        83.88%       1.597ms     266.119us       0.000us         0.00%      13.760us       2.293us             6  
+                                            aten::copy_         2.64%      50.321us        81.08%       1.543ms     257.239us      11.648us        59.57%      13.760us       2.293us             6  
+                                           aten::conv1d         0.32%       6.130us         6.26%     119.243us      39.748us       0.000us         0.00%       7.904us       2.635us             3  
+                                      aten::convolution         0.58%      11.131us         5.94%     113.113us      37.704us       0.000us         0.00%       7.904us       2.635us             3  
+                                     aten::_convolution         1.18%      22.459us         5.36%     101.982us      33.994us       0.000us         0.00%       7.904us       2.635us             3  
+                                aten::_conv_depthwise2d         1.08%      20.592us         3.32%      63.132us      21.044us       7.904us        40.43%       7.904us       2.635us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.904us        40.43%       7.904us       2.635us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.080us        31.10%       6.080us       2.027us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.568us        28.48%       5.568us       1.856us             3  
+                                Activity Buffer Request        75.89%       1.445ms        75.89%       1.445ms       1.445ms       2.112us        10.80%       2.112us       2.112us             1  
+                                    aten::empty_strided         1.52%      29.001us         1.52%      29.001us       4.834us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         3.69%      70.220us         3.69%      70.220us       7.802us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.92%      17.542us         1.22%      23.251us       2.583us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.48%       9.139us         0.48%       9.139us       0.609us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.58%      11.060us         0.58%      11.060us       3.687us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.51%       9.700us         0.51%       9.700us       3.233us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.31%       5.860us         0.37%       7.020us       2.340us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.911ms
-Self CUDA time total: 19.680us
+Self CPU time total: 1.904ms
+Self CUDA time total: 19.552us
 
 
 
@@ -4046,29 +4054,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     355.199us      1913.89%     355.199us     355.199us             1  
-                                            torch_eager         6.67%     125.171us        99.71%       1.872ms       1.872ms       0.000us         0.00%      20.511us      20.511us             1  
-                                               aten::to         0.32%       6.091us        84.23%       1.581ms     263.570us       0.000us         0.00%      13.600us       2.267us             6  
-                                         aten::_to_copy         1.32%      24.859us        83.90%       1.575ms     262.555us       0.000us         0.00%      13.600us       2.267us             6  
-                                            aten::copy_         2.70%      50.760us        80.88%       1.518ms     253.083us      11.648us        62.76%      13.600us       2.267us             6  
-                                           aten::conv1d         0.30%       5.670us         7.37%     138.423us      46.141us       0.000us         0.00%       6.911us       2.304us             3  
-                                      aten::convolution         0.52%       9.720us         7.07%     132.753us      44.251us       0.000us         0.00%       6.911us       2.304us             3  
-                                     aten::_convolution         1.24%      23.210us         6.55%     123.033us      41.011us       0.000us         0.00%       6.911us       2.304us             3  
-                                aten::_conv_depthwise2d         1.26%      23.712us         4.48%      84.033us      28.011us       6.911us        37.24%       6.911us       2.304us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       6.911us        37.24%       6.911us       2.304us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       5.984us        32.24%       5.984us       1.995us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.664us        30.52%       5.664us       1.888us             3  
-                                Activity Buffer Request        75.59%       1.419ms        75.59%       1.419ms       1.419ms       1.952us        10.52%       1.952us       1.952us             1  
-                                    aten::empty_strided         1.70%      31.973us         1.70%      31.973us       5.329us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         3.83%      72.002us         3.83%      72.002us       8.000us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.89%      16.661us         1.15%      21.682us       2.409us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.48%       8.941us         0.48%       8.941us       0.596us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.49%      28.041us         1.49%      28.041us       9.347us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.47%       8.840us         0.47%       8.840us       2.947us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.32%       5.960us         0.40%       7.470us       2.490us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     343.036us      1844.97%     343.036us     343.036us             1  
+                                            torch_eager         7.68%     146.161us        99.72%       1.897ms       1.897ms       0.000us         0.00%      20.481us      20.481us             1  
+                                               aten::to         0.37%       6.953us        83.90%       1.596ms     266.066us       0.000us         0.00%      13.536us       2.256us             6  
+                                         aten::_to_copy         1.25%      23.842us        83.53%       1.589ms     264.907us       0.000us         0.00%      13.536us       2.256us             6  
+                                            aten::copy_         2.67%      50.789us        80.65%       1.535ms     255.762us      11.648us        62.65%      13.536us       2.256us             6  
+                                           aten::conv1d         0.33%       6.290us         6.66%     126.782us      42.261us       0.000us         0.00%       6.945us       2.315us             3  
+                                      aten::convolution         0.51%       9.650us         6.33%     120.492us      40.164us       0.000us         0.00%       6.945us       2.315us             3  
+                                     aten::_convolution         1.22%      23.120us         5.83%     110.842us      36.947us       0.000us         0.00%       6.945us       2.315us             3  
+                                aten::_conv_depthwise2d         1.44%      27.490us         3.78%      71.832us      23.944us       6.945us        37.35%       6.945us       2.315us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       6.945us        37.35%       6.945us       2.315us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       5.952us        32.01%       5.952us       1.984us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.696us        30.64%       5.696us       1.899us             3  
+                                Activity Buffer Request        75.34%       1.433ms        75.34%       1.433ms       1.433ms       1.888us        10.15%       1.888us       1.888us             1  
+                                    aten::empty_strided         1.63%      31.030us         1.63%      31.030us       5.172us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         3.99%      75.911us         3.99%      75.911us       8.435us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.90%      17.191us         1.18%      22.431us       2.492us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.46%       8.831us         0.46%       8.831us       0.589us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.55%      10.511us         0.55%      10.511us       3.504us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.43%       8.261us         0.43%       8.261us       2.754us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.31%       5.930us         0.38%       7.310us       2.437us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.878ms
-Self CUDA time total: 18.559us
+Self CPU time total: 1.903ms
+Self CUDA time total: 18.593us
 
 
 
@@ -4078,29 +4086,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     335.841us      1714.87%     335.841us     335.841us             1  
-                                            torch_eager         6.09%     125.084us        99.75%       2.047ms       2.047ms       0.000us         0.00%      21.728us      21.728us             1  
-                                               aten::to         0.29%       6.012us        86.59%       1.777ms     296.210us       0.000us         0.00%      14.049us       2.341us             6  
-                                         aten::_to_copy         1.18%      24.318us        86.30%       1.771ms     295.209us       0.000us         0.00%      14.049us       2.341us             6  
-                                            aten::copy_         2.44%      50.170us        83.64%       1.717ms     286.105us      11.905us        60.79%      14.049us       2.341us             6  
-                                           aten::conv1d         0.29%       5.981us         5.73%     117.633us      39.211us       0.000us         0.00%       7.679us       2.560us             3  
-                                      aten::convolution         0.48%       9.909us         5.44%     111.652us      37.217us       0.000us         0.00%       7.679us       2.560us             3  
-                                     aten::_convolution         1.11%      22.712us         4.96%     101.743us      33.914us       0.000us         0.00%       7.679us       2.560us             3  
-                                aten::_conv_depthwise2d         1.08%      22.231us         3.11%      63.781us      21.260us       7.679us        39.21%       7.679us       2.560us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.679us        39.21%       7.679us       2.560us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.176us        31.54%       6.176us       2.059us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.729us        29.25%       5.729us       1.910us             3  
-                                Activity Buffer Request        70.17%       1.440ms        70.17%       1.440ms       1.440ms       2.144us        10.95%       2.144us       2.144us             1  
-                                    aten::empty_strided         1.48%      30.301us         1.48%      30.301us       5.050us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        12.02%     246.676us        12.02%     246.676us      27.408us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.85%      17.450us         1.12%      22.930us       2.548us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.44%       8.940us         0.44%       8.940us       0.596us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.47%       9.630us         0.47%       9.630us       3.210us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.56%      11.490us         0.56%      11.490us       3.830us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.28%       5.710us         0.34%       6.930us       2.310us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     329.535us      1679.93%     329.535us     329.535us             1  
+                                            torch_eager         6.84%     137.444us        99.71%       2.005ms       2.005ms       0.000us         0.00%      21.760us      21.760us             1  
+                                               aten::to         0.33%       6.580us        85.73%       1.724ms     287.253us       0.000us         0.00%      14.048us       2.341us             6  
+                                         aten::_to_copy         1.15%      23.069us        85.41%       1.717ms     286.156us       0.000us         0.00%      14.048us       2.341us             6  
+                                            aten::copy_         2.51%      50.362us        82.79%       1.664ms     277.390us      11.904us        60.69%      14.048us       2.341us             6  
+                                           aten::conv1d         0.28%       5.589us         5.76%     115.862us      38.621us       0.000us         0.00%       7.712us       2.571us             3  
+                                      aten::convolution         0.52%      10.381us         5.49%     110.273us      36.758us       0.000us         0.00%       7.712us       2.571us             3  
+                                     aten::_convolution         1.13%      22.651us         4.97%      99.892us      33.297us       0.000us         0.00%       7.712us       2.571us             3  
+                                aten::_conv_depthwise2d         1.01%      20.392us         3.08%      61.981us      20.660us       7.712us        39.31%       7.712us       2.571us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.712us        39.31%       7.712us       2.571us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.176us        31.48%       6.176us       2.059us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.728us        29.20%       5.728us       1.909us             3  
+                                Activity Buffer Request        70.54%       1.418ms        70.54%       1.418ms       1.418ms       2.144us        10.93%       2.144us       2.144us             1  
+                                    aten::empty_strided         1.47%      29.531us         1.47%      29.531us       4.922us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        10.85%     218.024us        10.85%     218.024us      24.225us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.87%      17.580us         1.13%      22.650us       2.517us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.42%       8.370us         0.42%       8.370us       0.558us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.48%       9.699us         0.48%       9.699us       3.233us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.49%       9.760us         0.49%       9.760us       3.253us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.28%       5.670us         0.34%       6.790us       2.263us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.053ms
-Self CUDA time total: 19.584us
+Self CPU time total: 2.010ms
+Self CUDA time total: 19.616us
 
 
 
@@ -4110,29 +4118,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     329.565us      1339.31%     329.565us     329.565us             1  
-                                            torch_eager         6.13%     122.184us        99.75%       1.990ms       1.990ms       0.000us         0.00%      26.911us      26.911us             1  
-                                               aten::to         0.30%       5.979us        86.40%       1.724ms     287.259us       0.000us         0.00%      15.359us       2.560us             6  
-                                         aten::_to_copy         1.37%      27.300us        86.10%       1.718ms     286.262us       0.000us         0.00%      15.359us       2.560us             6  
-                                            aten::copy_         2.45%      48.801us        83.22%       1.660ms     276.655us      13.055us        53.05%      15.359us       2.560us             6  
-                                           aten::conv1d         0.29%       5.841us         5.86%     116.932us      38.977us       0.000us         0.00%      11.552us       3.851us             3  
-                                      aten::convolution         0.50%       9.929us         5.57%     111.091us      37.030us       0.000us         0.00%      11.552us       3.851us             3  
-                                     aten::_convolution         1.16%      23.192us         5.07%     101.162us      33.721us       0.000us         0.00%      11.552us       3.851us             3  
-                                aten::_conv_depthwise2d         1.12%      22.341us         3.11%      62.030us      20.677us      11.552us        46.95%      11.552us       3.851us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      11.552us        46.95%      11.552us       3.851us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.688us        27.18%       6.688us       2.229us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.367us        25.87%       6.367us       2.122us             3  
-                                Activity Buffer Request        71.71%       1.430ms        71.71%       1.430ms       1.430ms       2.304us         9.36%       2.304us       2.304us             1  
-                                    aten::empty_strided         1.52%      30.342us         1.52%      30.342us       5.057us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        10.06%     200.744us        10.06%     200.744us      22.305us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.86%      17.251us         1.14%      22.681us       2.520us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.45%       9.051us         0.45%       9.051us       0.603us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.48%       9.579us         0.48%       9.579us       3.193us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.50%      10.050us         0.50%      10.050us       3.350us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.30%       6.019us         0.36%       7.270us       2.423us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     344.351us      1401.22%     344.351us     344.351us             1  
+                                            torch_eager         7.42%     151.244us        99.76%       2.034ms       2.034ms       0.000us         0.00%      26.847us      26.847us             1  
+                                               aten::to         0.33%       6.730us        85.23%       1.738ms     289.583us       0.000us         0.00%      15.264us       2.544us             6  
+                                         aten::_to_copy         1.15%      23.491us        84.90%       1.731ms     288.462us       0.000us         0.00%      15.264us       2.544us             6  
+                                            aten::copy_         2.84%      57.871us        82.24%       1.677ms     279.428us      12.992us        52.87%      15.264us       2.544us             6  
+                                           aten::conv1d         0.31%       6.410us         5.76%     117.443us      39.148us       0.000us         0.00%      11.583us       3.861us             3  
+                                      aten::convolution         0.49%      10.031us         5.45%     111.033us      37.011us       0.000us         0.00%      11.583us       3.861us             3  
+                                     aten::_convolution         1.08%      22.081us         4.95%     101.002us      33.667us       0.000us         0.00%      11.583us       3.861us             3  
+                                aten::_conv_depthwise2d         1.04%      21.239us         3.10%      63.201us      21.067us      11.583us        47.13%      11.583us       3.861us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      11.583us        47.13%      11.583us       3.861us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.656us        27.08%       6.656us       2.219us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.336us        25.78%       6.336us       2.112us             3  
+                                Activity Buffer Request        70.08%       1.429ms        70.08%       1.429ms       1.429ms       2.272us         9.25%       2.272us       2.272us             1  
+                                    aten::empty_strided         1.51%      30.710us         1.51%      30.710us       5.118us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        10.42%     212.467us        10.42%     212.467us      23.607us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.89%      18.130us         1.15%      23.350us       2.594us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.43%       8.790us         0.43%       8.790us       0.586us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.48%       9.800us         0.48%       9.800us       3.267us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.47%       9.640us         0.47%       9.640us       3.213us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.28%       5.760us         0.35%       7.050us       2.350us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.995ms
-Self CUDA time total: 24.607us
+Self CPU time total: 2.039ms
+Self CUDA time total: 24.575us
 
 
 
@@ -4142,29 +4150,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     358.812us      1379.20%     358.812us     358.812us             1  
-                                            torch_eager         6.94%     139.423us        99.75%       2.005ms       2.005ms       0.000us         0.00%      28.256us      28.256us             1  
-                                               aten::to         0.33%       6.550us        85.45%       1.717ms     286.205us       0.000us         0.00%      15.199us       2.533us             6  
-                                         aten::_to_copy         1.20%      24.182us        85.13%       1.711ms     285.114us       0.000us         0.00%      15.199us       2.533us             6  
-                                            aten::copy_         2.59%      52.130us        82.30%       1.654ms     275.648us      12.959us        49.81%      15.199us       2.533us             6  
-                                           aten::conv1d         0.30%       6.120us         5.97%     119.993us      39.998us       0.000us         0.00%      13.057us       4.352us             3  
-                                      aten::convolution         0.48%       9.660us         5.67%     113.873us      37.958us       0.000us         0.00%      13.057us       4.352us             3  
-                                     aten::_convolution         1.13%      22.802us         5.19%     104.213us      34.738us       0.000us         0.00%      13.057us       4.352us             3  
-                                aten::_conv_depthwise2d         1.09%      21.932us         3.25%      65.242us      21.747us      13.057us        50.19%      13.057us       4.352us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      13.057us        50.19%      13.057us       4.352us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.623us        25.46%       6.623us       2.208us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.336us        24.35%       6.336us       2.112us             3  
-                                Activity Buffer Request        70.68%       1.420ms        70.68%       1.420ms       1.420ms       2.240us         8.61%       2.240us       2.240us             1  
-                                    aten::empty_strided         1.62%      32.611us         1.62%      32.611us       5.435us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        10.17%     204.364us        10.17%     204.364us      22.707us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.88%      17.647us         1.15%      23.189us       2.577us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.47%       9.382us         0.47%       9.382us       0.625us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.58%      11.651us         0.58%      11.651us       3.884us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.44%       8.769us         0.44%       8.769us       2.923us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.32%       6.420us         0.39%       7.890us       2.630us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     332.990us      1278.37%     332.990us     332.990us             1  
+                                            torch_eager         7.09%     142.971us        99.76%       2.011ms       2.011ms       0.000us         0.00%      28.288us      28.288us             1  
+                                               aten::to         0.35%       7.062us        85.44%       1.723ms     287.120us       0.000us         0.00%      15.232us       2.539us             6  
+                                         aten::_to_copy         1.18%      23.771us        85.09%       1.716ms     285.943us       0.000us         0.00%      15.232us       2.539us             6  
+                                            aten::copy_         2.51%      50.519us        82.47%       1.663ms     277.136us      12.992us        49.88%      15.232us       2.539us             6  
+                                           aten::conv1d         0.32%       6.541us         5.84%     117.833us      39.278us       0.000us         0.00%      13.056us       4.352us             3  
+                                      aten::convolution         0.52%      10.410us         5.52%     111.292us      37.097us       0.000us         0.00%      13.056us       4.352us             3  
+                                     aten::_convolution         1.19%      24.049us         5.00%     100.882us      33.627us       0.000us         0.00%      13.056us       4.352us             3  
+                                aten::_conv_depthwise2d         1.01%      20.460us         2.98%      60.052us      20.017us      13.056us        50.12%      13.056us       4.352us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      13.056us        50.12%      13.056us       4.352us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.624us        25.43%       6.624us       2.208us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.368us        24.45%       6.368us       2.123us             3  
+                                Activity Buffer Request        70.71%       1.426ms        70.71%       1.426ms       1.426ms       2.240us         8.60%       2.240us       2.240us             1  
+                                    aten::empty_strided         1.44%      29.071us         1.44%      29.071us       4.845us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        10.31%     207.805us        10.31%     207.805us      23.089us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.90%      18.081us         1.15%      23.201us       2.578us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.43%       8.650us         0.43%       8.650us       0.577us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.49%       9.891us         0.49%       9.891us       3.297us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.42%       8.561us         0.42%       8.561us       2.854us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.31%       6.350us         0.38%       7.610us       2.537us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.010ms
-Self CUDA time total: 26.016us
+Self CPU time total: 2.016ms
+Self CUDA time total: 26.048us
 
 
 
@@ -4174,29 +4182,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     328.896us       853.65%     328.896us     328.896us             1  
-                                            torch_eager         6.29%     121.493us        99.73%       1.928ms       1.928ms       0.000us         0.00%      41.088us      41.088us             1  
-                                           aten::conv1d         0.31%       5.961us         6.00%     115.903us      38.634us       0.000us         0.00%      22.688us       7.563us             3  
-                                      aten::convolution         0.50%       9.600us         5.69%     109.942us      36.647us       0.000us         0.00%      22.688us       7.563us             3  
-                                     aten::_convolution         1.16%      22.510us         5.19%     100.342us      33.447us       0.000us         0.00%      22.688us       7.563us             3  
-                                aten::_conv_depthwise2d         1.17%      22.551us         3.25%      62.881us      20.960us      22.688us        58.89%      22.688us       7.563us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      22.688us        58.89%      22.688us       7.563us             3  
-                                               aten::to         0.33%       6.421us        86.08%       1.664ms     277.308us       0.000us         0.00%      18.400us       3.067us             6  
-                                         aten::_to_copy         1.25%      24.161us        85.75%       1.657ms     276.238us       0.000us         0.00%      18.400us       3.067us             6  
-                                            aten::copy_         2.57%      49.759us        82.93%       1.603ms     267.166us      15.840us        41.11%      18.400us       3.067us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.448us        21.93%       8.448us       2.816us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.392us        19.19%       7.392us       2.464us             3  
-                                Activity Buffer Request        71.07%       1.374ms        71.07%       1.374ms       1.374ms       2.560us         6.64%       2.560us       2.560us             1  
-                                    aten::empty_strided         1.57%      30.271us         1.57%      30.271us       5.045us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        10.43%     201.525us        10.43%     201.525us      22.392us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.86%      16.701us         1.14%      22.001us       2.445us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.45%       8.751us         0.45%       8.751us       0.583us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.48%       9.290us         0.48%       9.290us       3.097us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.47%       9.060us         0.47%       9.060us       3.020us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.28%       5.459us         0.35%       6.690us       2.230us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     332.220us       868.07%     332.220us     332.220us             1  
+                                            torch_eager         7.10%     144.065us        99.76%       2.024ms       2.024ms       0.000us         0.00%      40.831us      40.831us             1  
+                                           aten::conv1d         0.30%       6.030us         5.72%     116.102us      38.701us       0.000us         0.00%      22.464us       7.488us             3  
+                                      aten::convolution         0.49%       9.861us         5.42%     110.072us      36.691us       0.000us         0.00%      22.464us       7.488us             3  
+                                     aten::_convolution         1.11%      22.459us         4.94%     100.211us      33.404us       0.000us         0.00%      22.464us       7.488us             3  
+                                aten::_conv_depthwise2d         1.00%      20.252us         3.07%      62.362us      20.787us      22.464us        58.70%      22.464us       7.488us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      22.464us        58.70%      22.464us       7.488us             3  
+                                               aten::to         0.31%       6.271us        85.57%       1.737ms     289.428us       0.000us         0.00%      18.367us       3.061us             6  
+                                         aten::_to_copy         1.14%      23.180us        85.26%       1.730ms     288.383us       0.000us         0.00%      18.367us       3.061us             6  
+                                            aten::copy_         2.42%      49.061us        82.56%       1.675ms     279.226us      15.807us        41.30%      18.367us       3.061us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.448us        22.07%       8.448us       2.816us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.359us        19.23%       7.359us       2.453us             3  
+                                Activity Buffer Request        70.88%       1.438ms        70.88%       1.438ms       1.438ms       2.560us         6.69%       2.560us       2.560us             1  
+                                    aten::empty_strided         1.57%      31.760us         1.57%      31.760us       5.293us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        10.30%     209.084us        10.30%     209.084us      23.232us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.88%      17.889us         1.13%      22.980us       2.553us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.43%       8.691us         0.43%       8.691us       0.579us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.58%      11.680us         0.58%      11.680us       3.893us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.45%       9.220us         0.45%       9.220us       3.073us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.29%       5.850us         0.36%       7.240us       2.413us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.933ms
-Self CUDA time total: 38.528us
+Self CPU time total: 2.029ms
+Self CUDA time total: 38.271us
 
 
 
@@ -4206,29 +4214,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     334.458us       810.83%     334.458us     334.458us             1  
-                                            torch_eager         6.32%     125.394us        99.75%       1.978ms       1.978ms       0.000us         0.00%      43.841us      43.841us             1  
-                                           aten::conv1d         0.30%       5.899us         5.88%     116.562us      38.854us       0.000us         0.00%      25.600us       8.533us             3  
-                                      aten::convolution         0.49%       9.810us         5.58%     110.663us      36.888us       0.000us         0.00%      25.600us       8.533us             3  
-                                     aten::_convolution         1.13%      22.411us         5.09%     100.853us      33.618us       0.000us         0.00%      25.600us       8.533us             3  
-                                aten::_conv_depthwise2d         1.14%      22.520us         3.20%      63.392us      21.131us      25.600us        62.06%      25.600us       8.533us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      25.600us        62.06%      25.600us       8.533us             3  
-                                               aten::to         0.30%       5.959us        86.14%       1.708ms     284.675us       0.000us         0.00%      18.241us       3.040us             6  
-                                         aten::_to_copy         1.33%      26.372us        85.84%       1.702ms     283.682us       0.000us         0.00%      18.241us       3.040us             6  
-                                            aten::copy_         2.49%      49.420us        83.02%       1.646ms     274.363us      15.649us        37.94%      18.241us       3.040us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.321us        20.17%       8.321us       2.774us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.328us        17.77%       7.328us       2.443us             3  
-                                Activity Buffer Request        71.51%       1.418ms        71.51%       1.418ms       1.418ms       2.592us         6.28%       2.592us       2.592us             1  
-                                    aten::empty_strided         1.49%      29.540us         1.49%      29.540us       4.923us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        10.06%     199.427us        10.06%     199.427us      22.159us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.92%      18.199us         1.18%      23.330us       2.592us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.44%       8.651us         0.44%       8.651us       0.577us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.54%      10.640us         0.54%      10.640us       3.547us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.48%       9.610us         0.48%       9.610us       3.203us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.28%       5.590us         0.34%       6.770us       2.257us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     347.901us       847.38%     347.901us     347.901us             1  
+                                            torch_eager         7.21%     147.111us        99.76%       2.035ms       2.035ms       0.000us         0.00%      43.616us      43.616us             1  
+                                           aten::conv1d         0.33%       6.680us         5.94%     121.133us      40.378us       0.000us         0.00%      25.376us       8.459us             3  
+                                      aten::convolution         0.49%      10.011us         5.61%     114.453us      38.151us       0.000us         0.00%      25.376us       8.459us             3  
+                                     aten::_convolution         1.21%      24.739us         5.12%     104.442us      34.814us       0.000us         0.00%      25.376us       8.459us             3  
+                                aten::_conv_depthwise2d         1.05%      21.431us         3.09%      62.981us      20.994us      25.376us        61.81%      25.376us       8.459us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      25.376us        61.81%      25.376us       8.459us             3  
+                                               aten::to         0.35%       7.210us        85.28%       1.740ms     289.922us       0.000us         0.00%      18.240us       3.040us             6  
+                                         aten::_to_copy         1.21%      24.639us        84.93%       1.732ms     288.720us       0.000us         0.00%      18.240us       3.040us             6  
+                                            aten::copy_         2.56%      52.303us        82.23%       1.677ms     279.542us      15.680us        38.19%      18.240us       3.040us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.288us        20.19%       8.288us       2.763us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.392us        18.00%       7.392us       2.464us             3  
+                                Activity Buffer Request        70.32%       1.434ms        70.32%       1.434ms       1.434ms       2.560us         6.24%       2.560us       2.560us             1  
+                                    aten::empty_strided         1.49%      30.432us         1.49%      30.432us       5.072us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        10.49%     213.884us        10.49%     213.884us      23.765us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.88%      17.871us         1.14%      23.242us       2.582us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.44%       8.942us         0.44%       8.942us       0.596us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.48%       9.840us         0.48%       9.840us       3.280us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.42%       8.540us         0.42%       8.540us       2.847us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.31%       6.230us         0.37%       7.540us       2.513us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.983ms
-Self CUDA time total: 41.249us
+Self CPU time total: 2.040ms
+Self CUDA time total: 41.056us
 
 
 
@@ -4238,29 +4246,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     338.849us       326.92%     338.849us     338.849us             1  
-                                            torch_eager         5.95%     117.585us        99.74%       1.970ms       1.970ms       0.000us         0.00%     109.697us     109.697us             1  
-                                           aten::conv1d         0.30%       5.970us         6.05%     119.502us      39.834us       0.000us         0.00%      71.232us      23.744us             3  
-                                      aten::convolution         0.49%       9.700us         5.75%     113.532us      37.844us       0.000us         0.00%      71.232us      23.744us             3  
-                                     aten::_convolution         1.15%      22.781us         5.26%     103.832us      34.611us       0.000us         0.00%      71.232us      23.744us             3  
-                                aten::_conv_depthwise2d         1.18%      23.259us         3.31%      65.420us      21.807us      71.232us        68.72%      71.232us      23.744us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      71.232us        68.72%      71.232us      23.744us             3  
-                                               aten::to         0.31%       6.199us        86.38%       1.706ms     284.313us       0.000us         0.00%      38.465us       6.411us             6  
-                                         aten::_to_copy         1.31%      25.891us        86.06%       1.700ms     283.280us       0.000us         0.00%      38.465us       6.411us             6  
-                                            aten::copy_         2.57%      50.812us        83.17%       1.643ms     273.758us      32.417us        31.28%      38.465us       6.411us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      17.760us        17.13%      17.760us       5.920us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      14.657us        14.14%      14.657us       4.886us             3  
-                                Activity Buffer Request        71.61%       1.414ms        71.61%       1.414ms       1.414ms       6.048us         5.84%       6.048us       6.048us             1  
-                                    aten::empty_strided         1.58%      31.240us         1.58%      31.240us       5.207us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        10.13%     200.155us        10.13%     200.155us      22.239us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.87%      17.181us         1.15%      22.621us       2.513us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.45%       8.941us         0.45%       8.941us       0.596us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.51%      10.050us         0.51%      10.050us       3.350us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.47%       9.370us         0.47%       9.370us       3.123us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.28%       5.551us         0.35%       6.851us       2.284us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     334.331us       325.39%     334.331us     334.331us             1  
+                                            torch_eager         7.01%     141.713us        99.76%       2.018ms       2.018ms       0.000us         0.00%     108.764us     108.764us             1  
+                                           aten::conv1d         0.30%       6.090us         5.77%     116.623us      38.874us       0.000us         0.00%      70.528us      23.509us             3  
+                                      aten::convolution         0.56%      11.281us         5.46%     110.533us      36.844us       0.000us         0.00%      70.528us      23.509us             3  
+                                     aten::_convolution         1.11%      22.501us         4.91%      99.252us      33.084us       0.000us         0.00%      70.528us      23.509us             3  
+                                aten::_conv_depthwise2d         1.02%      20.538us         3.03%      61.301us      20.434us      70.528us        68.64%      70.528us      23.509us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      70.528us        68.64%      70.528us      23.509us             3  
+                                               aten::to         0.31%       6.229us        85.56%       1.731ms     288.457us       0.000us         0.00%      38.236us       6.373us             6  
+                                         aten::_to_copy         1.17%      23.650us        85.25%       1.725ms     287.419us       0.000us         0.00%      38.236us       6.373us             6  
+                                            aten::copy_         2.48%      50.230us        82.63%       1.672ms     278.605us      32.221us        31.36%      38.236us       6.373us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      17.598us        17.13%      17.598us       5.866us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      14.623us        14.23%      14.623us       4.874us             3  
+                                Activity Buffer Request        70.91%       1.435ms        70.91%       1.435ms       1.435ms       6.015us         5.85%       6.015us       6.015us             1  
+                                    aten::empty_strided         1.45%      29.232us         1.45%      29.232us       4.872us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        10.36%     209.517us        10.36%     209.517us      23.280us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.88%      17.770us         1.13%      22.940us       2.549us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.42%       8.560us         0.42%       8.560us       0.571us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.48%       9.771us         0.48%       9.771us       3.257us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.41%       8.351us         0.41%       8.351us       2.784us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.28%       5.590us         0.33%       6.770us       2.257us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.975ms
-Self CUDA time total: 103.649us
+Self CPU time total: 2.023ms
+Self CUDA time total: 102.749us
 
 
 
@@ -4270,29 +4278,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     357.597us       314.53%     357.597us     357.597us             1  
-                                            torch_eager         6.01%     120.196us        99.73%       1.995ms       1.995ms       0.000us         0.00%     119.645us     119.645us             1  
-                                           aten::conv1d         0.28%       5.578us         6.85%     137.112us      45.704us       0.000us         0.00%      81.344us      27.115us             3  
-                                      aten::convolution         0.47%       9.452us         6.58%     131.534us      43.845us       0.000us         0.00%      81.344us      27.115us             3  
-                                     aten::_convolution         1.16%      23.298us         6.10%     122.082us      40.694us       0.000us         0.00%      81.344us      27.115us             3  
-                                aten::_conv_depthwise2d         1.16%      23.221us         4.15%      82.932us      27.644us      81.344us        71.55%      81.344us      27.115us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      81.344us        71.55%      81.344us      27.115us             3  
-                                               aten::to         0.33%       6.509us        85.46%       1.710ms     284.935us       0.000us         0.00%      38.301us       6.383us             6  
-                                         aten::_to_copy         1.29%      25.870us        85.14%       1.703ms     283.850us       0.000us         0.00%      38.301us       6.383us             6  
-                                            aten::copy_         2.58%      51.531us        82.27%       1.646ms     274.308us      32.350us        28.45%      38.301us       6.383us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      17.727us        15.59%      17.727us       5.909us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      14.623us        12.86%      14.623us       4.874us             3  
-                                Activity Buffer Request        70.95%       1.419ms        70.95%       1.419ms       1.419ms       5.951us         5.23%       5.951us       5.951us             1  
-                                    aten::empty_strided         1.57%      31.380us         1.57%      31.380us       5.230us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         9.95%     199.044us         9.95%     199.044us      22.116us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.89%      17.740us         1.16%      23.191us       2.577us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.47%       9.433us         0.47%       9.433us       0.629us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.53%      10.531us         0.53%      10.531us       3.510us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.26%      25.130us         1.26%      25.130us       8.377us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.30%       6.010us         0.38%       7.612us       2.537us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     330.969us       293.42%     330.969us     330.969us             1  
+                                            torch_eager        14.99%     119.634us        99.39%     793.059us     793.059us       0.000us         0.00%     118.814us     118.814us             1  
+                                           aten::conv1d         0.68%       5.459us        14.66%     116.982us      38.994us       0.000us         0.00%      80.510us      26.837us             3  
+                                      aten::convolution         1.26%      10.041us        13.98%     111.523us      37.174us       0.000us         0.00%      80.510us      26.837us             3  
+                                     aten::_convolution         2.84%      22.661us        12.72%     101.482us      33.827us       0.000us         0.00%      80.510us      26.837us             3  
+                                aten::_conv_depthwise2d         2.60%      20.719us         7.95%      63.401us      21.134us      80.510us        71.38%      80.510us      26.837us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      80.510us        71.38%      80.510us      26.837us             3  
+                                               aten::to         0.74%       5.920us        66.51%     530.742us      88.457us       0.000us         0.00%      38.304us       6.384us             6  
+                                         aten::_to_copy         2.94%      23.422us        65.77%     524.822us      87.470us       0.000us         0.00%      38.304us       6.384us             6  
+                                            aten::copy_         6.43%      51.340us        58.99%     470.681us      78.447us      32.288us        28.62%      38.304us       6.384us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      17.696us        15.69%      17.696us       5.899us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      14.592us        12.94%      14.592us       4.864us             3  
+                                Activity Buffer Request        29.02%     231.576us        29.02%     231.576us     231.576us       6.016us         5.33%       6.016us       6.016us             1  
+                                    aten::empty_strided         3.85%      30.719us         3.85%      30.719us       5.120us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        26.56%     211.935us        26.56%     211.935us      23.548us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.12%      16.940us         2.72%      21.720us       2.413us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.02%       8.121us         1.02%       8.121us       0.541us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.20%       9.582us         1.20%       9.582us       3.194us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.12%       8.930us         1.12%       8.930us       2.977us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.72%       5.780us         0.87%       6.970us       2.323us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.000ms
-Self CUDA time total: 113.694us
+Self CPU time total: 797.960us
+Self CUDA time total: 112.798us
 
 
 
@@ -4302,29 +4310,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         5.97%     120.782us        97.66%       1.975ms       1.975ms       0.000us         0.00%     434.301us     434.301us             1  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     421.021us       106.85%     421.021us     421.021us             1  
-                                           aten::conv1d         0.30%       6.069us         5.79%     117.202us      39.067us       0.000us         0.00%     251.007us      83.669us             3  
-                                      aten::convolution         0.47%       9.471us         5.49%     111.133us      37.044us       0.000us         0.00%     251.007us      83.669us             3  
-                                     aten::_convolution         1.10%      22.180us         5.03%     101.662us      33.887us       0.000us         0.00%     251.007us      83.669us             3  
-                                aten::_conv_depthwise2d         1.13%      22.779us         3.17%      64.182us      21.394us     251.007us        63.71%     251.007us      83.669us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     251.007us        63.71%     251.007us      83.669us             3  
-                                               aten::to         0.31%       6.200us        84.52%       1.710ms     284.917us       0.000us         0.00%     183.294us      30.549us             6  
-                                         aten::_to_copy         1.19%      24.072us        84.22%       1.703ms     283.884us       0.000us         0.00%     183.294us      30.549us             6  
-                                            aten::copy_         2.45%      49.593us        81.56%       1.650ms     274.942us     143.007us        36.29%     183.294us      30.549us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     102.495us        26.01%     102.495us      34.165us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.512us        10.28%      40.512us      13.504us             3  
-                                Activity Buffer Request        70.36%       1.423ms        70.36%       1.423ms       1.423ms      40.287us        10.22%      40.287us      40.287us             1  
-                                    aten::empty_strided         1.46%      29.579us         1.46%      29.579us       4.930us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         9.86%     199.474us         9.86%     199.474us      22.164us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.84%      17.021us         1.11%      22.432us       2.492us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.45%       9.090us         0.45%       9.090us       0.606us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.48%       9.720us         0.48%       9.720us       3.240us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.45%       9.202us         0.45%       9.202us       3.067us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.28%       5.680us         0.35%       7.060us       2.353us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager        13.79%     117.896us        93.85%     802.069us     802.069us       0.000us         0.00%     432.858us     432.858us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     419.356us       106.55%     419.356us     419.356us             1  
+                                           aten::conv1d         0.66%       5.648us        13.24%     113.161us      37.720us       0.000us         0.00%     251.262us      83.754us             3  
+                                      aten::convolution         1.11%       9.481us        12.58%     107.513us      35.838us       0.000us         0.00%     251.262us      83.754us             3  
+                                     aten::_convolution         2.53%      21.627us        11.47%      98.032us      32.677us       0.000us         0.00%     251.262us      83.754us             3  
+                                aten::_conv_depthwise2d         2.35%      20.121us         7.14%      61.002us      20.334us     251.262us        63.84%     251.262us      83.754us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     251.262us        63.84%     251.262us      83.754us             3  
+                                               aten::to         0.66%       5.670us        63.66%     544.101us      90.683us       0.000us         0.00%     181.596us      30.266us             6  
+                                         aten::_to_copy         2.68%      22.880us        63.00%     538.431us      89.739us       0.000us         0.00%     181.596us      30.266us             6  
+                                            aten::copy_         6.04%      51.591us        56.88%     486.161us      81.027us     142.333us        36.16%     181.596us      30.266us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     102.271us        25.98%     102.271us      34.090us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.062us        10.18%      40.062us      13.354us             3  
+                                Activity Buffer Request        28.73%     245.556us        28.73%     245.556us     245.556us      39.263us         9.98%      39.263us      39.263us             1  
+                                    aten::empty_strided         3.44%      29.390us         3.44%      29.390us       4.898us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        24.77%     211.714us        24.77%     211.714us      23.524us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         1.99%      17.042us         2.56%      21.904us       2.434us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.01%       8.621us         1.01%       8.621us       0.575us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.11%       9.471us         1.11%       9.471us       3.157us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.02%       8.710us         1.02%       8.710us       2.903us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.71%       6.031us         0.84%       7.190us       2.397us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.023ms
-Self CUDA time total: 394.014us
+Self CPU time total: 854.650us
+Self CUDA time total: 393.595us
 
 
 
@@ -4334,29 +4342,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         5.89%     122.072us        95.29%       1.975ms       1.975ms       0.000us         0.00%     486.458us     486.458us             1  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     474.010us       106.16%     474.010us     474.010us             1  
-                                           aten::conv1d         0.28%       5.830us         5.59%     115.853us      38.618us       0.000us         0.00%     299.291us      99.764us             3  
-                                      aten::convolution         0.46%       9.610us         5.31%     110.023us      36.674us       0.000us         0.00%     299.291us      99.764us             3  
-                                     aten::_convolution         1.08%      22.439us         4.85%     100.413us      33.471us       0.000us         0.00%     299.291us      99.764us             3  
-                                aten::_conv_depthwise2d         1.04%      21.490us         3.04%      62.983us      20.994us     299.291us        67.03%     299.291us      99.764us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     299.291us        67.03%     299.291us      99.764us             3  
-                                               aten::to         0.31%       6.341us        82.51%       1.710ms     284.962us       0.000us         0.00%     187.167us      31.195us             6  
-                                         aten::_to_copy         1.23%      25.592us        82.20%       1.703ms     283.906us       0.000us         0.00%     187.167us      31.195us             6  
-                                            aten::copy_         2.39%      49.481us        79.48%       1.647ms     274.512us     147.199us        32.97%     187.167us      31.195us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     106.911us        23.94%     106.911us      35.637us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.288us         9.02%      40.288us      13.429us             3  
-                                Activity Buffer Request        68.62%       1.422ms        68.62%       1.422ms       1.422ms      39.968us         8.95%      39.968us      39.968us             1  
-                                    aten::empty_strided         1.48%      30.770us         1.48%      30.770us       5.128us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         9.53%     197.485us         9.53%     197.485us      21.943us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.81%      16.791us         1.08%      22.301us       2.478us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.44%       9.141us         0.44%       9.141us       0.609us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.47%       9.701us         0.47%       9.701us       3.234us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.48%       9.941us         0.48%       9.941us       3.314us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.27%       5.510us         0.33%       6.790us       2.263us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager        12.73%     119.312us        88.90%     833.220us     833.220us       0.000us         0.00%     487.606us     487.606us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     476.503us       106.43%     476.503us     476.503us             1  
+                                           aten::conv1d         0.59%       5.550us        12.38%     116.073us      38.691us       0.000us         0.00%     298.682us      99.561us             3  
+                                      aten::convolution         1.01%       9.430us        11.79%     110.523us      36.841us       0.000us         0.00%     298.682us      99.561us             3  
+                                     aten::_convolution         2.32%      21.781us        10.79%     101.093us      33.698us       0.000us         0.00%     298.682us      99.561us             3  
+                                aten::_conv_depthwise2d         2.19%      20.491us         6.88%      64.493us      21.498us     298.682us        66.71%     298.682us      99.561us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     298.682us        66.71%     298.682us      99.561us             3  
+                                               aten::to         0.60%       5.580us        60.86%     570.404us      95.067us       0.000us         0.00%     188.924us      31.487us             6  
+                                         aten::_to_copy         2.42%      22.662us        60.26%     564.824us      94.137us       0.000us         0.00%     188.924us      31.487us             6  
+                                            aten::copy_         5.33%      49.982us        54.62%     511.981us      85.330us     149.053us        33.29%     188.924us      31.487us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     108.926us        24.33%     108.926us      36.309us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.127us         8.96%      40.127us      13.376us             3  
+                                Activity Buffer Request        29.44%     275.977us        29.44%     275.977us     275.977us      39.871us         8.91%      39.871us      39.871us             1  
+                                    aten::empty_strided         3.22%      30.181us         3.22%      30.181us       5.030us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        22.44%     210.343us        22.44%     210.343us      23.371us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         1.80%      16.910us         2.35%      22.009us       2.445us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.91%       8.519us         0.91%       8.519us       0.568us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.16%      10.891us         1.16%      10.891us       3.630us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.94%       8.790us         0.94%       8.790us       2.930us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.59%       5.500us         0.71%       6.690us       2.230us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.072ms
-Self CUDA time total: 446.490us
+Self CPU time total: 937.282us
+Self CUDA time total: 447.735us
 
 
 
@@ -4366,29 +4374,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     358.523us      1924.96%     358.523us     358.523us             1  
-                                            torch_eager        17.94%     139.773us        99.33%     774.049us     774.049us       0.000us         0.00%      20.513us      20.513us             1  
-                                               aten::to         0.94%       7.351us        62.88%     489.983us      81.664us       0.000us         0.00%      13.376us       2.229us             6  
-                                         aten::_to_copy         3.20%      24.930us        61.93%     482.632us      80.439us       0.000us         0.00%      13.376us       2.229us             6  
-                                            aten::copy_         6.90%      53.742us        54.52%     424.881us      70.813us      11.488us        61.68%      13.376us       2.229us             6  
-                                           aten::conv1d         0.75%       5.841us        15.01%     116.973us      38.991us       0.000us         0.00%       7.137us       2.379us             3  
-                                      aten::convolution         1.33%      10.360us        14.26%     111.132us      37.044us       0.000us         0.00%       7.137us       2.379us             3  
-                                     aten::_convolution         3.01%      23.430us        12.93%     100.772us      33.591us       0.000us         0.00%       7.137us       2.379us             3  
-                                aten::_conv_depthwise2d         2.81%      21.882us         7.98%      62.192us      20.731us       7.137us        38.32%       7.137us       2.379us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.137us        38.32%       7.137us       2.379us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       5.888us        31.61%       5.888us       1.963us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.600us        30.07%       5.600us       1.867us             3  
-                                Activity Buffer Request        24.98%     194.695us        24.98%     194.695us     194.695us       1.888us        10.14%       1.888us       1.888us             1  
-                                    aten::empty_strided         4.21%      32.821us         4.21%      32.821us       5.470us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        25.28%     197.004us        25.28%     197.004us      21.889us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.16%      16.850us         2.84%      22.160us       2.462us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.13%       8.821us         1.13%       8.821us       0.588us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.22%       9.521us         1.22%       9.521us       3.174us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.31%      10.229us         1.31%      10.229us       3.410us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.74%       5.740us         0.90%       7.020us       2.340us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     323.326us      1733.09%     323.326us     323.326us             1  
+                                            torch_eager        14.06%     116.944us        99.42%     826.859us     826.859us       0.000us         0.00%      20.544us      20.544us             1  
+                                               aten::to         0.72%       5.971us        68.57%     570.283us      95.047us       0.000us         0.00%      13.344us       2.224us             6  
+                                         aten::_to_copy         2.68%      22.330us        67.85%     564.312us      94.052us       0.000us         0.00%      13.344us       2.224us             6  
+                                            aten::copy_         6.25%      51.969us        61.73%     513.371us      85.562us      11.456us        61.41%      13.344us       2.224us             6  
+                                           aten::conv1d         0.66%       5.530us        13.54%     112.622us      37.541us       0.000us         0.00%       7.200us       2.400us             3  
+                                      aten::convolution         1.25%      10.420us        12.88%     107.092us      35.697us       0.000us         0.00%       7.200us       2.400us             3  
+                                     aten::_convolution         2.52%      20.950us        11.62%      96.672us      32.224us       0.000us         0.00%       7.200us       2.400us             3  
+                                aten::_conv_depthwise2d         2.43%      20.241us         7.30%      60.692us      20.231us       7.200us        38.59%       7.200us       2.400us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.200us        38.59%       7.200us       2.400us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       5.888us        31.56%       5.888us       1.963us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.568us        29.85%       5.568us       1.856us             3  
+                                Activity Buffer Request        31.20%     259.516us        31.20%     259.516us     259.516us       1.888us        10.12%       1.888us       1.888us             1  
+                                    aten::empty_strided         3.44%      28.611us         3.44%      28.611us       4.768us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        26.78%     222.677us        26.78%     222.677us      24.742us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.11%      17.509us         2.72%      22.620us       2.513us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.03%       8.541us         1.03%       8.541us       0.569us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.40%      11.610us         1.40%      11.610us       3.870us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.97%       8.050us         0.97%       8.050us       2.683us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.68%       5.660us         0.82%       6.830us       2.277us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 779.258us
-Self CUDA time total: 18.625us
+Self CPU time total: 831.660us
+Self CUDA time total: 18.656us
 
 
 
@@ -4398,29 +4406,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     328.763us      1698.07%     328.763us     328.763us             1  
-                                            torch_eager        14.65%     115.015us        99.34%     779.670us     779.670us       0.000us         0.00%      21.248us      21.248us             1  
-                                               aten::to         0.80%       6.290us        66.21%     519.631us      86.605us       0.000us         0.00%      13.406us       2.234us             6  
-                                         aten::_to_copy         3.14%      24.649us        65.41%     513.341us      85.557us       0.000us         0.00%      13.406us       2.234us             6  
-                                            aten::copy_         6.80%      53.351us        58.20%     456.761us      76.127us      11.519us        59.50%      13.406us       2.234us             6  
-                                           aten::conv1d         0.75%       5.880us        15.10%     118.484us      39.495us       0.000us         0.00%       7.842us       2.614us             3  
-                                      aten::convolution         1.21%       9.513us        14.35%     112.604us      37.535us       0.000us         0.00%       7.842us       2.614us             3  
-                                     aten::_convolution         2.83%      22.229us        13.14%     103.091us      34.364us       0.000us         0.00%       7.842us       2.614us             3  
-                                aten::_conv_depthwise2d         3.15%      24.720us         8.43%      66.141us      22.047us       7.842us        40.50%       7.842us       2.614us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.842us        40.50%       7.842us       2.614us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       5.887us        30.41%       5.887us       1.962us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.632us        29.09%       5.632us       1.877us             3  
-                                Activity Buffer Request        29.55%     231.946us        29.55%     231.946us     231.946us       1.887us         9.75%       1.887us       1.887us             1  
-                                    aten::empty_strided         4.07%      31.931us         4.07%      31.931us       5.322us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        24.68%     193.684us        24.68%     193.684us      21.520us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.11%      16.541us         2.75%      21.581us       2.398us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.09%       8.568us         1.09%       8.568us       0.571us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.27%       9.951us         1.27%       9.951us       3.317us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.18%       9.250us         1.18%       9.250us       3.083us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.72%       5.642us         0.89%       6.980us       2.327us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     351.835us      1817.33%     351.835us     351.835us             1  
+                                            torch_eager        14.08%     121.071us        99.43%     854.999us     854.999us       0.000us         0.00%      21.248us      21.248us             1  
+                                               aten::to         0.71%       6.141us        68.62%     590.084us      98.347us       0.000us         0.00%      13.312us       2.219us             6  
+                                         aten::_to_copy         2.73%      23.503us        67.91%     583.943us      97.324us       0.000us         0.00%      13.312us       2.219us             6  
+                                            aten::copy_         6.25%      53.711us        59.45%     511.250us      85.208us      11.424us        59.01%      13.312us       2.219us             6  
+                                           aten::conv1d         0.65%       5.630us        13.53%     116.322us      38.774us       0.000us         0.00%       7.936us       2.645us             3  
+                                      aten::convolution         1.12%       9.630us        12.87%     110.692us      36.897us       0.000us         0.00%       7.936us       2.645us             3  
+                                     aten::_convolution         2.70%      23.181us        11.75%     101.062us      33.687us       0.000us         0.00%       7.936us       2.645us             3  
+                                aten::_conv_depthwise2d         2.42%      20.779us         7.31%      62.821us      20.940us       7.936us        40.99%       7.936us       2.645us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.936us        40.99%       7.936us       2.645us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       5.856us        30.25%       5.856us       1.952us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.568us        28.76%       5.568us       1.856us             3  
+                                Activity Buffer Request        31.74%     272.946us        31.74%     272.946us     272.946us       1.888us         9.75%       1.888us       1.888us             1  
+                                    aten::empty_strided         5.72%      49.190us         5.72%      49.190us       8.198us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        24.15%     207.684us        24.15%     207.684us      23.076us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.01%      17.302us         2.65%      22.752us       2.528us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.04%       8.971us         1.04%       8.971us       0.598us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.16%       9.970us         1.16%       9.970us       3.323us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.04%       8.981us         1.04%       8.981us       2.994us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.67%       5.729us         0.81%       6.930us       2.310us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 784.850us
-Self CUDA time total: 19.361us
+Self CPU time total: 859.928us
+Self CUDA time total: 19.360us
 
 
 
@@ -4430,29 +4438,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     330.454us      1698.73%     330.454us     330.454us             1  
-                                            torch_eager        14.50%     115.185us        99.38%     789.290us     789.290us       0.000us         0.00%      21.628us      21.628us             1  
-                                               aten::to         0.75%       5.979us        66.62%     529.132us      88.189us       0.000us         0.00%      14.332us       2.389us             6  
-                                         aten::_to_copy         3.11%      24.732us        65.87%     523.153us      87.192us       0.000us         0.00%      14.332us       2.389us             6  
-                                            aten::copy_         6.75%      53.590us        58.69%     466.101us      77.684us      12.157us        62.49%      14.332us       2.389us             6  
-                                           aten::conv1d         0.72%       5.740us        14.75%     117.122us      39.041us       0.000us         0.00%       7.296us       2.432us             3  
-                                      aten::convolution         1.18%       9.359us        14.02%     111.382us      37.127us       0.000us         0.00%       7.296us       2.432us             3  
-                                     aten::_convolution         2.82%      22.362us        12.85%     102.023us      34.008us       0.000us         0.00%       7.296us       2.432us             3  
-                                aten::_conv_depthwise2d         2.86%      22.741us         8.10%      64.351us      21.450us       7.296us        37.51%       7.296us       2.432us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.296us        37.51%       7.296us       2.432us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.238us        32.07%       6.238us       2.079us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.919us        30.43%       5.919us       1.973us             3  
-                                Activity Buffer Request        30.19%     239.746us        30.19%     239.746us     239.746us       2.175us        11.18%       2.175us       2.175us             1  
-                                    aten::empty_strided         4.07%      32.320us         4.07%      32.320us       5.387us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        24.58%     195.235us        24.58%     195.235us      21.693us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.10%      16.713us         2.76%      21.891us       2.432us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.12%       8.919us         1.12%       8.919us       0.595us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.20%       9.570us         1.20%       9.570us       3.190us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.20%       9.570us         1.20%       9.570us       3.190us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.72%       5.709us         0.89%       7.030us       2.343us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     336.603us      1730.16%     336.603us     336.603us             1  
+                                            torch_eager         7.08%     144.052us        99.72%       2.028ms       2.028ms       0.000us         0.00%      21.631us      21.631us             1  
+                                               aten::to         0.35%       7.181us        85.48%       1.738ms     289.712us       0.000us         0.00%      14.367us       2.394us             6  
+                                         aten::_to_copy         1.19%      24.130us        85.13%       1.731ms     288.515us       0.000us         0.00%      14.367us       2.394us             6  
+                                            aten::copy_         2.47%      50.222us        82.49%       1.678ms     279.593us      12.191us        62.66%      14.367us       2.394us             6  
+                                           aten::conv1d         0.31%       6.211us         5.80%     117.993us      39.331us       0.000us         0.00%       7.264us       2.421us             3  
+                                      aten::convolution         0.49%       9.910us         5.50%     111.782us      37.261us       0.000us         0.00%       7.264us       2.421us             3  
+                                     aten::_convolution         1.11%      22.590us         5.01%     101.872us      33.957us       0.000us         0.00%       7.264us       2.421us             3  
+                                aten::_conv_depthwise2d         1.01%      20.531us         3.08%      62.662us      20.887us       7.264us        37.34%       7.264us       2.421us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.264us        37.34%       7.264us       2.421us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.272us        32.24%       6.272us       2.091us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.919us        30.42%       5.919us       1.973us             3  
+                                Activity Buffer Request        70.94%       1.443ms        70.94%       1.443ms       1.443ms       2.176us        11.18%       2.176us       2.176us             1  
+                                    aten::empty_strided         1.45%      29.401us         1.45%      29.401us       4.900us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        10.12%     205.814us        10.12%     205.814us      22.868us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.93%      19.000us         1.20%      24.310us       2.701us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.43%       8.650us         0.43%       8.650us       0.577us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.52%      10.541us         0.52%      10.541us       3.514us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.51%      10.450us         0.51%      10.450us       3.483us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.30%       6.000us         0.36%       7.230us       2.410us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 794.200us
-Self CUDA time total: 19.453us
+Self CPU time total: 2.034ms
+Self CUDA time total: 19.455us
 
 
 
@@ -4462,29 +4470,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     325.021us      1622.51%     325.021us     325.021us             1  
-                                            torch_eager        14.95%     114.725us        99.33%     762.279us     762.279us       0.000us         0.00%      22.176us      22.176us             1  
-                                               aten::to         0.78%       5.949us        65.87%     505.530us      84.255us       0.000us         0.00%      14.272us       2.379us             6  
-                                         aten::_to_copy         3.19%      24.509us        65.10%     499.581us      83.264us       0.000us         0.00%      14.272us       2.379us             6  
-                                            aten::copy_         6.59%      50.599us        57.97%     444.890us      74.148us      12.128us        60.54%      14.272us       2.379us             6  
-                                           aten::conv1d         0.79%       6.100us        15.11%     115.973us      38.658us       0.000us         0.00%       7.904us       2.635us             3  
-                                      aten::convolution         1.34%      10.290us        14.32%     109.873us      36.624us       0.000us         0.00%       7.904us       2.635us             3  
-                                     aten::_convolution         2.97%      22.812us        12.98%      99.583us      33.194us       0.000us         0.00%       7.904us       2.635us             3  
-                                aten::_conv_depthwise2d         2.93%      22.501us         8.10%      62.182us      20.727us       7.904us        39.46%       7.904us       2.635us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.904us        39.46%       7.904us       2.635us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.208us        30.99%       6.208us       2.069us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.920us        29.55%       5.920us       1.973us             3  
-                                Activity Buffer Request        28.71%     220.306us        28.71%     220.306us     220.306us       2.144us        10.70%       2.144us       2.144us             1  
-                                    aten::empty_strided         3.93%      30.182us         3.93%      30.182us       5.030us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        25.32%     194.286us        25.32%     194.286us      21.587us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.11%      16.159us         2.76%      21.209us       2.357us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.09%       8.360us         1.09%       8.360us       0.557us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.23%       9.450us         1.23%       9.450us       3.150us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.29%       9.930us         1.29%       9.930us       3.310us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.71%       5.470us         0.87%       6.670us       2.223us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     330.813us      1648.79%     330.813us     330.813us             1  
+                                            torch_eager        15.21%     119.271us        99.33%     778.918us     778.918us       0.000us         0.00%      22.208us      22.208us             1  
+                                               aten::to         0.72%       5.611us        65.71%     515.312us      85.885us       0.000us         0.00%      14.305us       2.384us             6  
+                                         aten::_to_copy         2.87%      22.510us        65.00%     509.701us      84.950us       0.000us         0.00%      14.305us       2.384us             6  
+                                            aten::copy_         6.38%      50.021us        58.03%     455.090us      75.848us      12.161us        60.61%      14.305us       2.384us             6  
+                                           aten::conv1d         0.69%       5.380us        15.11%     118.473us      39.491us       0.000us         0.00%       7.903us       2.634us             3  
+                                      aten::convolution         1.31%      10.292us        14.42%     113.093us      37.698us       0.000us         0.00%       7.903us       2.634us             3  
+                                     aten::_convolution         2.98%      23.360us        13.11%     102.801us      34.267us       0.000us         0.00%       7.903us       2.634us             3  
+                                aten::_conv_depthwise2d         2.80%      21.952us         8.17%      64.041us      21.347us       7.903us        39.39%       7.903us       2.634us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.903us        39.39%       7.903us       2.634us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.240us        31.10%       6.240us       2.080us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.921us        29.51%       5.921us       1.974us             3  
+                                Activity Buffer Request        27.43%     215.065us        27.43%     215.065us     215.065us       2.144us        10.69%       2.144us       2.144us             1  
+                                    aten::empty_strided         4.09%      32.101us         4.09%      32.101us       5.350us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        27.10%     212.544us        27.10%     212.544us      23.616us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.14%      16.752us         2.74%      21.481us       2.387us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.03%       8.081us         1.03%       8.081us       0.539us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.34%      10.539us         1.34%      10.539us       3.513us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.15%       9.010us         1.15%       9.010us       3.003us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.74%       5.829us         0.90%       7.070us       2.357us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 767.429us
-Self CUDA time total: 20.032us
+Self CPU time total: 784.179us
+Self CUDA time total: 20.064us
 
 
 
@@ -4494,29 +4502,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     356.764us       983.15%     356.764us     356.764us             1  
-                                            torch_eager        15.53%     123.844us        99.36%     792.350us     792.350us       0.000us         0.00%      38.944us      38.944us             1  
-                                           aten::conv1d         0.79%       6.320us        15.33%     122.233us      40.744us       0.000us         0.00%      20.320us       6.773us             3  
-                                      aten::convolution         1.24%       9.851us        14.54%     115.913us      38.638us       0.000us         0.00%      20.320us       6.773us             3  
-                                     aten::_convolution         2.89%      23.052us        13.30%     106.062us      35.354us       0.000us         0.00%      20.320us       6.773us             3  
-                                aten::_conv_depthwise2d         2.97%      23.692us         8.39%      66.891us      22.297us      20.320us        56.00%      20.320us       6.773us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      20.320us        56.00%      20.320us       6.773us             3  
-                                               aten::to         0.80%       6.349us        64.76%     516.391us      86.065us       0.000us         0.00%      18.624us       3.104us             6  
-                                         aten::_to_copy         3.21%      25.572us        63.96%     510.042us      85.007us       0.000us         0.00%      18.624us       3.104us             6  
-                                            aten::copy_         6.54%      52.120us        56.52%     450.739us      75.123us      15.968us        44.00%      18.624us       3.104us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.607us        23.72%       8.607us       2.869us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.361us        20.28%       7.361us       2.454us             3  
-                                Activity Buffer Request        27.46%     218.966us        27.46%     218.966us     218.966us       2.656us         7.32%       2.656us       2.656us             1  
-                                    aten::empty_strided         4.23%      33.731us         4.23%      33.731us       5.622us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        25.38%     202.413us        25.38%     202.413us      22.490us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.20%      17.520us         2.88%      22.939us       2.549us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.21%       9.679us         1.21%       9.679us       0.645us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.40%      11.140us         1.40%      11.140us       3.713us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.17%       9.299us         1.17%       9.299us       3.100us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.75%       6.010us         0.93%       7.450us       2.483us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     332.020us       920.74%     332.020us     332.020us             1  
+                                            torch_eager        14.90%     117.475us        99.36%     783.438us     783.438us       0.000us         0.00%      38.651us      38.651us             1  
+                                           aten::conv1d         0.68%       5.380us        14.48%     114.172us      38.057us       0.000us         0.00%      20.190us       6.730us             3  
+                                      aten::convolution         1.18%       9.340us        13.80%     108.792us      36.264us       0.000us         0.00%      20.190us       6.730us             3  
+                                     aten::_convolution         2.79%      21.980us        12.61%      99.452us      33.151us       0.000us         0.00%      20.190us       6.730us             3  
+                                aten::_conv_depthwise2d         2.62%      20.631us         7.92%      62.452us      20.817us      20.190us        55.99%      20.190us       6.730us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      20.190us        55.99%      20.190us       6.730us             3  
+                                               aten::to         0.79%       6.191us        66.58%     524.962us      87.494us       0.000us         0.00%      18.461us       3.077us             6  
+                                         aten::_to_copy         2.94%      23.190us        65.79%     518.771us      86.462us       0.000us         0.00%      18.461us       3.077us             6  
+                                            aten::copy_         6.46%      50.920us        58.97%     464.950us      77.492us      15.870us        44.01%      18.461us       3.077us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.479us        23.51%       8.479us       2.826us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.391us        20.50%       7.391us       2.464us             3  
+                                Activity Buffer Request        28.77%     226.875us        28.77%     226.875us     226.875us       2.591us         7.19%       2.591us       2.591us             1  
+                                    aten::empty_strided         3.88%      30.631us         3.88%      30.631us       5.105us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        26.51%     209.015us        26.51%     209.015us      23.224us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.16%      17.000us         2.79%      22.010us       2.446us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.07%       8.441us         1.07%       8.441us       0.563us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.46%      11.521us         1.46%      11.521us       3.840us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.07%       8.440us         1.07%       8.440us       2.813us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.73%       5.720us         0.87%       6.850us       2.283us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 797.430us
-Self CUDA time total: 36.288us
+Self CPU time total: 788.468us
+Self CUDA time total: 36.060us
 
 
 
@@ -4526,29 +4534,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     332.353us       866.25%     332.353us     332.353us             1  
-                                            torch_eager         6.20%     124.083us        99.73%       1.997ms       1.997ms       0.000us         0.00%      40.959us      40.959us             1  
-                                           aten::conv1d         0.30%       6.071us         5.74%     115.013us      38.338us       0.000us         0.00%      22.592us       7.531us             3  
-                                      aten::convolution         0.48%       9.660us         5.44%     108.942us      36.314us       0.000us         0.00%      22.592us       7.531us             3  
-                                     aten::_convolution         1.09%      21.840us         4.96%      99.282us      33.094us       0.000us         0.00%      22.592us       7.531us             3  
-                                aten::_conv_depthwise2d         1.15%      22.991us         3.11%      62.342us      20.781us      22.592us        58.88%      22.592us       7.531us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      22.592us        58.88%      22.592us       7.531us             3  
-                                               aten::to         0.32%       6.339us        86.44%       1.731ms     288.505us       0.000us         0.00%      18.367us       3.061us             6  
-                                         aten::_to_copy         1.25%      24.980us        86.12%       1.725ms     287.449us       0.000us         0.00%      18.367us       3.061us             6  
-                                            aten::copy_         2.51%      50.252us        83.36%       1.669ms     278.222us      15.775us        41.12%      18.367us       3.061us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.416us        21.94%       8.416us       2.805us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.359us        19.18%       7.359us       2.453us             3  
-                                Activity Buffer Request        72.13%       1.445ms        72.13%       1.445ms       1.445ms       2.592us         6.76%       2.592us       2.592us             1  
-                                    aten::empty_strided         1.52%      30.382us         1.52%      30.382us       5.064us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         9.74%     194.985us         9.74%     194.985us      21.665us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.87%      17.330us         1.13%      22.630us       2.514us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.45%       8.941us         0.45%       8.941us       0.596us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.48%       9.610us         0.48%       9.610us       3.203us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.46%       9.250us         0.46%       9.250us       3.083us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.27%       5.490us         0.34%       6.780us       2.260us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     323.480us       850.14%     323.480us     323.480us             1  
+                                            torch_eager        14.55%     115.293us        99.34%     787.399us     787.399us       0.000us         0.00%      40.643us      40.643us             1  
+                                           aten::conv1d         0.68%       5.400us        14.40%     114.153us      38.051us       0.000us         0.00%      22.336us       7.445us             3  
+                                      aten::convolution         1.16%       9.210us        13.72%     108.753us      36.251us       0.000us         0.00%      22.336us       7.445us             3  
+                                     aten::_convolution         2.80%      22.227us        12.56%      99.543us      33.181us       0.000us         0.00%      22.336us       7.445us             3  
+                                aten::_conv_depthwise2d         2.52%      20.003us         7.87%      62.343us      20.781us      22.336us        58.70%      22.336us       7.445us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      22.336us        58.70%      22.336us       7.445us             3  
+                                               aten::to         0.94%       7.450us        67.15%     532.253us      88.709us       0.000us         0.00%      18.307us       3.051us             6  
+                                         aten::_to_copy         2.90%      22.999us        66.21%     524.803us      87.467us       0.000us         0.00%      18.307us       3.051us             6  
+                                            aten::copy_         6.35%      50.294us        59.67%     472.953us      78.825us      15.714us        41.30%      18.307us       3.051us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.321us        21.87%       8.321us       2.774us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.393us        19.43%       7.393us       2.464us             3  
+                                Activity Buffer Request        30.44%     241.286us        30.44%     241.286us     241.286us       2.593us         6.81%       2.593us       2.593us             1  
+                                    aten::empty_strided         3.64%      28.851us         3.64%      28.851us       4.808us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        25.80%     204.463us        25.80%     204.463us      22.718us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.08%      16.472us         2.71%      21.512us       2.390us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.07%       8.500us         1.07%       8.500us       0.567us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.34%      10.600us         1.34%      10.600us       3.533us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.09%       8.650us         1.09%       8.650us       2.883us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.71%       5.641us         0.87%       6.891us       2.297us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.003ms
-Self CUDA time total: 38.367us
+Self CPU time total: 792.609us
+Self CUDA time total: 38.050us
 
 
 
@@ -4558,29 +4566,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     328.952us       509.17%     328.952us     328.952us             1  
-                                            torch_eager        15.31%     114.903us        99.32%     745.599us     745.599us       0.000us         0.00%      68.701us      68.701us             1  
-                                           aten::conv1d         0.89%       6.660us        15.50%     116.373us      38.791us       0.000us         0.00%      42.238us      14.079us             3  
-                                      aten::convolution         1.33%       9.952us        14.61%     109.713us      36.571us       0.000us         0.00%      42.238us      14.079us             3  
-                                     aten::_convolution         2.95%      22.149us        13.29%      99.761us      33.254us       0.000us         0.00%      42.238us      14.079us             3  
-                                aten::_conv_depthwise2d         2.94%      22.090us         8.38%      62.891us      20.964us      42.238us        65.38%      42.238us      14.079us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      42.238us        65.38%      42.238us      14.079us             3  
-                                               aten::to         0.80%       6.039us        65.05%     488.341us      81.390us       0.000us         0.00%      26.463us       4.410us             6  
-                                         aten::_to_copy         3.23%      24.281us        64.25%     482.302us      80.384us       0.000us         0.00%      26.463us       4.410us             6  
-                                            aten::copy_         6.57%      49.302us        56.69%     425.561us      70.927us      22.367us        34.62%      26.463us       4.410us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      11.936us        18.48%      11.936us       3.979us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      10.431us        16.15%      10.431us       3.477us             3  
-                                Activity Buffer Request        26.58%     199.565us        26.58%     199.565us     199.565us       4.096us         6.34%       4.096us       4.096us             1  
-                                    aten::empty_strided         4.32%      32.460us         4.32%      32.460us       5.410us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        26.45%     198.565us        26.45%     198.565us      22.063us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.13%      16.001us         2.81%      21.091us       2.343us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.16%       8.690us         1.16%       8.690us       0.579us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.26%       9.490us         1.26%       9.490us       3.163us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.26%       9.440us         1.26%       9.440us       3.147us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.75%       5.611us         0.93%       6.981us       2.327us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     336.347us       525.30%     336.347us     336.347us             1  
+                                            torch_eager        15.07%     122.512us        99.37%     807.929us     807.929us       0.000us         0.00%      68.125us      68.125us             1  
+                                           aten::conv1d         0.67%       5.471us        14.06%     114.283us      38.094us       0.000us         0.00%      41.663us      13.888us             3  
+                                      aten::convolution         1.12%       9.100us        13.38%     108.812us      36.271us       0.000us         0.00%      41.663us      13.888us             3  
+                                     aten::_convolution         2.67%      21.730us        12.26%      99.712us      33.237us       0.000us         0.00%      41.663us      13.888us             3  
+                                aten::_conv_depthwise2d         2.50%      20.351us         7.64%      62.102us      20.701us      41.663us        65.07%      41.663us      13.888us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      41.663us        65.07%      41.663us      13.888us             3  
+                                               aten::to         0.85%       6.891us        66.79%     543.023us      90.504us       0.000us         0.00%      26.462us       4.410us             6  
+                                         aten::_to_copy         2.85%      23.179us        65.94%     536.132us      89.355us       0.000us         0.00%      26.462us       4.410us             6  
+                                            aten::copy_         6.23%      50.622us        59.33%     482.372us      80.395us      22.367us        34.93%      26.462us       4.410us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      11.935us        18.64%      11.935us       3.978us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      10.432us        16.29%      10.432us       3.477us             3  
+                                Activity Buffer Request        30.75%     250.036us        30.75%     250.036us     250.036us       4.095us         6.40%       4.095us       4.095us             1  
+                                    aten::empty_strided         3.76%      30.581us         3.76%      30.581us       5.097us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        25.31%     205.766us        25.31%     205.766us      22.863us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.22%      18.070us         2.95%      23.970us       2.663us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.13%       9.220us         1.13%       9.220us       0.615us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.19%       9.690us         1.19%       9.690us       3.230us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.99%       8.009us         0.99%       8.009us       2.670us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.69%       5.650us         0.84%       6.800us       2.267us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 750.709us
-Self CUDA time total: 64.605us
+Self CPU time total: 813.049us
+Self CUDA time total: 64.030us
 
 
 
@@ -4590,29 +4598,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     328.798us       467.68%     328.798us     328.798us             1  
-                                            torch_eager        14.69%     115.264us        99.37%     779.669us     779.669us       0.000us         0.00%      74.432us      74.432us             1  
-                                           aten::conv1d         0.75%       5.869us        14.89%     116.853us      38.951us       0.000us         0.00%      47.840us      15.947us             3  
-                                      aten::convolution         1.20%       9.412us        14.15%     110.984us      36.995us       0.000us         0.00%      47.840us      15.947us             3  
-                                     aten::_convolution         2.99%      23.451us        12.95%     101.572us      33.857us       0.000us         0.00%      47.840us      15.947us             3  
-                                aten::_conv_depthwise2d         2.71%      21.281us         8.10%      63.532us      21.177us      47.840us        68.05%      47.840us      15.947us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      47.840us        68.05%      47.840us      15.947us             3  
-                                               aten::to         0.74%       5.828us        66.46%     521.411us      86.902us       0.000us         0.00%      26.592us       4.432us             6  
-                                         aten::_to_copy         3.27%      25.622us        65.71%     515.583us      85.931us       0.000us         0.00%      26.592us       4.432us             6  
-                                            aten::copy_         6.42%      50.382us        58.46%     458.651us      76.442us      22.464us        31.95%      26.592us       4.432us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      12.032us        17.11%      12.032us       4.011us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      10.432us        14.84%      10.432us       3.477us             3  
-                                Activity Buffer Request        29.93%     234.846us        29.93%     234.846us     234.846us       4.128us         5.87%       4.128us       4.128us             1  
-                                    aten::empty_strided         3.99%      31.310us         3.99%      31.310us       5.218us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        24.83%     194.803us        24.83%     194.803us      21.645us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.07%      16.243us         2.72%      21.332us       2.370us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.07%       8.401us         1.07%       8.401us       0.560us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.35%      10.581us         1.35%      10.581us       3.527us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.31%      10.290us         1.31%      10.290us       3.430us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.69%       5.406us         0.84%       6.568us       2.189us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     325.852us       467.34%     325.852us     325.852us             1  
+                                            torch_eager        14.81%     118.946us        99.39%     798.399us     798.399us       0.000us         0.00%      73.789us      73.789us             1  
+                                           aten::conv1d         0.70%       5.610us        14.26%     114.513us      38.171us       0.000us         0.00%      47.294us      15.765us             3  
+                                      aten::convolution         1.17%       9.382us        13.56%     108.903us      36.301us       0.000us         0.00%      47.294us      15.765us             3  
+                                     aten::_convolution         2.75%      22.119us        12.39%      99.521us      33.174us       0.000us         0.00%      47.294us      15.765us             3  
+                                aten::_conv_depthwise2d         2.53%      20.361us         7.64%      61.351us      20.450us      47.294us        67.83%      47.294us      15.765us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      47.294us        67.83%      47.294us      15.765us             3  
+                                               aten::to         0.79%       6.379us        67.07%     538.781us      89.797us       0.000us         0.00%      26.495us       4.416us             6  
+                                         aten::_to_copy         2.79%      22.401us        66.28%     532.402us      88.734us       0.000us         0.00%      26.495us       4.416us             6  
+                                            aten::copy_         6.32%      50.749us        59.88%     480.970us      80.162us      22.431us        32.17%      26.495us       4.416us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      12.031us        17.25%      12.031us       4.010us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      10.400us        14.92%      10.400us       3.467us             3  
+                                Activity Buffer Request        31.04%     249.326us        31.04%     249.326us     249.326us       4.064us         5.83%       4.064us       4.064us             1  
+                                    aten::empty_strided         3.61%      29.031us         3.61%      29.031us       4.839us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        25.24%     202.725us        25.24%     202.725us      22.525us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.08%      16.713us         2.70%      21.680us       2.409us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.07%       8.568us         1.07%       8.568us       0.571us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.32%      10.569us         1.32%      10.569us       3.523us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.07%       8.591us         1.07%       8.591us       2.864us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.82%       6.609us         1.00%       8.010us       2.670us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 784.589us
-Self CUDA time total: 70.304us
+Self CPU time total: 803.260us
+Self CUDA time total: 69.725us
 
 
 
@@ -4622,29 +4630,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     341.882us       182.91%     341.882us     341.882us             1  
-                                            torch_eager        15.14%     117.185us        99.33%     768.879us     768.879us       0.000us         0.00%     197.117us     197.117us             1  
-                                           aten::conv1d         0.79%       6.110us        14.86%     114.993us      38.331us       0.000us         0.00%     134.270us      44.757us             3  
-                                      aten::convolution         1.22%       9.451us        14.07%     108.883us      36.294us       0.000us         0.00%     134.270us      44.757us             3  
-                                     aten::_convolution         2.87%      22.240us        12.85%      99.432us      33.144us       0.000us         0.00%     134.270us      44.757us             3  
-                                aten::_conv_depthwise2d         2.84%      21.991us         8.04%      62.222us      20.741us     134.270us        71.84%     134.270us      44.757us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     134.270us        71.84%     134.270us      44.757us             3  
-                                               aten::to         0.77%       5.950us        65.77%     509.102us      84.850us       0.000us         0.00%      62.847us      10.474us             6  
-                                         aten::_to_copy         3.29%      25.489us        65.00%     503.152us      83.859us       0.000us         0.00%      62.847us      10.474us             6  
-                                            aten::copy_         6.45%      49.889us        57.58%     445.721us      74.287us      52.639us        28.16%      62.847us      10.474us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      29.728us        15.91%      29.728us       9.909us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.911us        12.26%      22.911us       7.637us             3  
-                                Activity Buffer Request        28.61%     221.416us        28.61%     221.416us     221.416us      10.208us         5.46%      10.208us      10.208us             1  
-                                    aten::empty_strided         4.13%      31.942us         4.13%      31.942us       5.324us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        25.24%     195.386us        25.24%     195.386us      21.710us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.14%      16.602us         2.90%      22.460us       2.496us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.19%       9.247us         1.19%       9.247us       0.616us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.23%       9.500us         1.23%       9.500us       3.167us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.26%       9.761us         1.26%       9.761us       3.254us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.71%       5.470us         0.87%       6.700us       2.233us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     346.241us       186.20%     346.241us     346.241us             1  
+                                            torch_eager         7.14%     146.093us        99.75%       2.041ms       2.041ms       0.000us         0.00%     196.000us     196.000us             1  
+                                           aten::conv1d         0.31%       6.251us         5.79%     118.533us      39.511us       0.000us         0.00%     133.248us      44.416us             3  
+                                      aten::convolution         0.51%      10.359us         5.49%     112.282us      37.427us       0.000us         0.00%     133.248us      44.416us             3  
+                                     aten::_convolution         1.19%      24.280us         4.98%     101.923us      33.974us       0.000us         0.00%     133.248us      44.416us             3  
+                                aten::_conv_depthwise2d         1.04%      21.191us         3.02%      61.762us      20.587us     133.248us        71.66%     133.248us      44.416us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     133.248us        71.66%     133.248us      44.416us             3  
+                                               aten::to         0.32%       6.489us        85.48%       1.749ms     291.443us       0.000us         0.00%      62.752us      10.459us             6  
+                                         aten::_to_copy         1.16%      23.832us        85.16%       1.742ms     290.362us       0.000us         0.00%      62.752us      10.459us             6  
+                                            aten::copy_         2.53%      51.751us        82.59%       1.689ms     281.575us      52.704us        28.34%      62.752us      10.459us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      29.760us        16.00%      29.760us       9.920us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.944us        12.34%      22.944us       7.648us             3  
+                                Activity Buffer Request        71.04%       1.453ms        71.04%       1.453ms       1.453ms      10.048us         5.40%      10.048us      10.048us             1  
+                                    aten::empty_strided         1.41%      28.891us         1.41%      28.891us       4.815us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        10.04%     205.324us        10.04%     205.324us      22.814us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.86%      17.550us         1.11%      22.661us       2.518us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.42%       8.641us         0.42%       8.641us       0.576us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.53%      10.750us         0.53%      10.750us       3.583us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.44%       9.021us         0.44%       9.021us       3.007us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.30%       6.220us         0.37%       7.470us       2.490us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 774.039us
-Self CUDA time total: 186.909us
+Self CPU time total: 2.046ms
+Self CUDA time total: 185.952us
 
 
 
@@ -4654,29 +4662,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     349.277us       165.88%     349.277us     349.277us             1  
-                                            torch_eager        15.39%     117.165us        99.36%     756.609us     756.609us       0.000us         0.00%     224.029us     224.029us             1  
-                                           aten::conv1d         0.74%       5.661us        15.33%     116.734us      38.911us       0.000us         0.00%     154.686us      51.562us             3  
-                                      aten::convolution         1.20%       9.150us        14.59%     111.073us      37.024us       0.000us         0.00%     154.686us      51.562us             3  
-                                     aten::_convolution         2.96%      22.532us        13.38%     101.923us      33.974us       0.000us         0.00%     154.686us      51.562us             3  
-                                aten::_conv_depthwise2d         2.86%      21.751us         8.47%      64.492us      21.497us     154.686us        73.47%     154.686us      51.562us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     154.686us        73.47%     154.686us      51.562us             3  
-                                               aten::to         0.84%       6.379us        65.15%     496.150us      82.692us       0.000us         0.00%      69.343us      11.557us             6  
-                                         aten::_to_copy         3.33%      25.371us        64.32%     489.771us      81.628us       0.000us         0.00%      69.343us      11.557us             6  
-                                            aten::copy_         6.44%      49.031us        56.76%     432.240us      72.040us      55.871us        26.53%      69.343us      11.557us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      32.831us        15.59%      32.831us      10.944us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      23.040us        10.94%      23.040us       7.680us             3  
-                                Activity Buffer Request        27.33%     208.145us        27.33%     208.145us     208.145us      13.472us         6.40%      13.472us      13.472us             1  
-                                    aten::empty_strided         4.22%      32.160us         4.22%      32.160us       5.360us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        25.87%     197.025us        25.87%     197.025us      21.892us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.14%      16.329us         2.83%      21.520us       2.391us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.17%       8.932us         1.17%       8.932us       0.595us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.38%      10.500us         1.38%      10.500us       3.500us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.35%      10.280us         1.35%      10.280us       3.427us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.72%       5.468us         0.90%       6.839us       2.280us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     341.281us       162.73%     341.281us     341.281us             1  
+                                            torch_eager        15.25%     117.693us        99.36%     766.878us     766.878us       0.000us         0.00%     223.168us     223.168us             1  
+                                           aten::conv1d         0.68%       5.279us        14.60%     112.702us      37.567us       0.000us         0.00%     154.016us      51.339us             3  
+                                      aten::convolution         1.24%       9.560us        13.92%     107.423us      35.808us       0.000us         0.00%     154.016us      51.339us             3  
+                                     aten::_convolution         2.67%      20.611us        12.68%      97.863us      32.621us       0.000us         0.00%     154.016us      51.339us             3  
+                                aten::_conv_depthwise2d         2.74%      21.170us         8.14%      62.852us      20.951us     154.016us        73.44%     154.016us      51.339us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     154.016us        73.44%     154.016us      51.339us             3  
+                                               aten::to         0.75%       5.750us        66.22%     511.121us      85.187us       0.000us         0.00%      69.152us      11.525us             6  
+                                         aten::_to_copy         2.86%      22.060us        65.48%     505.371us      84.228us       0.000us         0.00%      69.152us      11.525us             6  
+                                            aten::copy_         6.81%      52.581us        58.74%     453.391us      75.565us      55.712us        26.56%      69.152us      11.525us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      32.705us        15.59%      32.705us      10.902us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      23.007us        10.97%      23.007us       7.669us             3  
+                                Activity Buffer Request        28.38%     219.045us        28.38%     219.045us     219.045us      13.440us         6.41%      13.440us      13.440us             1  
+                                    aten::empty_strided         3.88%      29.920us         3.88%      29.920us       4.987us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        26.50%     204.546us        26.50%     204.546us      22.727us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.10%      16.212us         2.69%      20.781us       2.309us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.01%       7.798us         1.01%       7.798us       0.520us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.33%      10.250us         1.33%      10.250us       3.417us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.12%       8.651us         1.12%       8.651us       2.884us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.71%       5.470us         0.87%       6.730us       2.243us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 761.499us
-Self CUDA time total: 210.557us
+Self CPU time total: 771.798us
+Self CUDA time total: 209.728us
 
 
 
@@ -4686,29 +4694,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         6.72%     121.944us        52.58%     953.714us     953.714us       0.000us         0.00%       1.521ms       1.521ms             1  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.421ms       100.41%       1.421ms       1.421ms             1  
-                                               aten::to         0.35%       6.300us        37.63%     682.555us     113.759us       0.000us         0.00%     824.097us     137.350us             6  
-                                         aten::_to_copy         1.68%      30.549us        37.28%     676.255us     112.709us       0.000us         0.00%     824.097us     137.350us             6  
-                                            aten::copy_         2.98%      53.981us        24.83%     450.422us      75.070us     718.817us        50.79%     824.097us     137.350us             6  
-                                           aten::conv1d         0.35%       6.281us         6.65%     120.554us      40.185us       0.000us         0.00%     696.543us     232.181us             3  
-                                      aten::convolution         0.57%      10.251us         6.30%     114.273us      38.091us       0.000us         0.00%     696.543us     232.181us             3  
-                                     aten::_convolution         1.27%      23.111us         5.73%     104.022us      34.674us       0.000us         0.00%     696.543us     232.181us             3  
-                                aten::_conv_depthwise2d         1.23%      22.359us         3.60%      65.321us      21.774us     696.543us        49.21%     696.543us     232.181us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     696.543us        49.21%     696.543us     232.181us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     409.920us        28.96%     409.920us     136.640us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     308.897us        21.82%     308.897us     102.966us             3  
-                                Activity Buffer Request        11.98%     217.246us        11.98%     217.246us     217.246us     105.280us         7.44%     105.280us     105.280us             1  
-                                    aten::empty_strided         2.17%      39.370us        10.77%     195.284us      32.547us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        11.13%     201.976us        11.13%     201.976us      22.442us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.99%      18.030us         1.31%      23.761us       2.640us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.53%       9.620us         0.53%       9.620us       0.641us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.59%      10.751us         0.59%      10.751us       3.584us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.52%       9.430us         0.52%       9.430us       3.143us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.31%       5.670us         0.39%       7.030us       2.343us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         6.80%     123.239us        52.01%     942.341us     942.341us       0.000us         0.00%       1.520ms       1.520ms             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.422ms       100.42%       1.422ms       1.422ms             1  
+                                               aten::to         0.34%       6.231us        36.98%     669.957us     111.660us       0.000us         0.00%     824.603us     137.434us             6  
+                                         aten::_to_copy         1.55%      28.122us        36.63%     663.726us     110.621us       0.000us         0.00%     824.603us     137.434us             6  
+                                            aten::copy_         2.95%      53.430us        24.71%     447.748us      74.625us     720.572us        50.89%     824.603us     137.434us             6  
+                                           aten::conv1d         0.34%       6.111us         6.66%     120.744us      40.248us       0.000us         0.00%     695.357us     231.786us             3  
+                                      aten::convolution         0.56%      10.201us         6.33%     114.633us      38.211us       0.000us         0.00%     695.357us     231.786us             3  
+                                     aten::_convolution         1.36%      24.689us         5.76%     104.432us      34.811us       0.000us         0.00%     695.357us     231.786us             3  
+                                aten::_conv_depthwise2d         1.22%      22.151us         3.50%      63.431us      21.144us     695.357us        49.11%     695.357us     231.786us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     695.357us        49.11%     695.357us     231.786us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     407.263us        28.76%     407.263us     135.754us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     313.309us        22.13%     313.309us     104.436us             3  
+                                Activity Buffer Request        11.46%     207.684us        11.46%     207.684us     207.684us     104.031us         7.35%     104.031us     104.031us             1  
+                                    aten::empty_strided         2.02%      36.603us        10.37%     187.856us      31.309us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        11.47%     207.874us        11.47%     207.874us      23.097us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.99%      18.011us         1.30%      23.581us       2.620us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.51%       9.270us         0.51%       9.270us       0.618us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.60%      10.830us         0.60%      10.830us       3.610us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.51%       9.210us         0.51%       9.210us       3.070us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.35%       6.401us         0.43%       7.711us       2.570us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.814ms
-Self CUDA time total: 1.415ms
+Self CPU time total: 1.812ms
+Self CUDA time total: 1.416ms
 
 
 
@@ -4718,29 +4726,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         4.05%     123.714us        65.96%       2.016ms       2.016ms       0.000us         0.00%       1.502ms       1.502ms             1  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.433ms       100.43%       1.433ms       1.433ms             1  
-                                               aten::to         0.21%       6.507us        56.82%       1.737ms     289.475us       0.000us         0.00%     764.927us     127.488us             6  
-                                         aten::_to_copy         0.85%      25.961us        56.61%       1.730ms     288.391us       0.000us         0.00%     764.927us     127.488us             6  
-                                            aten::copy_         1.76%      53.800us        54.73%       1.673ms     278.832us     689.887us        48.36%     764.927us     127.488us             6  
-                                           aten::conv1d         0.20%       6.220us         4.18%     127.663us      42.554us       0.000us         0.00%     736.735us     245.578us             3  
-                                      aten::convolution         0.34%      10.420us         3.97%     121.443us      40.481us       0.000us         0.00%     736.735us     245.578us             3  
-                                     aten::_convolution         0.75%      22.860us         3.63%     111.023us      37.008us       0.000us         0.00%     736.735us     245.578us             3  
-                                aten::_conv_depthwise2d         0.96%      29.441us         2.37%      72.583us      24.194us     736.735us        51.64%     736.735us     245.578us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     736.735us        51.64%     736.735us     245.578us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     397.471us        27.86%     397.471us     132.490us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     292.416us        20.50%     292.416us      97.472us             3  
-                                Activity Buffer Request        47.26%       1.445ms        47.26%       1.445ms       1.445ms      75.040us         5.26%      75.040us      75.040us             1  
-                                    aten::empty_strided         1.03%      31.391us         1.03%      31.391us       5.232us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         6.45%     197.169us         6.45%     197.169us      21.908us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.57%      17.300us         0.75%      22.850us       2.539us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.30%       9.200us         0.30%       9.200us       0.613us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.32%       9.780us         0.32%       9.780us       3.260us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.36%      10.870us         0.36%      10.870us       3.623us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.19%       5.770us         0.23%       7.180us       2.393us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         6.34%     116.852us        42.22%     778.698us     778.698us       0.000us         0.00%       1.501ms       1.501ms             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.431ms       100.41%       1.431ms       1.431ms             1  
+                                               aten::to         0.32%       5.860us        28.27%     521.352us      86.892us       0.000us         0.00%     762.491us     127.082us             6  
+                                         aten::_to_copy         1.22%      22.580us        27.95%     515.492us      85.915us       0.000us         0.00%     762.491us     127.082us             6  
+                                            aten::copy_         2.82%      52.081us        25.06%     462.219us      77.037us     686.779us        48.19%     762.491us     127.082us             6  
+                                           aten::conv1d         0.30%       5.601us         6.22%     114.743us      38.248us       0.000us         0.00%     738.362us     246.121us             3  
+                                      aten::convolution         0.51%       9.380us         5.92%     109.142us      36.381us       0.000us         0.00%     738.362us     246.121us             3  
+                                     aten::_convolution         1.20%      22.202us         5.41%      99.762us      33.254us       0.000us         0.00%     738.362us     246.121us             3  
+                                aten::_conv_depthwise2d         1.10%      20.251us         3.34%      61.660us      20.553us     738.362us        51.81%     738.362us     246.121us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     738.362us        51.81%     738.362us     246.121us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     398.333us        27.95%     398.333us     132.778us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     288.446us        20.24%     288.446us      96.149us             3  
+                                Activity Buffer Request        12.41%     228.855us        12.41%     228.855us     228.855us      75.712us         5.31%      75.712us      75.712us             1  
+                                    aten::empty_strided         1.66%      30.693us         1.66%      30.693us       5.115us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        11.01%     202.993us        11.01%     202.993us      22.555us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.90%      16.659us         1.18%      21.780us       2.420us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.46%       8.512us         0.46%       8.512us       0.567us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.53%       9.739us         0.53%       9.739us       3.246us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.54%       9.960us         0.54%       9.960us       3.320us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.33%       6.150us         0.40%       7.440us       2.480us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 3.057ms
-Self CUDA time total: 1.427ms
+Self CPU time total: 1.844ms
+Self CUDA time total: 1.425ms
 
 
 impl                     wl                  p50(ms)  ok
@@ -4765,16 +4773,10 @@ torch_eager              cuda_B4_D2048_S512_W4     0.10  True
 torch_eager              cuda_B4_D64_S128_W2     0.08  True
 torch_eager              cuda_B4_D64_S128_W4     0.08  True
 torch_eager              cuda_B4_D64_S2048_W2     0.08  True
-torch_eager              cuda_B4_D64_S2048_W4     0.09  True
+torch_eager              cuda_B4_D64_S2048_W4     0.08  True
 torch_eager              cuda_B4_D64_S512_W2     0.08  True
 torch_eager              cuda_B4_D64_S512_W4     0.08  True
 
-
-
▶ UV Install Logs
- -

Artifacts:

causal_conv1d.jsonl