+

PyTorch Native - Causal Conv1D

+

GPU Info

+
+
+ +▼ code +▼ output + ▶ uv-logs + | +Cell: nv | 0.21s + | + +Raw +GitHub +
+
+
+
import subprocess
# Run `nvidia-smi`, capture its stdout as text, and print it so the GPU and
# driver state is recorded alongside the benchmark output.
print(
    subprocess.run(
        ["nvidia-smi"],
        capture_output=True,
        text=True,
    ).stdout
)
+
+ +
+
+
+
+
Tue Oct 28 14:08:09 2025       
++-----------------------------------------------------------------------------------------+
+| NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
+|-----------------------------------------+------------------------+----------------------+
+| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
+| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
+|                                         |                        |               MIG M. |
+|=========================================+========================+======================|
+|   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
+| N/A   28C    P0             80W /  350W |       0MiB /  46068MiB |     19%      Default |
+|                                         |                        |                  N/A |
++-----------------------------------------+------------------------+----------------------+
+
++-----------------------------------------------------------------------------------------+
+| Processes:                                                                              |
+|  GPU   GI   CI              PID   Type   Process name                        GPU Memory |
+|        ID   ID                                                               Usage      |
+|=========================================================================================|
+|  No running processes found                                                             |
++-----------------------------------------------------------------------------------------+
+
+
+
+
+ +

Causal Conv1D Benchmark (PyTorch Native)

+
+
+ +▼ code +▼ output + ▶ uv-logs + | +Cell: benchmark | 3.63s + | + +Raw +GitHub +
+
+
+
# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "numpy",
+#     "torch==2.8.0",
+#     "kernels-benchmark-tools",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+import torch
+import torch.nn.functional as F
+import sys
+from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
+
+
def torch_causal_conv1d(input_tensor, weight, bias):
    """Apply a depthwise causal 1-D convolution with ``F.conv1d``.

    The convolution is computed in ``weight``'s dtype and the result is cast
    back to ``input_tensor``'s dtype.

    Args:
        input_tensor: Tensor whose last dimension is the sequence axis and
            whose channel dimension has ``weight.shape[0]`` entries, in a
            layout ``F.conv1d`` accepts.
        weight: 2-D tensor of shape ``(dim, width)`` holding one filter per
            channel.
        bias: Optional per-channel bias, or ``None``. It is cast to
            ``weight``'s dtype so a bias stored in a different precision does
            not trigger a dtype-mismatch error inside ``F.conv1d``.

    Returns:
        Tensor with the same sequence length and dtype as ``input_tensor``.
    """
    compute_dtype = weight.dtype
    dim = weight.shape[0]
    width = weight.shape[1]
    seqlen = input_tensor.shape[-1]

    # Run the whole computation in the weight's dtype. ``Tensor.to`` returns
    # the tensor itself when the dtype already matches, so no copy is made in
    # that case.
    x = input_tensor.to(compute_dtype)
    if bias is not None:
        bias = bias.to(compute_dtype)

    # groups=dim makes the convolution depthwise: each channel is filtered
    # only by its own (1, width) kernel. padding=width - 1 pads both ends of
    # the sequence, giving seqlen + width - 1 outputs.
    out = F.conv1d(x, weight.unsqueeze(1), bias, padding=width - 1, groups=dim)

    # Keeping only the first seqlen outputs drops the positions that would
    # look at right-hand padding, so output t depends on inputs <= t.
    out = out[..., :seqlen]

    # Hand the result back in the caller's dtype.
    return out.to(input_tensor.dtype)
+
+
# Pass the eager PyTorch implementation to the benchmark runner for the
# causal-conv1d kernel type, labelled "torch_eager" and tagged with its
# family and backend.
run_benchmark(
    impl_func=torch_causal_conv1d,
    impl_name="torch_eager",
    impl_tags={"family": "pytorch", "backend": "eager"},
    kernel_type=KernelTypeEnum.CAUSAL_CONV1D,
)
+
+ +
+
+
+
+
Running causal_conv1d benchmark on cuda with 24 workloads.
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D64_S128_W2
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     448.254us      2311.66%     448.254us     448.254us             1  
+                                            torch_eager        10.53%     223.197us        99.60%       2.112ms       2.112ms       0.000us         0.00%      21.727us      21.727us             1  
+                                               aten::to         0.57%      12.032us        79.33%       1.682ms     280.390us       0.000us         0.00%      14.304us       2.384us             6  
+                                         aten::_to_copy         1.82%      38.532us        78.77%       1.670ms     278.384us       0.000us         0.00%      14.304us       2.384us             6  
+                                            aten::copy_         2.94%      62.272us        74.35%       1.577ms     262.784us      11.968us        61.72%      14.304us       2.384us             6  
+                                           aten::conv1d         0.36%       7.640us         7.60%     161.165us      53.722us       0.000us         0.00%       7.423us       2.474us             3  
+                                      aten::convolution         0.68%      14.400us         7.24%     153.525us      51.175us       0.000us         0.00%       7.423us       2.474us             3  
+                                     aten::_convolution         1.64%      34.820us         6.56%     139.125us      46.375us       0.000us         0.00%       7.423us       2.474us             3  
+                                aten::_conv_depthwise2d         1.64%      34.779us         4.03%      85.503us      28.501us       7.423us        38.28%       7.423us       2.474us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.423us        38.28%       7.423us       2.474us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.304us        32.51%       6.304us       2.101us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.664us        29.21%       5.664us       1.888us             3  
+                                Activity Buffer Request        68.27%       1.448ms        68.27%       1.448ms       1.448ms       2.336us        12.05%       2.336us       2.336us             1  
+                                    aten::empty_strided         2.60%      55.071us         2.60%      55.071us       9.178us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         4.35%      92.254us         4.35%      92.254us      10.250us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         1.39%      29.522us         1.76%      37.262us       4.140us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.59%      12.410us         0.59%      12.410us       0.827us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.52%      10.960us         0.52%      10.960us       3.653us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.67%      14.291us         0.67%      14.291us       4.764us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.39%       8.321us         0.47%       9.881us       3.294us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.121ms
+Self CUDA time total: 19.391us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D64_S128_W4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     334.942us      1707.32%     334.942us     334.942us             1  
+                                            torch_eager         7.85%     148.604us        99.72%       1.887ms       1.887ms       0.000us         0.00%      21.731us      21.731us             1  
+                                               aten::to         0.32%       6.111us        83.97%       1.589ms     264.793us       0.000us         0.00%      13.731us       2.288us             6  
+                                         aten::_to_copy         1.27%      24.112us        83.64%       1.583ms     263.774us       0.000us         0.00%      13.731us       2.288us             6  
+                                            aten::copy_         2.68%      50.691us        80.81%       1.529ms     254.829us      11.618us        59.22%      13.731us       2.288us             6  
+                                           aten::conv1d         0.29%       5.540us         6.41%     121.373us      40.458us       0.000us         0.00%       8.000us       2.667us             3  
+                                      aten::convolution         0.50%       9.420us         6.12%     115.833us      38.611us       0.000us         0.00%       8.000us       2.667us             3  
+                                     aten::_convolution         1.30%      24.670us         5.62%     106.413us      35.471us       0.000us         0.00%       8.000us       2.667us             3  
+                                aten::_conv_depthwise2d         1.20%      22.792us         3.44%      65.133us      21.711us       8.000us        40.78%       8.000us       2.667us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       8.000us        40.78%       8.000us       2.667us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.049us        30.83%       6.049us       2.016us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.569us        28.39%       5.569us       1.856us             3  
+                                Activity Buffer Request        75.63%       1.431ms        75.63%       1.431ms       1.431ms       2.113us        10.77%       2.113us       2.113us             1  
+                                    aten::empty_strided         1.56%      29.560us         1.56%      29.560us       4.927us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         3.72%      70.343us         3.72%      70.343us       7.816us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.90%      17.091us         1.18%      22.301us       2.478us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.48%       9.090us         0.48%       9.090us       0.606us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.50%       9.490us         0.50%       9.490us       3.163us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.52%       9.830us         0.52%       9.830us       3.277us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.34%       6.400us         0.42%       8.020us       2.673us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.892ms
+Self CUDA time total: 19.618us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D64_S512_W2
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     333.691us      1797.81%     333.691us     333.691us             1  
+                                            torch_eager         7.79%     146.606us        99.69%       1.876ms       1.876ms       0.000us         0.00%      20.481us      20.481us             1  
+                                               aten::to         0.31%       5.760us        84.09%       1.582ms     263.706us       0.000us         0.00%      13.569us       2.262us             6  
+                                         aten::_to_copy         1.25%      23.550us        83.79%       1.576ms     262.746us       0.000us         0.00%      13.569us       2.262us             6  
+                                            aten::copy_         2.67%      50.153us        80.95%       1.523ms     253.847us      11.649us        62.76%      13.569us       2.262us             6  
+                                           aten::conv1d         0.31%       5.780us         6.33%     119.033us      39.678us       0.000us         0.00%       6.912us       2.304us             3  
+                                      aten::convolution         0.52%       9.800us         6.02%     113.253us      37.751us       0.000us         0.00%       6.912us       2.304us             3  
+                                     aten::_convolution         1.28%      24.000us         5.50%     103.453us      34.484us       0.000us         0.00%       6.912us       2.304us             3  
+                                aten::_conv_depthwise2d         1.15%      21.640us         3.37%      63.473us      21.158us       6.912us        37.24%       6.912us       2.304us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       6.912us        37.24%       6.912us       2.304us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       5.953us        32.07%       5.953us       1.984us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.696us        30.69%       5.696us       1.899us             3  
+                                Activity Buffer Request        75.77%       1.426ms        75.77%       1.426ms       1.426ms       1.920us        10.34%       1.920us       1.920us             1  
+                                    aten::empty_strided         1.59%      29.840us         1.59%      29.840us       4.973us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         3.79%      71.241us         3.79%      71.241us       7.916us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.92%      17.220us         1.19%      22.362us       2.485us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.47%       8.782us         0.47%       8.782us       0.585us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.49%       9.312us         0.49%       9.312us       3.104us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.46%       8.581us         0.46%       8.581us       2.860us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.33%       6.290us         0.41%       7.740us       2.580us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.882ms
+Self CUDA time total: 18.561us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D64_S512_W4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     341.628us      1741.67%     341.628us     341.628us             1  
+                                            torch_eager         6.79%     135.276us        99.76%       1.989ms       1.989ms       0.000us         0.00%      21.759us      21.759us             1  
+                                               aten::to         0.31%       6.091us        85.44%       1.703ms     283.911us       0.000us         0.00%      14.111us       2.352us             6  
+                                         aten::_to_copy         1.20%      23.892us        85.13%       1.697ms     282.896us       0.000us         0.00%      14.111us       2.352us             6  
+                                            aten::copy_         2.47%      49.180us        82.37%       1.642ms     273.716us      11.967us        61.01%      14.111us       2.352us             6  
+                                           aten::conv1d         0.29%       5.740us         6.09%     121.414us      40.471us       0.000us         0.00%       7.648us       2.549us             3  
+                                      aten::convolution         0.55%      11.061us         5.80%     115.674us      38.558us       0.000us         0.00%       7.648us       2.549us             3  
+                                     aten::_convolution         1.19%      23.780us         5.25%     104.613us      34.871us       0.000us         0.00%       7.648us       2.549us             3  
+                                aten::_conv_depthwise2d         1.14%      22.750us         3.26%      64.953us      21.651us       7.648us        38.99%       7.648us       2.549us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.648us        38.99%       7.648us       2.549us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.175us        31.48%       6.175us       2.058us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.792us        29.53%       5.792us       1.931us             3  
+                                Activity Buffer Request        68.82%       1.372ms        68.82%       1.372ms       1.372ms       2.144us        10.93%       2.144us       2.144us             1  
+                                    aten::empty_strided         1.56%      31.190us         1.56%      31.190us       5.198us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        12.22%     243.619us        12.22%     243.619us      27.069us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.88%      17.629us         1.14%      22.660us       2.518us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.44%       8.782us         0.44%       8.782us       0.585us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.48%       9.630us         0.48%       9.630us       3.210us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.50%       9.941us         0.50%       9.941us       3.314us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.34%       6.720us         0.41%       8.110us       2.703us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.994ms
+Self CUDA time total: 19.615us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D64_S2048_W2
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     341.213us      1403.01%     341.213us     341.213us             1  
+                                            torch_eager         7.36%     148.867us        99.73%       2.016ms       2.016ms       0.000us         0.00%      26.560us      26.560us             1  
+                                               aten::to         0.30%       6.030us        84.88%       1.716ms     285.962us       0.000us         0.00%      15.168us       2.528us             6  
+                                         aten::_to_copy         1.20%      24.229us        84.58%       1.710ms     284.956us       0.000us         0.00%      15.168us       2.528us             6  
+                                            aten::copy_         2.44%      49.414us        81.85%       1.655ms     275.782us      12.928us        53.16%      15.168us       2.528us             6  
+                                           aten::conv1d         0.28%       5.730us         5.99%     121.174us      40.391us       0.000us         0.00%      11.392us       3.797us             3  
+                                      aten::convolution         0.47%       9.480us         5.71%     115.444us      38.481us       0.000us         0.00%      11.392us       3.797us             3  
+                                     aten::_convolution         1.14%      23.073us         5.24%     105.964us      35.321us       0.000us         0.00%      11.392us       3.797us             3  
+                                aten::_conv_depthwise2d         1.05%      21.189us         3.24%      65.411us      21.804us      11.392us        46.84%      11.392us       3.797us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      11.392us        46.84%      11.392us       3.797us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.592us        27.11%       6.592us       2.197us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.336us        26.05%       6.336us       2.112us             3  
+                                Activity Buffer Request        70.12%       1.417ms        70.12%       1.417ms       1.417ms       2.240us         9.21%       2.240us       2.240us             1  
+                                    aten::empty_strided         1.52%      30.820us         1.52%      30.820us       5.137us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        10.45%     211.347us        10.45%     211.347us      23.483us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.95%      19.208us         1.23%      24.829us       2.759us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.46%       9.241us         0.46%       9.241us       0.616us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.47%       9.482us         0.47%       9.482us       3.161us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.55%      11.190us         0.55%      11.190us       3.730us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.34%       6.961us         0.41%       8.361us       2.787us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.022ms
+Self CUDA time total: 24.320us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D64_S2048_W4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     334.330us      1285.14%     334.330us     334.330us             1  
+                                            torch_eager         7.10%     143.875us        99.74%       2.020ms       2.020ms       0.000us         0.00%      28.255us      28.255us             1  
+                                               aten::to         0.28%       5.680us        85.25%       1.727ms     287.810us       0.000us         0.00%      15.232us       2.539us             6  
+                                         aten::_to_copy         1.18%      23.873us        84.97%       1.721ms     286.863us       0.000us         0.00%      15.232us       2.539us             6  
+                                            aten::copy_         2.45%      49.640us        82.36%       1.668ms     278.038us      12.992us        49.94%      15.232us       2.539us             6  
+                                           aten::conv1d         0.29%       5.889us         5.94%     120.414us      40.138us       0.000us         0.00%      13.023us       4.341us             3  
+                                      aten::convolution         0.46%       9.401us         5.65%     114.525us      38.175us       0.000us         0.00%      13.023us       4.341us             3  
+                                     aten::_convolution         1.22%      24.611us         5.19%     105.124us      35.041us       0.000us         0.00%      13.023us       4.341us             3  
+                                aten::_conv_depthwise2d         1.06%      21.480us         3.19%      64.562us      21.521us      13.023us        50.06%      13.023us       4.341us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      13.023us        50.06%      13.023us       4.341us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.624us        25.46%       6.624us       2.208us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.368us        24.48%       6.368us       2.123us             3  
+                                Activity Buffer Request        71.17%       1.442ms        71.17%       1.442ms       1.442ms       2.240us         8.61%       2.240us       2.240us             1  
+                                    aten::empty_strided         1.44%      29.082us         1.44%      29.082us       4.847us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         9.85%     199.548us         9.85%     199.548us      22.172us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.91%      18.470us         1.17%      23.650us       2.628us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.44%       8.970us         0.44%       8.970us       0.598us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.51%      10.400us         0.51%      10.400us       3.467us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.50%      10.200us         0.50%      10.200us       3.400us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.30%       6.091us         0.38%       7.621us       2.540us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.026ms
+Self CUDA time total: 26.015us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D2048_S128_W2
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     340.315us       888.50%     340.315us     340.315us             1  
+                                            torch_eager         7.29%     147.016us        99.74%       2.012ms       2.012ms       0.000us         0.00%      40.894us      40.894us             1  
+                                           aten::conv1d         0.29%       5.920us         5.91%     119.264us      39.755us       0.000us         0.00%      22.496us       7.499us             3  
+                                      aten::convolution         0.47%       9.411us         5.62%     113.344us      37.781us       0.000us         0.00%      22.496us       7.499us             3  
+                                     aten::_convolution         1.19%      23.960us         5.15%     103.933us      34.644us       0.000us         0.00%      22.496us       7.499us             3  
+                                aten::_conv_depthwise2d         1.11%      22.310us         3.18%      64.143us      21.381us      22.496us        58.73%      22.496us       7.499us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      22.496us        58.73%      22.496us       7.499us             3  
+                                               aten::to         0.29%       5.851us        85.12%       1.717ms     286.238us       0.000us         0.00%      18.398us       3.066us             6  
+                                         aten::_to_copy         1.17%      23.549us        84.83%       1.712ms     285.263us       0.000us         0.00%      18.398us       3.066us             6  
+                                            aten::copy_         2.43%      48.960us        82.11%       1.657ms     276.121us      15.806us        41.27%      18.398us       3.066us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.416us        21.97%       8.416us       2.805us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.390us        19.29%       7.390us       2.463us             3  
+                                Activity Buffer Request        70.87%       1.430ms        70.87%       1.430ms       1.430ms       2.592us         6.77%       2.592us       2.592us             1  
+                                    aten::empty_strided         1.55%      31.301us         1.55%      31.301us       5.217us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         9.91%     199.938us         9.91%     199.938us      22.215us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.87%      17.540us         1.13%      22.711us       2.523us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.44%       8.912us         0.44%       8.912us       0.594us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.47%       9.390us         0.47%       9.390us       3.130us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.51%      10.361us         0.51%      10.361us       3.454us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.30%       6.100us         0.37%       7.550us       2.517us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.018ms
+Self CUDA time total: 38.302us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D2048_S128_W4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     363.388us       882.35%     363.388us     363.388us             1  
+                                            torch_eager         8.20%     165.958us        99.73%       2.020ms       2.020ms       0.000us         0.00%      43.808us      43.808us             1  
+                                           aten::conv1d         0.32%       6.510us         6.06%     122.733us      40.911us       0.000us         0.00%      25.408us       8.469us             3  
+                                      aten::convolution         0.48%       9.730us         5.74%     116.223us      38.741us       0.000us         0.00%      25.408us       8.469us             3  
+                                     aten::_convolution         1.17%      23.611us         5.26%     106.493us      35.498us       0.000us         0.00%      25.408us       8.469us             3  
+                                aten::_conv_depthwise2d         1.11%      22.549us         3.28%      66.422us      22.141us      25.408us        61.69%      25.408us       8.469us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      25.408us        61.69%      25.408us       8.469us             3  
+                                               aten::to         0.31%       6.220us        83.98%       1.701ms     283.450us       0.000us         0.00%      18.400us       3.067us             6  
+                                         aten::_to_copy         1.16%      23.591us        83.68%       1.694ms     282.413us       0.000us         0.00%      18.400us       3.067us             6  
+                                            aten::copy_         2.51%      50.781us        81.00%       1.640ms     273.388us      15.776us        38.31%      18.400us       3.067us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.352us        20.28%       8.352us       2.784us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.424us        18.03%       7.424us       2.475us             3  
+                                Activity Buffer Request        69.68%       1.411ms        69.68%       1.411ms       1.411ms       2.624us         6.37%       2.624us       2.624us             1  
+                                    aten::empty_strided         1.51%      30.560us         1.51%      30.560us       5.093us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         9.99%     202.397us         9.99%     202.397us      22.489us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.88%      17.759us         1.14%      23.000us       2.556us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.46%       9.250us         0.46%       9.250us       0.617us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.51%      10.382us         0.51%      10.382us       3.461us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.48%       9.651us         0.48%       9.651us       3.217us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.33%       6.630us         0.40%       8.160us       2.720us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.025ms
+Self CUDA time total: 41.184us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D2048_S512_W2
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     352.830us       343.38%     352.830us     352.830us             1  
+                                            torch_eager         7.15%     144.983us        99.76%       2.023ms       2.023ms       0.000us         0.00%     108.768us     108.768us             1  
+                                           aten::conv1d         0.29%       5.781us         5.92%     120.074us      40.025us       0.000us         0.00%      70.432us      23.477us             3  
+                                      aten::convolution         0.47%       9.599us         5.64%     114.293us      38.098us       0.000us         0.00%      70.432us      23.477us             3  
+                                     aten::_convolution         1.14%      23.149us         5.16%     104.694us      34.898us       0.000us         0.00%      70.432us      23.477us             3  
+                                aten::_conv_depthwise2d         1.16%      23.581us         3.22%      65.212us      21.737us      70.432us        68.55%      70.432us      23.477us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      70.432us        68.55%      70.432us      23.477us             3  
+                                               aten::to         0.30%       6.111us        85.26%       1.729ms     288.085us       0.000us         0.00%      38.336us       6.389us             6  
+                                         aten::_to_copy         1.62%      32.820us        84.95%       1.722ms     287.067us       0.000us         0.00%      38.336us       6.389us             6  
+                                            aten::copy_         2.46%      49.781us        81.90%       1.660ms     276.745us      32.320us        31.45%      38.336us       6.389us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      17.696us        17.22%      17.696us       5.899us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      14.624us        14.23%      14.624us       4.875us             3  
+                                Activity Buffer Request        70.70%       1.433ms        70.70%       1.433ms       1.433ms       6.016us         5.85%       6.016us       6.016us             1  
+                                    aten::empty_strided         1.44%      29.111us         1.44%      29.111us       4.852us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         9.89%     200.449us         9.89%     200.449us      22.272us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.88%      17.943us         1.16%      23.512us       2.612us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.46%       9.330us         0.46%       9.330us       0.622us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.47%       9.471us         0.47%       9.471us       3.157us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.45%       9.050us         0.45%       9.050us       3.017us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.32%       6.391us         0.39%       7.911us       2.637us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.027ms
+Self CUDA time total: 102.752us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D2048_S512_W4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     330.363us       292.97%     330.363us     330.363us             1  
+                                            torch_eager        16.10%     118.634us        99.31%     731.955us     731.955us       0.000us         0.00%     118.781us     118.781us             1  
+                                           aten::conv1d         0.80%       5.881us        15.92%     117.344us      39.115us       0.000us         0.00%      80.541us      26.847us             3  
+                                      aten::convolution         1.32%       9.760us        15.12%     111.463us      37.154us       0.000us         0.00%      80.541us      26.847us             3  
+                                     aten::_convolution         3.06%      22.540us        13.80%     101.703us      33.901us       0.000us         0.00%      80.541us      26.847us             3  
+                                aten::_conv_depthwise2d         2.83%      20.841us         8.49%      62.593us      20.864us      80.541us        71.42%      80.541us      26.847us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      80.541us        71.42%      80.541us      26.847us             3  
+                                               aten::to         0.79%       5.790us        63.53%     468.255us      78.043us       0.000us         0.00%      38.240us       6.373us             6  
+                                         aten::_to_copy         3.21%      23.660us        62.75%     462.465us      77.078us       0.000us         0.00%      38.240us       6.373us             6  
+                                            aten::copy_         6.76%      49.831us        55.55%     409.415us      68.236us      32.224us        28.58%      38.240us       6.373us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      17.728us        15.72%      17.728us       5.909us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      14.496us        12.86%      14.496us       4.832us             3  
+                                Activity Buffer Request        25.24%     185.996us        25.24%     185.996us     185.996us       6.016us         5.33%       6.016us       6.016us             1  
+                                    aten::empty_strided         3.99%      29.390us         3.99%      29.390us       4.898us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        26.60%     196.028us        26.60%     196.028us      21.781us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.44%      17.960us         3.11%      22.951us       2.550us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.15%       8.461us         1.15%       8.461us       0.564us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.35%       9.931us         1.35%       9.931us       3.310us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.27%       9.381us         1.27%       9.381us       3.127us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.87%       6.430us         1.06%       7.840us       2.613us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 737.005us
+Self CUDA time total: 112.765us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D2048_S2048_W2
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager        22.21%     170.695us        99.32%     763.366us     763.366us       0.000us         0.00%     430.770us     430.770us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     416.723us       106.46%     416.723us     416.723us             1  
+                                           aten::conv1d         0.77%       5.951us        14.86%     114.225us      38.075us       0.000us         0.00%     251.288us      83.763us             3  
+                                      aten::convolution         1.24%       9.541us        14.09%     108.274us      36.091us       0.000us         0.00%     251.288us      83.763us             3  
+                                     aten::_convolution         2.83%      21.719us        12.85%      98.733us      32.911us       0.000us         0.00%     251.288us      83.763us             3  
+                                aten::_conv_depthwise2d         2.74%      21.061us         7.99%      61.422us      20.474us     251.288us        64.20%     251.288us      83.763us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     251.288us        64.20%     251.288us      83.763us             3  
+                                               aten::to         0.75%       5.750us        58.89%     452.676us      75.446us       0.000us         0.00%     179.482us      29.914us             6  
+                                         aten::_to_copy         3.02%      23.182us        58.15%     446.926us      74.488us       0.000us         0.00%     179.482us      29.914us             6  
+                                            aten::copy_         6.40%      49.211us        51.58%     396.473us      66.079us     140.155us        35.80%     179.482us      29.914us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     100.254us        25.61%     100.254us      33.418us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      39.901us        10.19%      39.901us      13.300us             3  
+                                Activity Buffer Request        22.72%     174.636us        22.72%     174.636us     174.636us      39.327us        10.05%      39.327us      39.327us             1  
+                                    aten::empty_strided         3.55%      27.271us         3.55%      27.271us       4.545us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        25.36%     194.936us        25.36%     194.936us      21.660us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.13%      16.381us         2.81%      21.611us       2.401us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.16%       8.880us         1.16%       8.880us       0.592us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.18%       9.091us         1.18%       9.091us       3.030us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.17%       8.960us         1.17%       8.960us       2.987us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.75%       5.770us         0.94%       7.191us       2.397us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 768.616us
+Self CUDA time total: 391.443us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D2048_S2048_W4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager        13.26%     117.114us        87.73%     774.557us     774.557us       0.000us         0.00%     486.014us     486.014us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     473.342us       105.98%     473.342us     473.342us             1  
+                                           aten::conv1d         0.63%       5.520us        13.02%     114.943us      38.314us       0.000us         0.00%     298.622us      99.541us             3  
+                                      aten::convolution         1.08%       9.570us        12.39%     109.423us      36.474us       0.000us         0.00%     298.622us      99.541us             3  
+                                     aten::_convolution         2.49%      22.001us        11.31%      99.853us      33.284us       0.000us         0.00%     298.622us      99.541us             3  
+                                aten::_conv_depthwise2d         2.40%      21.190us         7.05%      62.252us      20.751us     298.622us        66.86%     298.622us      99.541us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     298.622us        66.86%     298.622us      99.541us             3  
+                                               aten::to         0.65%       5.781us        58.29%     514.667us      85.778us       0.000us         0.00%     187.392us      31.232us             6  
+                                         aten::_to_copy         2.57%      22.699us        57.64%     508.886us      84.814us       0.000us         0.00%     187.392us      31.232us             6  
+                                            aten::copy_         5.62%      49.650us        51.80%     457.366us      76.228us     148.032us        33.14%     187.392us      31.232us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     108.256us        24.24%     108.256us      36.085us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      39.776us         8.91%      39.776us      13.259us             3  
+                                Activity Buffer Request        26.78%     236.449us        26.78%     236.449us     236.449us      39.360us         8.81%      39.360us      39.360us             1  
+                                    aten::empty_strided         3.26%      28.821us         3.26%      28.821us       4.804us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        22.01%     194.327us        22.01%     194.327us      21.592us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.00%      17.701us         2.60%      22.912us       2.546us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.01%       8.901us         1.01%       8.901us       0.593us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.05%       9.311us         1.05%       9.311us       3.104us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.98%       8.691us         0.98%       8.691us       2.897us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.65%       5.750us         0.82%       7.230us       2.410us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 882.890us
+Self CUDA time total: 446.654us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D64_S128_W2
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     324.122us      1734.57%     324.122us     324.122us             1  
+                                            torch_eager        15.60%     121.627us        99.38%     775.067us     775.067us       0.000us         0.00%      20.574us      20.574us             1  
+                                               aten::to         0.72%       5.589us        65.70%     512.356us      85.393us       0.000us         0.00%      13.343us       2.224us             6  
+                                         aten::_to_copy         2.88%      22.431us        64.98%     506.767us      84.461us       0.000us         0.00%      13.343us       2.224us             6  
+                                            aten::copy_         6.46%      50.411us        58.51%     456.326us      76.054us      11.455us        61.30%      13.343us       2.224us             6  
+                                           aten::conv1d         0.72%       5.580us        14.59%     113.823us      37.941us       0.000us         0.00%       7.231us       2.410us             3  
+                                      aten::convolution         1.19%       9.260us        13.88%     108.243us      36.081us       0.000us         0.00%       7.231us       2.410us             3  
+                                     aten::_convolution         2.87%      22.359us        12.69%      98.983us      32.994us       0.000us         0.00%       7.231us       2.410us             3  
+                                aten::_conv_depthwise2d         2.67%      20.840us         7.84%      61.153us      20.384us       7.231us        38.70%       7.231us       2.410us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.231us        38.70%       7.231us       2.410us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       5.856us        31.34%       5.856us       1.952us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.599us        29.96%       5.599us       1.866us             3  
+                                Activity Buffer Request        30.21%     235.608us        30.21%     235.608us     235.608us       1.888us        10.10%       1.888us       1.888us             1  
+                                    aten::empty_strided         3.59%      28.010us         3.59%      28.010us       4.668us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        24.63%     192.088us        24.63%     192.088us      21.343us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.29%      17.871us         2.95%      23.001us       2.556us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.13%       8.820us         1.13%       8.820us       0.588us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.21%       9.401us         1.21%       9.401us       3.134us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.17%       9.131us         1.17%       9.131us       3.044us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.75%       5.851us         0.94%       7.321us       2.440us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 779.877us
+Self CUDA time total: 18.686us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D64_S128_W4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     316.346us      1628.63%     316.346us     316.346us             1  
+                                            torch_eager        14.51%     117.604us        99.38%     805.188us     805.188us       0.000us         0.00%      21.312us      21.312us             1  
+                                               aten::to         0.69%       5.621us        67.40%     546.068us      91.011us       0.000us         0.00%      13.376us       2.229us             6  
+                                         aten::_to_copy         2.81%      22.789us        66.70%     540.447us      90.075us       0.000us         0.00%      13.376us       2.229us             6  
+                                            aten::copy_         5.89%      47.733us        60.20%     487.757us      81.293us      11.488us        59.14%      13.376us       2.229us             6  
+                                           aten::conv1d         0.69%       5.581us        14.11%     114.294us      38.098us       0.000us         0.00%       7.936us       2.645us             3  
+                                      aten::convolution         1.17%       9.520us        13.42%     108.713us      36.238us       0.000us         0.00%       7.936us       2.645us             3  
+                                     aten::_convolution         2.68%      21.682us        12.24%      99.193us      33.064us       0.000us         0.00%       7.936us       2.645us             3  
+                                aten::_conv_depthwise2d         2.64%      21.391us         7.61%      61.682us      20.561us       7.936us        40.86%       7.936us       2.645us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.936us        40.86%       7.936us       2.645us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       5.856us        30.15%       5.856us       1.952us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.632us        29.00%       5.632us       1.877us             3  
+                                Activity Buffer Request        33.53%     271.649us        33.53%     271.649us     271.649us       1.888us         9.72%       1.888us       1.888us             1  
+                                    aten::empty_strided         3.69%      29.901us         3.69%      29.901us       4.984us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        23.39%     189.555us        23.39%     189.555us      21.062us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.18%      17.698us         2.81%      22.771us       2.530us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.07%       8.674us         1.07%       8.674us       0.578us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.14%       9.260us         1.14%       9.260us       3.087us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.22%       9.851us         1.22%       9.851us       3.284us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.76%       6.120us         0.93%       7.530us       2.510us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 810.248us
+Self CUDA time total: 19.424us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D64_S512_W2
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     322.590us      1658.13%     322.590us     322.590us             1  
+                                            torch_eager         6.77%     135.447us        99.76%       1.996ms       1.996ms       0.000us         0.00%      21.631us      21.631us             1  
+                                               aten::to         0.29%       5.801us        85.87%       1.718ms     286.282us       0.000us         0.00%      14.400us       2.400us             6  
+                                         aten::_to_copy         1.16%      23.150us        85.58%       1.712ms     285.315us       0.000us         0.00%      14.400us       2.400us             6  
+                                            aten::copy_         2.46%      49.110us        82.93%       1.659ms     276.491us      12.224us        62.83%      14.400us       2.400us             6  
+                                           aten::conv1d         0.28%       5.690us         5.75%     114.953us      38.318us       0.000us         0.00%       7.231us       2.410us             3  
+                                      aten::convolution         0.48%       9.520us         5.46%     109.263us      36.421us       0.000us         0.00%       7.231us       2.410us             3  
+                                     aten::_convolution         1.10%      21.931us         4.99%      99.743us      33.248us       0.000us         0.00%       7.231us       2.410us             3  
+                                aten::_conv_depthwise2d         1.06%      21.231us         3.12%      62.372us      20.791us       7.231us        37.17%       7.231us       2.410us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.231us        37.17%       7.231us       2.410us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.304us        32.40%       6.304us       2.101us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.920us        30.43%       5.920us       1.973us             3  
+                                Activity Buffer Request        71.98%       1.440ms        71.98%       1.440ms       1.440ms       2.176us        11.18%       2.176us       2.176us             1  
+                                    aten::empty_strided         1.49%      29.791us         1.49%      29.791us       4.965us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         9.66%     193.277us         9.66%     193.277us      21.475us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.86%      17.278us         1.13%      22.539us       2.504us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.45%       9.001us         0.45%       9.001us       0.600us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.46%       9.281us         0.46%       9.281us       3.094us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.43%       8.570us         0.43%       8.570us       2.857us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.29%       5.760us         0.36%       7.200us       2.400us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.000ms
+Self CUDA time total: 19.455us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D64_S512_W4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     326.589us      1627.82%     326.589us     326.589us             1  
+                                            torch_eager         7.03%     140.275us        99.72%       1.991ms       1.991ms       0.000us         0.00%      22.207us      22.207us             1  
+                                               aten::to         0.30%       6.010us        85.45%       1.706ms     284.341us       0.000us         0.00%      14.304us       2.384us             6  
+                                         aten::_to_copy         1.18%      23.623us        85.15%       1.700ms     283.340us       0.000us         0.00%      14.304us       2.384us             6  
+                                            aten::copy_         2.42%      48.261us        82.53%       1.648ms     274.613us      12.160us        60.61%      14.304us       2.384us             6  
+                                           aten::conv1d         0.34%       6.690us         5.89%     117.664us      39.221us       0.000us         0.00%       7.903us       2.634us             3  
+                                      aten::convolution         0.46%       9.260us         5.56%     110.974us      36.991us       0.000us         0.00%       7.903us       2.634us             3  
+                                     aten::_convolution         1.15%      23.009us         5.09%     101.714us      33.905us       0.000us         0.00%       7.903us       2.634us             3  
+                                aten::_conv_depthwise2d         1.10%      21.970us         3.15%      62.812us      20.937us       7.903us        39.39%       7.903us       2.634us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.903us        39.39%       7.903us       2.634us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.240us        31.10%       6.240us       2.080us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.920us        29.51%       5.920us       1.973us             3  
+                                Activity Buffer Request        71.49%       1.427ms        71.49%       1.427ms       1.427ms       2.144us        10.69%       2.144us       2.144us             1  
+                                    aten::empty_strided         1.44%      28.740us         1.44%      28.740us       4.790us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         9.68%     193.308us         9.68%     193.308us      21.479us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.85%      16.982us         1.11%      22.224us       2.469us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.45%       8.892us         0.45%       8.892us       0.593us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.47%       9.420us         0.47%       9.420us       3.140us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.51%      10.100us         0.51%      10.100us       3.367us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.31%       6.130us         0.38%       7.650us       2.550us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.997ms
+Self CUDA time total: 20.063us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D64_S2048_W2
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     319.164us       887.36%     319.164us     319.164us             1  
+                                            torch_eager        15.26%     115.785us        99.38%     754.046us     754.046us       0.000us         0.00%      38.560us      38.560us             1  
+                                           aten::conv1d         0.72%       5.471us        14.90%     113.045us      37.682us       0.000us         0.00%      20.097us       6.699us             3  
+                                      aten::convolution         1.25%       9.510us        14.18%     107.574us      35.858us       0.000us         0.00%      20.097us       6.699us             3  
+                                     aten::_convolution         2.85%      21.590us        12.92%      98.064us      32.688us       0.000us         0.00%      20.097us       6.699us             3  
+                                aten::_conv_depthwise2d         2.82%      21.412us         8.06%      61.133us      20.378us      20.097us        55.87%      20.097us       6.699us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      20.097us        55.87%      20.097us       6.699us             3  
+                                               aten::to         0.74%       5.628us        65.55%     497.346us      82.891us       0.000us         0.00%      18.463us       3.077us             6  
+                                         aten::_to_copy         3.02%      22.942us        64.80%     491.718us      81.953us       0.000us         0.00%      18.463us       3.077us             6  
+                                            aten::copy_         6.50%      49.290us        57.91%     439.376us      73.229us      15.871us        44.13%      18.463us       3.077us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.447us        23.48%       8.447us       2.816us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.424us        20.64%       7.424us       2.475us             3  
+                                Activity Buffer Request        28.99%     219.958us        28.99%     219.958us     219.958us       2.592us         7.21%       2.592us       2.592us             1  
+                                    aten::empty_strided         3.87%      29.400us         3.87%      29.400us       4.900us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        25.31%     192.058us        25.31%     192.058us      21.340us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.43%      18.410us         3.09%      23.410us       2.601us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.12%       8.490us         1.12%       8.490us       0.566us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.20%       9.081us         1.20%       9.081us       3.027us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.15%       8.710us         1.15%       8.710us       2.903us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.77%       5.871us         0.96%       7.301us       2.434us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 758.766us
+Self CUDA time total: 35.968us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D64_S2048_W4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     318.166us       839.07%     318.166us     318.166us             1  
+                                            torch_eager        15.61%     115.614us        99.23%     735.056us     735.056us       0.000us         0.00%      40.512us      40.512us             1  
+                                           aten::conv1d         0.77%       5.689us        15.23%     112.833us      37.611us       0.000us         0.00%      22.206us       7.402us             3  
+                                      aten::convolution         1.28%       9.450us        14.46%     107.144us      35.715us       0.000us         0.00%      22.206us       7.402us             3  
+                                     aten::_convolution         2.90%      21.450us        13.19%      97.694us      32.565us       0.000us         0.00%      22.206us       7.402us             3  
+                                aten::_conv_depthwise2d         2.86%      21.190us         8.15%      60.352us      20.117us      22.206us        58.56%      22.206us       7.402us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      22.206us        58.56%      22.206us       7.402us             3  
+                                               aten::to         0.76%       5.621us        64.62%     478.657us      79.776us       0.000us         0.00%      18.306us       3.051us             6  
+                                         aten::_to_copy         3.14%      23.241us        63.86%     473.036us      78.839us       0.000us         0.00%      18.306us       3.051us             6  
+                                            aten::copy_         6.66%      49.364us        56.82%     420.865us      70.144us      15.713us        41.44%      18.306us       3.051us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.385us        22.11%       8.385us       2.795us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.328us        19.33%       7.328us       2.443us             3  
+                                Activity Buffer Request        27.11%     200.816us        27.11%     200.816us     200.816us       2.593us         6.84%       2.593us       2.593us             1  
+                                    aten::empty_strided         3.91%      28.930us         3.91%      28.930us       4.822us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        25.94%     192.117us        25.94%     192.117us      21.346us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.42%      17.932us         3.14%      23.222us       2.580us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.19%       8.781us         1.19%       8.781us       0.585us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.25%       9.270us         1.25%       9.270us       3.090us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.14%       8.460us         1.14%       8.460us       2.820us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.85%       6.280us         1.02%       7.591us       2.530us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 740.726us
+Self CUDA time total: 37.919us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D2048_S128_W2
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     321.366us       502.64%     321.366us     321.366us             1  
+                                            torch_eager        15.27%     113.396us        99.28%     737.126us     737.126us       0.000us         0.00%      68.031us      68.031us             1  
+                                           aten::conv1d         0.76%       5.670us        15.56%     115.503us      38.501us       0.000us         0.00%      41.567us      13.856us             3  
+                                      aten::convolution         1.28%       9.489us        14.79%     109.833us      36.611us       0.000us         0.00%      41.567us      13.856us             3  
+                                     aten::_convolution         3.08%      22.850us        13.52%     100.344us      33.448us       0.000us         0.00%      41.567us      13.856us             3  
+                                aten::_conv_depthwise2d         2.89%      21.483us         8.27%      61.383us      20.461us      41.567us        65.01%      41.567us      13.856us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      41.567us        65.01%      41.567us      13.856us             3  
+                                               aten::to         0.76%       5.660us        64.85%     481.506us      80.251us       0.000us         0.00%      26.464us       4.411us             6  
+                                         aten::_to_copy         3.08%      22.842us        64.09%     475.846us      79.308us       0.000us         0.00%      26.464us       4.411us             6  
+                                            aten::copy_         6.57%      48.752us        57.01%     423.304us      70.551us      22.368us        34.99%      26.464us       4.411us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      11.968us        18.72%      11.968us       3.989us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      10.400us        16.27%      10.400us       3.467us             3  
+                                Activity Buffer Request        27.27%     202.487us        27.27%     202.487us     202.487us       4.096us         6.41%       4.096us       4.096us             1  
+                                    aten::empty_strided         4.00%      29.700us         4.00%      29.700us       4.950us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        26.15%     194.125us        26.15%     194.125us      21.569us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.30%      17.061us         2.99%      22.191us       2.466us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.19%       8.800us         1.19%       8.800us       0.587us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.25%       9.280us         1.25%       9.280us       3.093us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.15%       8.560us         1.15%       8.560us       2.853us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.77%       5.741us         0.96%       7.151us       2.384us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 742.446us
+Self CUDA time total: 63.935us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D2048_S128_W4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     326.264us       468.36%     326.264us     326.264us             1  
+                                            torch_eager        14.61%     117.663us        99.38%     800.347us     800.347us       0.000us         0.00%      73.789us      73.789us             1  
+                                           aten::conv1d         0.75%       6.020us        14.38%     115.844us      38.615us       0.000us         0.00%      47.230us      15.743us             3  
+                                      aten::convolution         1.16%       9.351us        13.64%     109.824us      36.608us       0.000us         0.00%      47.230us      15.743us             3  
+                                     aten::_convolution         2.76%      22.250us        12.48%     100.473us      33.491us       0.000us         0.00%      47.230us      15.743us             3  
+                                aten::_conv_depthwise2d         2.71%      21.790us         7.76%      62.461us      20.820us      47.230us        67.80%      47.230us      15.743us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      47.230us        67.80%      47.230us      15.743us             3  
+                                               aten::to         0.71%       5.690us        66.94%     539.059us      89.843us       0.000us         0.00%      26.559us       4.426us             6  
+                                         aten::_to_copy         2.87%      23.082us        66.23%     533.369us      88.895us       0.000us         0.00%      26.559us       4.426us             6  
+                                            aten::copy_         6.12%      49.260us        59.73%     480.976us      80.163us      22.431us        32.20%      26.559us       4.426us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      12.000us        17.23%      12.000us       4.000us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      10.431us        14.97%      10.431us       3.477us             3  
+                                Activity Buffer Request        29.99%     241.509us        29.99%     241.509us     241.509us       4.128us         5.93%       4.128us       4.128us             1  
+                                    aten::empty_strided         3.64%      29.311us         3.64%      29.311us       4.885us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        26.37%     212.348us        26.37%     212.348us      23.594us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.22%      17.841us         2.86%      23.041us       2.560us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.09%       8.761us         1.09%       8.761us       0.584us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.16%       9.320us         1.16%       9.320us       3.107us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.14%       9.210us         1.14%       9.210us       3.070us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.77%       6.201us         0.95%       7.621us       2.540us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 805.317us
+Self CUDA time total: 69.661us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D2048_S512_W2
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     372.509us       200.60%     372.509us     372.509us             1  
+                                            torch_eager        16.32%     136.903us        99.36%     833.418us     833.418us       0.000us         0.00%     195.711us     195.711us             1  
+                                           aten::conv1d         0.67%       5.580us        15.45%     129.615us      43.205us       0.000us         0.00%     133.247us      44.416us             3  
+                                      aten::convolution         1.13%       9.510us        14.79%     124.035us      41.345us       0.000us         0.00%     133.247us      44.416us             3  
+                                     aten::_convolution         3.89%      32.633us        13.65%     114.525us      38.175us       0.000us         0.00%     133.247us      44.416us             3  
+                                aten::_conv_depthwise2d         2.50%      20.960us         7.87%      66.022us      22.007us     133.247us        71.76%     133.247us      44.416us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     133.247us        71.76%     133.247us      44.416us             3  
+                                               aten::to         0.72%       6.039us        64.27%     539.099us      89.850us       0.000us         0.00%      62.464us      10.411us             6  
+                                         aten::_to_copy         2.75%      23.094us        63.55%     533.060us      88.843us       0.000us         0.00%      62.464us      10.411us             6  
+                                            aten::copy_         5.97%      50.071us        57.15%     479.385us      79.897us      52.448us        28.24%      62.464us      10.411us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      29.504us        15.89%      29.504us       9.835us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.944us        12.36%      22.944us       7.648us             3  
+                                Activity Buffer Request        30.64%     256.969us        30.64%     256.969us     256.969us      10.016us         5.39%      10.016us      10.016us             1  
+                                    aten::empty_strided         3.65%      30.581us         3.65%      30.581us       5.097us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        23.59%     197.827us        23.59%     197.827us      21.981us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.16%      18.130us         2.81%      23.610us       2.623us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.09%       9.169us         1.09%       9.169us       0.611us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.19%       9.940us         1.19%       9.940us       3.313us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.15%       9.640us         1.15%       9.640us       3.213us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.72%       6.001us         0.89%       7.490us       2.497us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 838.778us
+Self CUDA time total: 185.695us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D2048_S512_W4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     368.701us       175.32%     368.701us     368.701us             1  
+                                            torch_eager        16.38%     138.724us        99.39%     841.559us     841.559us       0.000us         0.00%     224.383us     224.383us             1  
+                                           aten::conv1d         0.69%       5.870us        14.05%     118.945us      39.648us       0.000us         0.00%     154.015us      51.338us             3  
+                                      aten::convolution         1.19%      10.050us        13.35%     113.075us      37.692us       0.000us         0.00%     154.015us      51.338us             3  
+                                     aten::_convolution         2.68%      22.669us        12.17%     103.025us      34.342us       0.000us         0.00%     154.015us      51.338us             3  
+                                aten::_conv_depthwise2d         2.54%      21.472us         7.66%      64.883us      21.628us     154.015us        73.23%     154.015us      51.338us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     154.015us        73.23%     154.015us      51.338us             3  
+                                               aten::to         0.70%       5.911us        65.49%     554.540us      92.423us       0.000us         0.00%      70.368us      11.728us             6  
+                                         aten::_to_copy         2.70%      22.862us        64.79%     548.629us      91.438us       0.000us         0.00%      70.368us      11.728us             6  
+                                            aten::copy_         5.97%      50.511us        58.49%     495.276us      82.546us      56.288us        26.77%      70.368us      11.728us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      33.248us        15.81%      33.248us      11.083us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      23.040us        10.96%      23.040us       7.680us             3  
+                                Activity Buffer Request        32.21%     272.739us        32.21%     272.739us     272.739us      14.080us         6.70%      14.080us      14.080us             1  
+                                    aten::empty_strided         3.60%      30.491us         3.60%      30.491us       5.082us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        23.06%     195.277us        23.06%     195.277us      21.697us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.26%      19.134us         2.91%      24.623us       2.736us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.07%       9.019us         1.07%       9.019us       0.601us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.15%       9.700us         1.15%       9.700us       3.233us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.24%      10.460us         1.24%      10.460us       3.487us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.68%       5.760us         0.85%       7.180us       2.393us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 846.749us
+Self CUDA time total: 210.303us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D2048_S2048_W2
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         6.86%     124.525us        53.03%     963.064us     963.064us       0.000us         0.00%       1.524ms       1.524ms             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.423ms       100.42%       1.423ms       1.423ms             1  
+                                               aten::to         0.37%       6.781us        38.11%     692.105us     115.351us       0.000us         0.00%     827.798us     137.966us             6  
+                                         aten::_to_copy         1.62%      29.329us        37.74%     685.324us     114.221us       0.000us         0.00%     827.798us     137.966us             6  
+                                            aten::copy_         2.86%      52.014us        24.74%     449.228us      74.871us     721.111us        50.87%     827.798us     137.966us             6  
+                                           aten::conv1d         0.32%       5.800us         6.51%     118.154us      39.385us       0.000us         0.00%     696.313us     232.104us             3  
+                                      aten::convolution         0.55%       9.981us         6.19%     112.354us      37.451us       0.000us         0.00%     696.313us     232.104us             3  
+                                     aten::_convolution         1.25%      22.722us         5.64%     102.373us      34.124us       0.000us         0.00%     696.313us     232.104us             3  
+                                aten::_conv_depthwise2d         1.22%      22.241us         3.54%      64.332us      21.444us     696.313us        49.13%     696.313us     232.104us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     696.313us        49.13%     696.313us     232.104us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     411.194us        29.01%     411.194us     137.065us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     309.917us        21.86%     309.917us     103.306us             3  
+                                Activity Buffer Request        12.02%     218.207us        12.02%     218.207us     218.207us     106.687us         7.53%     106.687us     106.687us             1  
+                                    aten::empty_strided         1.97%      35.692us        11.39%     206.767us      34.461us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        11.11%     201.717us        11.11%     201.717us      22.413us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.96%      17.369us         1.26%      22.889us       2.543us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.51%       9.249us         0.51%       9.249us       0.617us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.50%       9.061us         0.50%       9.061us       3.020us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.57%      10.320us         0.57%      10.320us       3.440us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.33%       5.990us         0.41%       7.360us       2.453us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.816ms
+Self CUDA time total: 1.417ms
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D2048_S2048_W4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         6.33%     114.706us        41.01%     743.286us     743.286us       0.000us         0.00%       1.500ms       1.500ms             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.431ms       100.39%       1.431ms       1.431ms             1  
+                                               aten::to         0.32%       5.881us        26.81%     485.936us      80.989us       0.000us         0.00%     762.577us     127.096us             6  
+                                         aten::_to_copy         1.28%      23.109us        26.49%     480.055us      80.009us       0.000us         0.00%     762.577us     127.096us             6  
+                                            aten::copy_         2.74%      49.733us        23.67%     429.056us      71.509us     687.698us        48.25%     762.577us     127.096us             6  
+                                           aten::conv1d         0.31%       5.590us         6.38%     115.623us      38.541us       0.000us         0.00%     737.523us     245.841us             3  
+                                      aten::convolution         0.55%       9.990us         6.07%     110.033us      36.678us       0.000us         0.00%     737.523us     245.841us             3  
+                                     aten::_convolution         1.21%      21.900us         5.52%     100.043us      33.348us       0.000us         0.00%     737.523us     245.841us             3  
+                                aten::_conv_depthwise2d         1.16%      21.072us         3.45%      62.453us      20.818us     737.523us        51.75%     737.523us     245.841us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     737.523us        51.75%     737.523us     245.841us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     400.247us        28.08%     400.247us     133.416us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     287.451us        20.17%     287.451us      95.817us             3  
+                                Activity Buffer Request        11.32%     205.227us        11.32%     205.227us     205.227us      74.879us         5.25%      74.879us      74.879us             1  
+                                    aten::empty_strided         1.54%      27.890us         1.54%      27.890us       4.648us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        10.89%     197.296us        10.89%     197.296us      21.922us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.95%      17.181us         1.23%      22.321us       2.480us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.49%       8.961us         0.49%       8.961us       0.597us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.50%       9.050us         0.50%       9.050us       3.017us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.50%       9.131us         0.50%       9.131us       3.044us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.32%       5.870us         0.41%       7.390us       2.463us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.812ms
+Self CUDA time total: 1.425ms
+
+
+impl                     wl                  p50(ms)  ok
+torch_eager              cuda_B2_D2048_S128_W2     0.09  True
+torch_eager              cuda_B2_D2048_S128_W4     0.08  True
+torch_eager              cuda_B2_D2048_S2048_W2     0.15  True
+torch_eager              cuda_B2_D2048_S2048_W4     0.16  True
+torch_eager              cuda_B2_D2048_S512_W2     0.08  True
+torch_eager              cuda_B2_D2048_S512_W4     0.08  True
+torch_eager              cuda_B2_D64_S128_W2     0.07  True
+torch_eager              cuda_B2_D64_S128_W4     0.09  True
+torch_eager              cuda_B2_D64_S2048_W2     0.09  True
+torch_eager              cuda_B2_D64_S2048_W4     0.08  True
+torch_eager              cuda_B2_D64_S512_W2     0.09  True
+torch_eager              cuda_B2_D64_S512_W4     0.08  True
+torch_eager              cuda_B4_D2048_S128_W2     0.08  True
+torch_eager              cuda_B4_D2048_S128_W4     0.08  True
+torch_eager              cuda_B4_D2048_S2048_W2     0.49  True
+torch_eager              cuda_B4_D2048_S2048_W4     0.50  True
+torch_eager              cuda_B4_D2048_S512_W2     0.09  True
+torch_eager              cuda_B4_D2048_S512_W4     0.10  True
+torch_eager              cuda_B4_D64_S128_W2     0.08  True
+torch_eager              cuda_B4_D64_S128_W4     0.08  True
+torch_eager              cuda_B4_D64_S2048_W2     0.08  True
+torch_eager              cuda_B4_D64_S2048_W4     0.08  True
+torch_eager              cuda_B4_D64_S512_W2     0.08  True
+torch_eager              cuda_B4_D64_S512_W4     0.08  True
+
+
+

Artifacts:

+causal_conv1d.jsonl +
+
+
+