Running causal_conv1d benchmark on cuda with 24 workloads.
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D64_S128_W2
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 448.254us 2311.66% 448.254us 448.254us 1
+ torch_eager 10.53% 223.197us 99.60% 2.112ms 2.112ms 0.000us 0.00% 21.727us 21.727us 1
+ aten::to 0.57% 12.032us 79.33% 1.682ms 280.390us 0.000us 0.00% 14.304us 2.384us 6
+ aten::_to_copy 1.82% 38.532us 78.77% 1.670ms 278.384us 0.000us 0.00% 14.304us 2.384us 6
+ aten::copy_ 2.94% 62.272us 74.35% 1.577ms 262.784us 11.968us 61.72% 14.304us 2.384us 6
+ aten::conv1d 0.36% 7.640us 7.60% 161.165us 53.722us 0.000us 0.00% 7.423us 2.474us 3
+ aten::convolution 0.68% 14.400us 7.24% 153.525us 51.175us 0.000us 0.00% 7.423us 2.474us 3
+ aten::_convolution 1.64% 34.820us 6.56% 139.125us 46.375us 0.000us 0.00% 7.423us 2.474us 3
+ aten::_conv_depthwise2d 1.64% 34.779us 4.03% 85.503us 28.501us 7.423us 38.28% 7.423us 2.474us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.423us 38.28% 7.423us 2.474us 3
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.304us 32.51% 6.304us 2.101us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.664us 29.21% 5.664us 1.888us 3
+ Activity Buffer Request 68.27% 1.448ms 68.27% 1.448ms 1.448ms 2.336us 12.05% 2.336us 2.336us 1
+ aten::empty_strided 2.60% 55.071us 2.60% 55.071us 9.178us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 4.35% 92.254us 4.35% 92.254us 10.250us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 1.39% 29.522us 1.76% 37.262us 4.140us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.59% 12.410us 0.59% 12.410us 0.827us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.52% 10.960us 0.52% 10.960us 3.653us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.67% 14.291us 0.67% 14.291us 4.764us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.39% 8.321us 0.47% 9.881us 3.294us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 2.121ms
+Self CUDA time total: 19.391us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D64_S128_W4
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 334.942us 1707.32% 334.942us 334.942us 1
+ torch_eager 7.85% 148.604us 99.72% 1.887ms 1.887ms 0.000us 0.00% 21.731us 21.731us 1
+ aten::to 0.32% 6.111us 83.97% 1.589ms 264.793us 0.000us 0.00% 13.731us 2.288us 6
+ aten::_to_copy 1.27% 24.112us 83.64% 1.583ms 263.774us 0.000us 0.00% 13.731us 2.288us 6
+ aten::copy_ 2.68% 50.691us 80.81% 1.529ms 254.829us 11.618us 59.22% 13.731us 2.288us 6
+ aten::conv1d 0.29% 5.540us 6.41% 121.373us 40.458us 0.000us 0.00% 8.000us 2.667us 3
+ aten::convolution 0.50% 9.420us 6.12% 115.833us 38.611us 0.000us 0.00% 8.000us 2.667us 3
+ aten::_convolution 1.30% 24.670us 5.62% 106.413us 35.471us 0.000us 0.00% 8.000us 2.667us 3
+ aten::_conv_depthwise2d 1.20% 22.792us 3.44% 65.133us 21.711us 8.000us 40.78% 8.000us 2.667us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 8.000us 40.78% 8.000us 2.667us 3
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.049us 30.83% 6.049us 2.016us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.569us 28.39% 5.569us 1.856us 3
+ Activity Buffer Request 75.63% 1.431ms 75.63% 1.431ms 1.431ms 2.113us 10.77% 2.113us 2.113us 1
+ aten::empty_strided 1.56% 29.560us 1.56% 29.560us 4.927us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 3.72% 70.343us 3.72% 70.343us 7.816us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 0.90% 17.091us 1.18% 22.301us 2.478us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.48% 9.090us 0.48% 9.090us 0.606us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.50% 9.490us 0.50% 9.490us 3.163us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.52% 9.830us 0.52% 9.830us 3.277us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.34% 6.400us 0.42% 8.020us 2.673us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.892ms
+Self CUDA time total: 19.618us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D64_S512_W2
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 333.691us 1797.81% 333.691us 333.691us 1
+ torch_eager 7.79% 146.606us 99.69% 1.876ms 1.876ms 0.000us 0.00% 20.481us 20.481us 1
+ aten::to 0.31% 5.760us 84.09% 1.582ms 263.706us 0.000us 0.00% 13.569us 2.262us 6
+ aten::_to_copy 1.25% 23.550us 83.79% 1.576ms 262.746us 0.000us 0.00% 13.569us 2.262us 6
+ aten::copy_ 2.67% 50.153us 80.95% 1.523ms 253.847us 11.649us 62.76% 13.569us 2.262us 6
+ aten::conv1d 0.31% 5.780us 6.33% 119.033us 39.678us 0.000us 0.00% 6.912us 2.304us 3
+ aten::convolution 0.52% 9.800us 6.02% 113.253us 37.751us 0.000us 0.00% 6.912us 2.304us 3
+ aten::_convolution 1.28% 24.000us 5.50% 103.453us 34.484us 0.000us 0.00% 6.912us 2.304us 3
+ aten::_conv_depthwise2d 1.15% 21.640us 3.37% 63.473us 21.158us 6.912us 37.24% 6.912us 2.304us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 6.912us 37.24% 6.912us 2.304us 3
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 5.953us 32.07% 5.953us 1.984us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.696us 30.69% 5.696us 1.899us 3
+ Activity Buffer Request 75.77% 1.426ms 75.77% 1.426ms 1.426ms 1.920us 10.34% 1.920us 1.920us 1
+ aten::empty_strided 1.59% 29.840us 1.59% 29.840us 4.973us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 3.79% 71.241us 3.79% 71.241us 7.916us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 0.92% 17.220us 1.19% 22.362us 2.485us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.47% 8.782us 0.47% 8.782us 0.585us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.49% 9.312us 0.49% 9.312us 3.104us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.46% 8.581us 0.46% 8.581us 2.860us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.33% 6.290us 0.41% 7.740us 2.580us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.882ms
+Self CUDA time total: 18.561us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D64_S512_W4
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 341.628us 1741.67% 341.628us 341.628us 1
+ torch_eager 6.79% 135.276us 99.76% 1.989ms 1.989ms 0.000us 0.00% 21.759us 21.759us 1
+ aten::to 0.31% 6.091us 85.44% 1.703ms 283.911us 0.000us 0.00% 14.111us 2.352us 6
+ aten::_to_copy 1.20% 23.892us 85.13% 1.697ms 282.896us 0.000us 0.00% 14.111us 2.352us 6
+ aten::copy_ 2.47% 49.180us 82.37% 1.642ms 273.716us 11.967us 61.01% 14.111us 2.352us 6
+ aten::conv1d 0.29% 5.740us 6.09% 121.414us 40.471us 0.000us 0.00% 7.648us 2.549us 3
+ aten::convolution 0.55% 11.061us 5.80% 115.674us 38.558us 0.000us 0.00% 7.648us 2.549us 3
+ aten::_convolution 1.19% 23.780us 5.25% 104.613us 34.871us 0.000us 0.00% 7.648us 2.549us 3
+ aten::_conv_depthwise2d 1.14% 22.750us 3.26% 64.953us 21.651us 7.648us 38.99% 7.648us 2.549us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.648us 38.99% 7.648us 2.549us 3
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.175us 31.48% 6.175us 2.058us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.792us 29.53% 5.792us 1.931us 3
+ Activity Buffer Request 68.82% 1.372ms 68.82% 1.372ms 1.372ms 2.144us 10.93% 2.144us 2.144us 1
+ aten::empty_strided 1.56% 31.190us 1.56% 31.190us 5.198us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 12.22% 243.619us 12.22% 243.619us 27.069us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 0.88% 17.629us 1.14% 22.660us 2.518us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.44% 8.782us 0.44% 8.782us 0.585us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.48% 9.630us 0.48% 9.630us 3.210us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.50% 9.941us 0.50% 9.941us 3.314us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.34% 6.720us 0.41% 8.110us 2.703us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.994ms
+Self CUDA time total: 19.615us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D64_S2048_W2
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 341.213us 1403.01% 341.213us 341.213us 1
+ torch_eager 7.36% 148.867us 99.73% 2.016ms 2.016ms 0.000us 0.00% 26.560us 26.560us 1
+ aten::to 0.30% 6.030us 84.88% 1.716ms 285.962us 0.000us 0.00% 15.168us 2.528us 6
+ aten::_to_copy 1.20% 24.229us 84.58% 1.710ms 284.956us 0.000us 0.00% 15.168us 2.528us 6
+ aten::copy_ 2.44% 49.414us 81.85% 1.655ms 275.782us 12.928us 53.16% 15.168us 2.528us 6
+ aten::conv1d 0.28% 5.730us 5.99% 121.174us 40.391us 0.000us 0.00% 11.392us 3.797us 3
+ aten::convolution 0.47% 9.480us 5.71% 115.444us 38.481us 0.000us 0.00% 11.392us 3.797us 3
+ aten::_convolution 1.14% 23.073us 5.24% 105.964us 35.321us 0.000us 0.00% 11.392us 3.797us 3
+ aten::_conv_depthwise2d 1.05% 21.189us 3.24% 65.411us 21.804us 11.392us 46.84% 11.392us 3.797us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 11.392us 46.84% 11.392us 3.797us 3
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.592us 27.11% 6.592us 2.197us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.336us 26.05% 6.336us 2.112us 3
+ Activity Buffer Request 70.12% 1.417ms 70.12% 1.417ms 1.417ms 2.240us 9.21% 2.240us 2.240us 1
+ aten::empty_strided 1.52% 30.820us 1.52% 30.820us 5.137us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 10.45% 211.347us 10.45% 211.347us 23.483us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 0.95% 19.208us 1.23% 24.829us 2.759us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.46% 9.241us 0.46% 9.241us 0.616us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.47% 9.482us 0.47% 9.482us 3.161us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.55% 11.190us 0.55% 11.190us 3.730us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.34% 6.961us 0.41% 8.361us 2.787us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 2.022ms
+Self CUDA time total: 24.320us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D64_S2048_W4
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 334.330us 1285.14% 334.330us 334.330us 1
+ torch_eager 7.10% 143.875us 99.74% 2.020ms 2.020ms 0.000us 0.00% 28.255us 28.255us 1
+ aten::to 0.28% 5.680us 85.25% 1.727ms 287.810us 0.000us 0.00% 15.232us 2.539us 6
+ aten::_to_copy 1.18% 23.873us 84.97% 1.721ms 286.863us 0.000us 0.00% 15.232us 2.539us 6
+ aten::copy_ 2.45% 49.640us 82.36% 1.668ms 278.038us 12.992us 49.94% 15.232us 2.539us 6
+ aten::conv1d 0.29% 5.889us 5.94% 120.414us 40.138us 0.000us 0.00% 13.023us 4.341us 3
+ aten::convolution 0.46% 9.401us 5.65% 114.525us 38.175us 0.000us 0.00% 13.023us 4.341us 3
+ aten::_convolution 1.22% 24.611us 5.19% 105.124us 35.041us 0.000us 0.00% 13.023us 4.341us 3
+ aten::_conv_depthwise2d 1.06% 21.480us 3.19% 64.562us 21.521us 13.023us 50.06% 13.023us 4.341us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 13.023us 50.06% 13.023us 4.341us 3
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.624us 25.46% 6.624us 2.208us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.368us 24.48% 6.368us 2.123us 3
+ Activity Buffer Request 71.17% 1.442ms 71.17% 1.442ms 1.442ms 2.240us 8.61% 2.240us 2.240us 1
+ aten::empty_strided 1.44% 29.082us 1.44% 29.082us 4.847us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 9.85% 199.548us 9.85% 199.548us 22.172us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 0.91% 18.470us 1.17% 23.650us 2.628us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.44% 8.970us 0.44% 8.970us 0.598us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.51% 10.400us 0.51% 10.400us 3.467us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.50% 10.200us 0.50% 10.200us 3.400us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.30% 6.091us 0.38% 7.621us 2.540us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 2.026ms
+Self CUDA time total: 26.015us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D2048_S128_W2
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 340.315us 888.50% 340.315us 340.315us 1
+ torch_eager 7.29% 147.016us 99.74% 2.012ms 2.012ms 0.000us 0.00% 40.894us 40.894us 1
+ aten::conv1d 0.29% 5.920us 5.91% 119.264us 39.755us 0.000us 0.00% 22.496us 7.499us 3
+ aten::convolution 0.47% 9.411us 5.62% 113.344us 37.781us 0.000us 0.00% 22.496us 7.499us 3
+ aten::_convolution 1.19% 23.960us 5.15% 103.933us 34.644us 0.000us 0.00% 22.496us 7.499us 3
+ aten::_conv_depthwise2d 1.11% 22.310us 3.18% 64.143us 21.381us 22.496us 58.73% 22.496us 7.499us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 22.496us 58.73% 22.496us 7.499us 3
+ aten::to 0.29% 5.851us 85.12% 1.717ms 286.238us 0.000us 0.00% 18.398us 3.066us 6
+ aten::_to_copy 1.17% 23.549us 84.83% 1.712ms 285.263us 0.000us 0.00% 18.398us 3.066us 6
+ aten::copy_ 2.43% 48.960us 82.11% 1.657ms 276.121us 15.806us 41.27% 18.398us 3.066us 6
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.416us 21.97% 8.416us 2.805us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.390us 19.29% 7.390us 2.463us 3
+ Activity Buffer Request 70.87% 1.430ms 70.87% 1.430ms 1.430ms 2.592us 6.77% 2.592us 2.592us 1
+ aten::empty_strided 1.55% 31.301us 1.55% 31.301us 5.217us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 9.91% 199.938us 9.91% 199.938us 22.215us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 0.87% 17.540us 1.13% 22.711us 2.523us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.44% 8.912us 0.44% 8.912us 0.594us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.47% 9.390us 0.47% 9.390us 3.130us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.51% 10.361us 0.51% 10.361us 3.454us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.30% 6.100us 0.37% 7.550us 2.517us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 2.018ms
+Self CUDA time total: 38.302us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D2048_S128_W4
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 363.388us 882.35% 363.388us 363.388us 1
+ torch_eager 8.20% 165.958us 99.73% 2.020ms 2.020ms 0.000us 0.00% 43.808us 43.808us 1
+ aten::conv1d 0.32% 6.510us 6.06% 122.733us 40.911us 0.000us 0.00% 25.408us 8.469us 3
+ aten::convolution 0.48% 9.730us 5.74% 116.223us 38.741us 0.000us 0.00% 25.408us 8.469us 3
+ aten::_convolution 1.17% 23.611us 5.26% 106.493us 35.498us 0.000us 0.00% 25.408us 8.469us 3
+ aten::_conv_depthwise2d 1.11% 22.549us 3.28% 66.422us 22.141us 25.408us 61.69% 25.408us 8.469us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 25.408us 61.69% 25.408us 8.469us 3
+ aten::to 0.31% 6.220us 83.98% 1.701ms 283.450us 0.000us 0.00% 18.400us 3.067us 6
+ aten::_to_copy 1.16% 23.591us 83.68% 1.694ms 282.413us 0.000us 0.00% 18.400us 3.067us 6
+ aten::copy_ 2.51% 50.781us 81.00% 1.640ms 273.388us 15.776us 38.31% 18.400us 3.067us 6
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.352us 20.28% 8.352us 2.784us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.424us 18.03% 7.424us 2.475us 3
+ Activity Buffer Request 69.68% 1.411ms 69.68% 1.411ms 1.411ms 2.624us 6.37% 2.624us 2.624us 1
+ aten::empty_strided 1.51% 30.560us 1.51% 30.560us 5.093us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 9.99% 202.397us 9.99% 202.397us 22.489us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 0.88% 17.759us 1.14% 23.000us 2.556us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.46% 9.250us 0.46% 9.250us 0.617us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.51% 10.382us 0.51% 10.382us 3.461us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.48% 9.651us 0.48% 9.651us 3.217us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.33% 6.630us 0.40% 8.160us 2.720us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 2.025ms
+Self CUDA time total: 41.184us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D2048_S512_W2
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 352.830us 343.38% 352.830us 352.830us 1
+ torch_eager 7.15% 144.983us 99.76% 2.023ms 2.023ms 0.000us 0.00% 108.768us 108.768us 1
+ aten::conv1d 0.29% 5.781us 5.92% 120.074us 40.025us 0.000us 0.00% 70.432us 23.477us 3
+ aten::convolution 0.47% 9.599us 5.64% 114.293us 38.098us 0.000us 0.00% 70.432us 23.477us 3
+ aten::_convolution 1.14% 23.149us 5.16% 104.694us 34.898us 0.000us 0.00% 70.432us 23.477us 3
+ aten::_conv_depthwise2d 1.16% 23.581us 3.22% 65.212us 21.737us 70.432us 68.55% 70.432us 23.477us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 70.432us 68.55% 70.432us 23.477us 3
+ aten::to 0.30% 6.111us 85.26% 1.729ms 288.085us 0.000us 0.00% 38.336us 6.389us 6
+ aten::_to_copy 1.62% 32.820us 84.95% 1.722ms 287.067us 0.000us 0.00% 38.336us 6.389us 6
+ aten::copy_ 2.46% 49.781us 81.90% 1.660ms 276.745us 32.320us 31.45% 38.336us 6.389us 6
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 17.696us 17.22% 17.696us 5.899us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 14.624us 14.23% 14.624us 4.875us 3
+ Activity Buffer Request 70.70% 1.433ms 70.70% 1.433ms 1.433ms 6.016us 5.85% 6.016us 6.016us 1
+ aten::empty_strided 1.44% 29.111us 1.44% 29.111us 4.852us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 9.89% 200.449us 9.89% 200.449us 22.272us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 0.88% 17.943us 1.16% 23.512us 2.612us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.46% 9.330us 0.46% 9.330us 0.622us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.47% 9.471us 0.47% 9.471us 3.157us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.45% 9.050us 0.45% 9.050us 3.017us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.32% 6.391us 0.39% 7.911us 2.637us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 2.027ms
+Self CUDA time total: 102.752us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D2048_S512_W4
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 330.363us 292.97% 330.363us 330.363us 1
+ torch_eager 16.10% 118.634us 99.31% 731.955us 731.955us 0.000us 0.00% 118.781us 118.781us 1
+ aten::conv1d 0.80% 5.881us 15.92% 117.344us 39.115us 0.000us 0.00% 80.541us 26.847us 3
+ aten::convolution 1.32% 9.760us 15.12% 111.463us 37.154us 0.000us 0.00% 80.541us 26.847us 3
+ aten::_convolution 3.06% 22.540us 13.80% 101.703us 33.901us 0.000us 0.00% 80.541us 26.847us 3
+ aten::_conv_depthwise2d 2.83% 20.841us 8.49% 62.593us 20.864us 80.541us 71.42% 80.541us 26.847us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 80.541us 71.42% 80.541us 26.847us 3
+ aten::to 0.79% 5.790us 63.53% 468.255us 78.043us 0.000us 0.00% 38.240us 6.373us 6
+ aten::_to_copy 3.21% 23.660us 62.75% 462.465us 77.078us 0.000us 0.00% 38.240us 6.373us 6
+ aten::copy_ 6.76% 49.831us 55.55% 409.415us 68.236us 32.224us 28.58% 38.240us 6.373us 6
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 17.728us 15.72% 17.728us 5.909us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 14.496us 12.86% 14.496us 4.832us 3
+ Activity Buffer Request 25.24% 185.996us 25.24% 185.996us 185.996us 6.016us 5.33% 6.016us 6.016us 1
+ aten::empty_strided 3.99% 29.390us 3.99% 29.390us 4.898us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 26.60% 196.028us 26.60% 196.028us 21.781us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 2.44% 17.960us 3.11% 22.951us 2.550us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 1.15% 8.461us 1.15% 8.461us 0.564us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.35% 9.931us 1.35% 9.931us 3.310us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 1.27% 9.381us 1.27% 9.381us 3.127us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.87% 6.430us 1.06% 7.840us 2.613us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 737.005us
+Self CUDA time total: 112.765us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D2048_S2048_W2
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 22.21% 170.695us 99.32% 763.366us 763.366us 0.000us 0.00% 430.770us 430.770us 1
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 416.723us 106.46% 416.723us 416.723us 1
+ aten::conv1d 0.77% 5.951us 14.86% 114.225us 38.075us 0.000us 0.00% 251.288us 83.763us 3
+ aten::convolution 1.24% 9.541us 14.09% 108.274us 36.091us 0.000us 0.00% 251.288us 83.763us 3
+ aten::_convolution 2.83% 21.719us 12.85% 98.733us 32.911us 0.000us 0.00% 251.288us 83.763us 3
+ aten::_conv_depthwise2d 2.74% 21.061us 7.99% 61.422us 20.474us 251.288us 64.20% 251.288us 83.763us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 251.288us 64.20% 251.288us 83.763us 3
+ aten::to 0.75% 5.750us 58.89% 452.676us 75.446us 0.000us 0.00% 179.482us 29.914us 6
+ aten::_to_copy 3.02% 23.182us 58.15% 446.926us 74.488us 0.000us 0.00% 179.482us 29.914us 6
+ aten::copy_ 6.40% 49.211us 51.58% 396.473us 66.079us 140.155us 35.80% 179.482us 29.914us 6
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 100.254us 25.61% 100.254us 33.418us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 39.901us 10.19% 39.901us 13.300us 3
+ Activity Buffer Request 22.72% 174.636us 22.72% 174.636us 174.636us 39.327us 10.05% 39.327us 39.327us 1
+ aten::empty_strided 3.55% 27.271us 3.55% 27.271us 4.545us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 25.36% 194.936us 25.36% 194.936us 21.660us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 2.13% 16.381us 2.81% 21.611us 2.401us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 1.16% 8.880us 1.16% 8.880us 0.592us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.18% 9.091us 1.18% 9.091us 3.030us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 1.17% 8.960us 1.17% 8.960us 2.987us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.75% 5.770us 0.94% 7.191us 2.397us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 768.616us
+Self CUDA time total: 391.443us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D2048_S2048_W4
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 13.26% 117.114us 87.73% 774.557us 774.557us 0.000us 0.00% 486.014us 486.014us 1
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 473.342us 105.98% 473.342us 473.342us 1
+ aten::conv1d 0.63% 5.520us 13.02% 114.943us 38.314us 0.000us 0.00% 298.622us 99.541us 3
+ aten::convolution 1.08% 9.570us 12.39% 109.423us 36.474us 0.000us 0.00% 298.622us 99.541us 3
+ aten::_convolution 2.49% 22.001us 11.31% 99.853us 33.284us 0.000us 0.00% 298.622us 99.541us 3
+ aten::_conv_depthwise2d 2.40% 21.190us 7.05% 62.252us 20.751us 298.622us 66.86% 298.622us 99.541us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 298.622us 66.86% 298.622us 99.541us 3
+ aten::to 0.65% 5.781us 58.29% 514.667us 85.778us 0.000us 0.00% 187.392us 31.232us 6
+ aten::_to_copy 2.57% 22.699us 57.64% 508.886us 84.814us 0.000us 0.00% 187.392us 31.232us 6
+ aten::copy_ 5.62% 49.650us 51.80% 457.366us 76.228us 148.032us 33.14% 187.392us 31.232us 6
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 108.256us 24.24% 108.256us 36.085us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 39.776us 8.91% 39.776us 13.259us 3
+ Activity Buffer Request 26.78% 236.449us 26.78% 236.449us 236.449us 39.360us 8.81% 39.360us 39.360us 1
+ aten::empty_strided 3.26% 28.821us 3.26% 28.821us 4.804us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 22.01% 194.327us 22.01% 194.327us 21.592us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 2.00% 17.701us 2.60% 22.912us 2.546us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 1.01% 8.901us 1.01% 8.901us 0.593us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.05% 9.311us 1.05% 9.311us 3.104us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.98% 8.691us 0.98% 8.691us 2.897us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.65% 5.750us 0.82% 7.230us 2.410us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 882.890us
+Self CUDA time total: 446.654us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D64_S128_W2
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 324.122us 1734.57% 324.122us 324.122us 1
+ torch_eager 15.60% 121.627us 99.38% 775.067us 775.067us 0.000us 0.00% 20.574us 20.574us 1
+ aten::to 0.72% 5.589us 65.70% 512.356us 85.393us 0.000us 0.00% 13.343us 2.224us 6
+ aten::_to_copy 2.88% 22.431us 64.98% 506.767us 84.461us 0.000us 0.00% 13.343us 2.224us 6
+ aten::copy_ 6.46% 50.411us 58.51% 456.326us 76.054us 11.455us 61.30% 13.343us 2.224us 6
+ aten::conv1d 0.72% 5.580us 14.59% 113.823us 37.941us 0.000us 0.00% 7.231us 2.410us 3
+ aten::convolution 1.19% 9.260us 13.88% 108.243us 36.081us 0.000us 0.00% 7.231us 2.410us 3
+ aten::_convolution 2.87% 22.359us 12.69% 98.983us 32.994us 0.000us 0.00% 7.231us 2.410us 3
+ aten::_conv_depthwise2d 2.67% 20.840us 7.84% 61.153us 20.384us 7.231us 38.70% 7.231us 2.410us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.231us 38.70% 7.231us 2.410us 3
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 5.856us 31.34% 5.856us 1.952us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.599us 29.96% 5.599us 1.866us 3
+ Activity Buffer Request 30.21% 235.608us 30.21% 235.608us 235.608us 1.888us 10.10% 1.888us 1.888us 1
+ aten::empty_strided 3.59% 28.010us 3.59% 28.010us 4.668us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 24.63% 192.088us 24.63% 192.088us 21.343us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 2.29% 17.871us 2.95% 23.001us 2.556us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 1.13% 8.820us 1.13% 8.820us 0.588us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.21% 9.401us 1.21% 9.401us 3.134us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 1.17% 9.131us 1.17% 9.131us 3.044us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.75% 5.851us 0.94% 7.321us 2.440us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 779.877us
+Self CUDA time total: 18.686us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D64_S128_W4
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 316.346us 1628.63% 316.346us 316.346us 1
+ torch_eager 14.51% 117.604us 99.38% 805.188us 805.188us 0.000us 0.00% 21.312us 21.312us 1
+ aten::to 0.69% 5.621us 67.40% 546.068us 91.011us 0.000us 0.00% 13.376us 2.229us 6
+ aten::_to_copy 2.81% 22.789us 66.70% 540.447us 90.075us 0.000us 0.00% 13.376us 2.229us 6
+ aten::copy_ 5.89% 47.733us 60.20% 487.757us 81.293us 11.488us 59.14% 13.376us 2.229us 6
+ aten::conv1d 0.69% 5.581us 14.11% 114.294us 38.098us 0.000us 0.00% 7.936us 2.645us 3
+ aten::convolution 1.17% 9.520us 13.42% 108.713us 36.238us 0.000us 0.00% 7.936us 2.645us 3
+ aten::_convolution 2.68% 21.682us 12.24% 99.193us 33.064us 0.000us 0.00% 7.936us 2.645us 3
+ aten::_conv_depthwise2d 2.64% 21.391us 7.61% 61.682us 20.561us 7.936us 40.86% 7.936us 2.645us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.936us 40.86% 7.936us 2.645us 3
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 5.856us 30.15% 5.856us 1.952us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.632us 29.00% 5.632us 1.877us 3
+ Activity Buffer Request 33.53% 271.649us 33.53% 271.649us 271.649us 1.888us 9.72% 1.888us 1.888us 1
+ aten::empty_strided 3.69% 29.901us 3.69% 29.901us 4.984us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 23.39% 189.555us 23.39% 189.555us 21.062us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 2.18% 17.698us 2.81% 22.771us 2.530us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 1.07% 8.674us 1.07% 8.674us 0.578us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.14% 9.260us 1.14% 9.260us 3.087us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 1.22% 9.851us 1.22% 9.851us 3.284us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.76% 6.120us 0.93% 7.530us 2.510us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 810.248us
+Self CUDA time total: 19.424us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D64_S512_W2
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 322.590us 1658.13% 322.590us 322.590us 1
+ torch_eager 6.77% 135.447us 99.76% 1.996ms 1.996ms 0.000us 0.00% 21.631us 21.631us 1
+ aten::to 0.29% 5.801us 85.87% 1.718ms 286.282us 0.000us 0.00% 14.400us 2.400us 6
+ aten::_to_copy 1.16% 23.150us 85.58% 1.712ms 285.315us 0.000us 0.00% 14.400us 2.400us 6
+ aten::copy_ 2.46% 49.110us 82.93% 1.659ms 276.491us 12.224us 62.83% 14.400us 2.400us 6
+ aten::conv1d 0.28% 5.690us 5.75% 114.953us 38.318us 0.000us 0.00% 7.231us 2.410us 3
+ aten::convolution 0.48% 9.520us 5.46% 109.263us 36.421us 0.000us 0.00% 7.231us 2.410us 3
+ aten::_convolution 1.10% 21.931us 4.99% 99.743us 33.248us 0.000us 0.00% 7.231us 2.410us 3
+ aten::_conv_depthwise2d 1.06% 21.231us 3.12% 62.372us 20.791us 7.231us 37.17% 7.231us 2.410us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.231us 37.17% 7.231us 2.410us 3
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.304us 32.40% 6.304us 2.101us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.920us 30.43% 5.920us 1.973us 3
+ Activity Buffer Request 71.98% 1.440ms 71.98% 1.440ms 1.440ms 2.176us 11.18% 2.176us 2.176us 1
+ aten::empty_strided 1.49% 29.791us 1.49% 29.791us 4.965us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 9.66% 193.277us 9.66% 193.277us 21.475us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 0.86% 17.278us 1.13% 22.539us 2.504us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.45% 9.001us 0.45% 9.001us 0.600us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.46% 9.281us 0.46% 9.281us 3.094us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.43% 8.570us 0.43% 8.570us 2.857us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.29% 5.760us 0.36% 7.200us 2.400us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 2.000ms
+Self CUDA time total: 19.455us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D64_S512_W4
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 326.589us 1627.82% 326.589us 326.589us 1
+ torch_eager 7.03% 140.275us 99.72% 1.991ms 1.991ms 0.000us 0.00% 22.207us 22.207us 1
+ aten::to 0.30% 6.010us 85.45% 1.706ms 284.341us 0.000us 0.00% 14.304us 2.384us 6
+ aten::_to_copy 1.18% 23.623us 85.15% 1.700ms 283.340us 0.000us 0.00% 14.304us 2.384us 6
+ aten::copy_ 2.42% 48.261us 82.53% 1.648ms 274.613us 12.160us 60.61% 14.304us 2.384us 6
+ aten::conv1d 0.34% 6.690us 5.89% 117.664us 39.221us 0.000us 0.00% 7.903us 2.634us 3
+ aten::convolution 0.46% 9.260us 5.56% 110.974us 36.991us 0.000us 0.00% 7.903us 2.634us 3
+ aten::_convolution 1.15% 23.009us 5.09% 101.714us 33.905us 0.000us 0.00% 7.903us 2.634us 3
+ aten::_conv_depthwise2d 1.10% 21.970us 3.15% 62.812us 20.937us 7.903us 39.39% 7.903us 2.634us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.903us 39.39% 7.903us 2.634us 3
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.240us 31.10% 6.240us 2.080us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.920us 29.51% 5.920us 1.973us 3
+ Activity Buffer Request 71.49% 1.427ms 71.49% 1.427ms 1.427ms 2.144us 10.69% 2.144us 2.144us 1
+ aten::empty_strided 1.44% 28.740us 1.44% 28.740us 4.790us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 9.68% 193.308us 9.68% 193.308us 21.479us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 0.85% 16.982us 1.11% 22.224us 2.469us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.45% 8.892us 0.45% 8.892us 0.593us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.47% 9.420us 0.47% 9.420us 3.140us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.51% 10.100us 0.51% 10.100us 3.367us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.31% 6.130us 0.38% 7.650us 2.550us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.997ms
+Self CUDA time total: 20.063us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D64_S2048_W2
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 319.164us 887.36% 319.164us 319.164us 1
+ torch_eager 15.26% 115.785us 99.38% 754.046us 754.046us 0.000us 0.00% 38.560us 38.560us 1
+ aten::conv1d 0.72% 5.471us 14.90% 113.045us 37.682us 0.000us 0.00% 20.097us 6.699us 3
+ aten::convolution 1.25% 9.510us 14.18% 107.574us 35.858us 0.000us 0.00% 20.097us 6.699us 3
+ aten::_convolution 2.85% 21.590us 12.92% 98.064us 32.688us 0.000us 0.00% 20.097us 6.699us 3
+ aten::_conv_depthwise2d 2.82% 21.412us 8.06% 61.133us 20.378us 20.097us 55.87% 20.097us 6.699us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 20.097us 55.87% 20.097us 6.699us 3
+ aten::to 0.74% 5.628us 65.55% 497.346us 82.891us 0.000us 0.00% 18.463us 3.077us 6
+ aten::_to_copy 3.02% 22.942us 64.80% 491.718us 81.953us 0.000us 0.00% 18.463us 3.077us 6
+ aten::copy_ 6.50% 49.290us 57.91% 439.376us 73.229us 15.871us 44.13% 18.463us 3.077us 6
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.447us 23.48% 8.447us 2.816us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.424us 20.64% 7.424us 2.475us 3
+ Activity Buffer Request 28.99% 219.958us 28.99% 219.958us 219.958us 2.592us 7.21% 2.592us 2.592us 1
+ aten::empty_strided 3.87% 29.400us 3.87% 29.400us 4.900us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 25.31% 192.058us 25.31% 192.058us 21.340us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 2.43% 18.410us 3.09% 23.410us 2.601us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 1.12% 8.490us 1.12% 8.490us 0.566us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.20% 9.081us 1.20% 9.081us 3.027us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 1.15% 8.710us 1.15% 8.710us 2.903us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.77% 5.871us 0.96% 7.301us 2.434us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 758.766us
+Self CUDA time total: 35.968us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D64_S2048_W4
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 318.166us 839.07% 318.166us 318.166us 1
+ torch_eager 15.61% 115.614us 99.23% 735.056us 735.056us 0.000us 0.00% 40.512us 40.512us 1
+ aten::conv1d 0.77% 5.689us 15.23% 112.833us 37.611us 0.000us 0.00% 22.206us 7.402us 3
+ aten::convolution 1.28% 9.450us 14.46% 107.144us 35.715us 0.000us 0.00% 22.206us 7.402us 3
+ aten::_convolution 2.90% 21.450us 13.19% 97.694us 32.565us 0.000us 0.00% 22.206us 7.402us 3
+ aten::_conv_depthwise2d 2.86% 21.190us 8.15% 60.352us 20.117us 22.206us 58.56% 22.206us 7.402us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 22.206us 58.56% 22.206us 7.402us 3
+ aten::to 0.76% 5.621us 64.62% 478.657us 79.776us 0.000us 0.00% 18.306us 3.051us 6
+ aten::_to_copy 3.14% 23.241us 63.86% 473.036us 78.839us 0.000us 0.00% 18.306us 3.051us 6
+ aten::copy_ 6.66% 49.364us 56.82% 420.865us 70.144us 15.713us 41.44% 18.306us 3.051us 6
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.385us 22.11% 8.385us 2.795us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.328us 19.33% 7.328us 2.443us 3
+ Activity Buffer Request 27.11% 200.816us 27.11% 200.816us 200.816us 2.593us 6.84% 2.593us 2.593us 1
+ aten::empty_strided 3.91% 28.930us 3.91% 28.930us 4.822us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 25.94% 192.117us 25.94% 192.117us 21.346us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 2.42% 17.932us 3.14% 23.222us 2.580us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 1.19% 8.781us 1.19% 8.781us 0.585us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.25% 9.270us 1.25% 9.270us 3.090us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 1.14% 8.460us 1.14% 8.460us 2.820us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.85% 6.280us 1.02% 7.591us 2.530us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 740.726us
+Self CUDA time total: 37.919us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D2048_S128_W2
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 321.366us 502.64% 321.366us 321.366us 1
+ torch_eager 15.27% 113.396us 99.28% 737.126us 737.126us 0.000us 0.00% 68.031us 68.031us 1
+ aten::conv1d 0.76% 5.670us 15.56% 115.503us 38.501us 0.000us 0.00% 41.567us 13.856us 3
+ aten::convolution 1.28% 9.489us 14.79% 109.833us 36.611us 0.000us 0.00% 41.567us 13.856us 3
+ aten::_convolution 3.08% 22.850us 13.52% 100.344us 33.448us 0.000us 0.00% 41.567us 13.856us 3
+ aten::_conv_depthwise2d 2.89% 21.483us 8.27% 61.383us 20.461us 41.567us 65.01% 41.567us 13.856us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 41.567us 65.01% 41.567us 13.856us 3
+ aten::to 0.76% 5.660us 64.85% 481.506us 80.251us 0.000us 0.00% 26.464us 4.411us 6
+ aten::_to_copy 3.08% 22.842us 64.09% 475.846us 79.308us 0.000us 0.00% 26.464us 4.411us 6
+ aten::copy_ 6.57% 48.752us 57.01% 423.304us 70.551us 22.368us 34.99% 26.464us 4.411us 6
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 11.968us 18.72% 11.968us 3.989us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.400us 16.27% 10.400us 3.467us 3
+ Activity Buffer Request 27.27% 202.487us 27.27% 202.487us 202.487us 4.096us 6.41% 4.096us 4.096us 1
+ aten::empty_strided 4.00% 29.700us 4.00% 29.700us 4.950us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 26.15% 194.125us 26.15% 194.125us 21.569us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 2.30% 17.061us 2.99% 22.191us 2.466us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 1.19% 8.800us 1.19% 8.800us 0.587us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.25% 9.280us 1.25% 9.280us 3.093us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 1.15% 8.560us 1.15% 8.560us 2.853us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.77% 5.741us 0.96% 7.151us 2.384us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 742.446us
+Self CUDA time total: 63.935us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D2048_S128_W4
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 326.264us 468.36% 326.264us 326.264us 1
+ torch_eager 14.61% 117.663us 99.38% 800.347us 800.347us 0.000us 0.00% 73.789us 73.789us 1
+ aten::conv1d 0.75% 6.020us 14.38% 115.844us 38.615us 0.000us 0.00% 47.230us 15.743us 3
+ aten::convolution 1.16% 9.351us 13.64% 109.824us 36.608us 0.000us 0.00% 47.230us 15.743us 3
+ aten::_convolution 2.76% 22.250us 12.48% 100.473us 33.491us 0.000us 0.00% 47.230us 15.743us 3
+ aten::_conv_depthwise2d 2.71% 21.790us 7.76% 62.461us 20.820us 47.230us 67.80% 47.230us 15.743us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 47.230us 67.80% 47.230us 15.743us 3
+ aten::to 0.71% 5.690us 66.94% 539.059us 89.843us 0.000us 0.00% 26.559us 4.426us 6
+ aten::_to_copy 2.87% 23.082us 66.23% 533.369us 88.895us 0.000us 0.00% 26.559us 4.426us 6
+ aten::copy_ 6.12% 49.260us 59.73% 480.976us 80.163us 22.431us 32.20% 26.559us 4.426us 6
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 17.23% 12.000us 4.000us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.431us 14.97% 10.431us 3.477us 3
+ Activity Buffer Request 29.99% 241.509us 29.99% 241.509us 241.509us 4.128us 5.93% 4.128us 4.128us 1
+ aten::empty_strided 3.64% 29.311us 3.64% 29.311us 4.885us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 26.37% 212.348us 26.37% 212.348us 23.594us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 2.22% 17.841us 2.86% 23.041us 2.560us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 1.09% 8.761us 1.09% 8.761us 0.584us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.16% 9.320us 1.16% 9.320us 3.107us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 1.14% 9.210us 1.14% 9.210us 3.070us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.77% 6.201us 0.95% 7.621us 2.540us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 805.317us
+Self CUDA time total: 69.661us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D2048_S512_W2
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 372.509us 200.60% 372.509us 372.509us 1
+ torch_eager 16.32% 136.903us 99.36% 833.418us 833.418us 0.000us 0.00% 195.711us 195.711us 1
+ aten::conv1d 0.67% 5.580us 15.45% 129.615us 43.205us 0.000us 0.00% 133.247us 44.416us 3
+ aten::convolution 1.13% 9.510us 14.79% 124.035us 41.345us 0.000us 0.00% 133.247us 44.416us 3
+ aten::_convolution 3.89% 32.633us 13.65% 114.525us 38.175us 0.000us 0.00% 133.247us 44.416us 3
+ aten::_conv_depthwise2d 2.50% 20.960us 7.87% 66.022us 22.007us 133.247us 71.76% 133.247us 44.416us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 133.247us 71.76% 133.247us 44.416us 3
+ aten::to 0.72% 6.039us 64.27% 539.099us 89.850us 0.000us 0.00% 62.464us 10.411us 6
+ aten::_to_copy 2.75% 23.094us 63.55% 533.060us 88.843us 0.000us 0.00% 62.464us 10.411us 6
+ aten::copy_ 5.97% 50.071us 57.15% 479.385us 79.897us 52.448us 28.24% 62.464us 10.411us 6
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 29.504us 15.89% 29.504us 9.835us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.944us 12.36% 22.944us 7.648us 3
+ Activity Buffer Request 30.64% 256.969us 30.64% 256.969us 256.969us 10.016us 5.39% 10.016us 10.016us 1
+ aten::empty_strided 3.65% 30.581us 3.65% 30.581us 5.097us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 23.59% 197.827us 23.59% 197.827us 21.981us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 2.16% 18.130us 2.81% 23.610us 2.623us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 1.09% 9.169us 1.09% 9.169us 0.611us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.19% 9.940us 1.19% 9.940us 3.313us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 1.15% 9.640us 1.15% 9.640us 3.213us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.72% 6.001us 0.89% 7.490us 2.497us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 838.778us
+Self CUDA time total: 185.695us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D2048_S512_W4
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 368.701us 175.32% 368.701us 368.701us 1
+ torch_eager 16.38% 138.724us 99.39% 841.559us 841.559us 0.000us 0.00% 224.383us 224.383us 1
+ aten::conv1d 0.69% 5.870us 14.05% 118.945us 39.648us 0.000us 0.00% 154.015us 51.338us 3
+ aten::convolution 1.19% 10.050us 13.35% 113.075us 37.692us 0.000us 0.00% 154.015us 51.338us 3
+ aten::_convolution 2.68% 22.669us 12.17% 103.025us 34.342us 0.000us 0.00% 154.015us 51.338us 3
+ aten::_conv_depthwise2d 2.54% 21.472us 7.66% 64.883us 21.628us 154.015us 73.23% 154.015us 51.338us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 154.015us 73.23% 154.015us 51.338us 3
+ aten::to 0.70% 5.911us 65.49% 554.540us 92.423us 0.000us 0.00% 70.368us 11.728us 6
+ aten::_to_copy 2.70% 22.862us 64.79% 548.629us 91.438us 0.000us 0.00% 70.368us 11.728us 6
+ aten::copy_ 5.97% 50.511us 58.49% 495.276us 82.546us 56.288us 26.77% 70.368us 11.728us 6
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 33.248us 15.81% 33.248us 11.083us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 23.040us 10.96% 23.040us 7.680us 3
+ Activity Buffer Request 32.21% 272.739us 32.21% 272.739us 272.739us 14.080us 6.70% 14.080us 14.080us 1
+ aten::empty_strided 3.60% 30.491us 3.60% 30.491us 5.082us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 23.06% 195.277us 23.06% 195.277us 21.697us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 2.26% 19.134us 2.91% 24.623us 2.736us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 1.07% 9.019us 1.07% 9.019us 0.601us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.15% 9.700us 1.15% 9.700us 3.233us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 1.24% 10.460us 1.24% 10.460us 3.487us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.68% 5.760us 0.85% 7.180us 2.393us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 846.749us
+Self CUDA time total: 210.303us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D2048_S2048_W2
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 6.86% 124.525us 53.03% 963.064us 963.064us 0.000us 0.00% 1.524ms 1.524ms 1
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.423ms 100.42% 1.423ms 1.423ms 1
+ aten::to 0.37% 6.781us 38.11% 692.105us 115.351us 0.000us 0.00% 827.798us 137.966us 6
+ aten::_to_copy 1.62% 29.329us 37.74% 685.324us 114.221us 0.000us 0.00% 827.798us 137.966us 6
+ aten::copy_ 2.86% 52.014us 24.74% 449.228us 74.871us 721.111us 50.87% 827.798us 137.966us 6
+ aten::conv1d 0.32% 5.800us 6.51% 118.154us 39.385us 0.000us 0.00% 696.313us 232.104us 3
+ aten::convolution 0.55% 9.981us 6.19% 112.354us 37.451us 0.000us 0.00% 696.313us 232.104us 3
+ aten::_convolution 1.25% 22.722us 5.64% 102.373us 34.124us 0.000us 0.00% 696.313us 232.104us 3
+ aten::_conv_depthwise2d 1.22% 22.241us 3.54% 64.332us 21.444us 696.313us 49.13% 696.313us 232.104us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 696.313us 49.13% 696.313us 232.104us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 411.194us 29.01% 411.194us 137.065us 3
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 309.917us 21.86% 309.917us 103.306us 3
+ Activity Buffer Request 12.02% 218.207us 12.02% 218.207us 218.207us 106.687us 7.53% 106.687us 106.687us 1
+ aten::empty_strided 1.97% 35.692us 11.39% 206.767us 34.461us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 11.11% 201.717us 11.11% 201.717us 22.413us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 0.96% 17.369us 1.26% 22.889us 2.543us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.51% 9.249us 0.51% 9.249us 0.617us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.50% 9.061us 0.50% 9.061us 3.020us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.57% 10.320us 0.57% 10.320us 3.440us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.33% 5.990us 0.41% 7.360us 2.453us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.816ms
+Self CUDA time total: 1.417ms
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D2048_S2048_W4
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 6.33% 114.706us 41.01% 743.286us 743.286us 0.000us 0.00% 1.500ms 1.500ms 1
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.431ms 100.39% 1.431ms 1.431ms 1
+ aten::to 0.32% 5.881us 26.81% 485.936us 80.989us 0.000us 0.00% 762.577us 127.096us 6
+ aten::_to_copy 1.28% 23.109us 26.49% 480.055us 80.009us 0.000us 0.00% 762.577us 127.096us 6
+ aten::copy_ 2.74% 49.733us 23.67% 429.056us 71.509us 687.698us 48.25% 762.577us 127.096us 6
+ aten::conv1d 0.31% 5.590us 6.38% 115.623us 38.541us 0.000us 0.00% 737.523us 245.841us 3
+ aten::convolution 0.55% 9.990us 6.07% 110.033us 36.678us 0.000us 0.00% 737.523us 245.841us 3
+ aten::_convolution 1.21% 21.900us 5.52% 100.043us 33.348us 0.000us 0.00% 737.523us 245.841us 3
+ aten::_conv_depthwise2d 1.16% 21.072us 3.45% 62.453us 20.818us 737.523us 51.75% 737.523us 245.841us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 737.523us 51.75% 737.523us 245.841us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 400.247us 28.08% 400.247us 133.416us 3
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 287.451us 20.17% 287.451us 95.817us 3
+ Activity Buffer Request 11.32% 205.227us 11.32% 205.227us 205.227us 74.879us 5.25% 74.879us 74.879us 1
+ aten::empty_strided 1.54% 27.890us 1.54% 27.890us 4.648us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 10.89% 197.296us 10.89% 197.296us 21.922us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 0.95% 17.181us 1.23% 22.321us 2.480us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.49% 8.961us 0.49% 8.961us 0.597us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.50% 9.050us 0.50% 9.050us 3.017us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.50% 9.131us 0.50% 9.131us 3.044us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.32% 5.870us 0.41% 7.390us 2.463us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.812ms
+Self CUDA time total: 1.425ms
+
+
+impl wl p50(ms) ok
+torch_eager cuda_B2_D2048_S128_W2 0.09 True
+torch_eager cuda_B2_D2048_S128_W4 0.08 True
+torch_eager cuda_B2_D2048_S2048_W2 0.15 True
+torch_eager cuda_B2_D2048_S2048_W4 0.16 True
+torch_eager cuda_B2_D2048_S512_W2 0.08 True
+torch_eager cuda_B2_D2048_S512_W4 0.08 True
+torch_eager cuda_B2_D64_S128_W2 0.07 True
+torch_eager cuda_B2_D64_S128_W4 0.09 True
+torch_eager cuda_B2_D64_S2048_W2 0.09 True
+torch_eager cuda_B2_D64_S2048_W4 0.08 True
+torch_eager cuda_B2_D64_S512_W2 0.09 True
+torch_eager cuda_B2_D64_S512_W4 0.08 True
+torch_eager cuda_B4_D2048_S128_W2 0.08 True
+torch_eager cuda_B4_D2048_S128_W4 0.08 True
+torch_eager cuda_B4_D2048_S2048_W2 0.49 True
+torch_eager cuda_B4_D2048_S2048_W4 0.50 True
+torch_eager cuda_B4_D2048_S512_W2 0.09 True
+torch_eager cuda_B4_D2048_S512_W4 0.10 True
+torch_eager cuda_B4_D64_S128_W2 0.08 True
+torch_eager cuda_B4_D64_S128_W4 0.08 True
+torch_eager cuda_B4_D64_S2048_W2 0.08 True
+torch_eager cuda_B4_D64_S2048_W4 0.08 True
+torch_eager cuda_B4_D64_S512_W2 0.08 True
+torch_eager cuda_B4_D64_S512_W4 0.08 True
+