Running causal_conv1d benchmark on cuda with 24 workloads.
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D64_S128_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 467.679us 2403.78% 467.679us 467.679us 1
torch_eager 10.81% 233.756us 99.62% 2.155ms 2.155ms 0.000us 0.00% 21.792us 21.792us 1
aten::to 0.55% 11.919us 78.64% 1.701ms 283.539us 0.000us 0.00% 14.304us 2.384us 6
aten::_to_copy 2.04% 44.223us 78.09% 1.689ms 281.553us 0.000us 0.00% 14.304us 2.384us 6
aten::copy_ 3.07% 66.360us 73.27% 1.585ms 264.169us 11.968us 61.51% 14.304us 2.384us 6
aten::conv1d 0.40% 8.600us 7.96% 172.134us 57.378us 0.000us 0.00% 7.488us 2.496us 3
aten::convolution 0.76% 16.533us 7.56% 163.534us 54.511us 0.000us 0.00% 7.488us 2.496us 3
aten::_convolution 1.65% 35.660us 6.80% 147.001us 49.000us 0.000us 0.00% 7.488us 2.496us 3
aten::_conv_depthwise2d 1.78% 38.520us 4.15% 89.871us 29.957us 7.488us 38.49% 7.488us 2.496us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.488us 38.49% 7.488us 2.496us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.272us 32.24% 6.272us 2.091us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.696us 29.28% 5.696us 1.899us 3
Activity Buffer Request 67.06% 1.451ms 67.06% 1.451ms 1.451ms 2.336us 12.01% 2.336us 2.336us 1
aten::empty_strided 2.78% 60.080us 2.78% 60.080us 10.013us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 4.39% 95.004us 4.39% 95.004us 10.556us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 1.49% 32.209us 1.86% 40.319us 4.480us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.61% 13.180us 0.61% 13.180us 0.879us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.57% 12.310us 0.57% 12.310us 4.103us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.56% 12.130us 0.56% 12.130us 4.043us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.40% 8.601us 0.48% 10.281us 3.427us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.163ms
Self CUDA time total: 19.456us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D64_S128_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 345.183us 1768.26% 345.183us 345.183us 1
torch_eager 6.79% 129.712us 99.69% 1.904ms 1.904ms 0.000us 0.00% 21.633us 21.633us 1
aten::to 0.35% 6.752us 84.92% 1.622ms 270.359us 0.000us 0.00% 13.697us 2.283us 6
aten::_to_copy 1.29% 24.629us 84.57% 1.615ms 269.234us 0.000us 0.00% 13.697us 2.283us 6
aten::copy_ 2.57% 49.181us 81.04% 1.548ms 258.016us 11.585us 59.35% 13.697us 2.283us 6
aten::conv1d 0.34% 6.520us 6.51% 124.283us 41.428us 0.000us 0.00% 7.936us 2.645us 3
aten::convolution 0.52% 9.860us 6.16% 117.763us 39.254us 0.000us 0.00% 7.936us 2.645us 3
aten::_convolution 1.28% 24.503us 5.65% 107.903us 35.968us 0.000us 0.00% 7.936us 2.645us 3
aten::_conv_depthwise2d 1.17% 22.379us 3.49% 66.751us 22.250us 7.936us 40.65% 7.936us 2.645us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.936us 40.65% 7.936us 2.645us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.080us 31.15% 6.080us 2.027us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.505us 28.20% 5.505us 1.835us 3
Activity Buffer Request 75.86% 1.449ms 75.86% 1.449ms 1.449ms 2.112us 10.82% 2.112us 2.112us 1
aten::empty_strided 2.23% 42.682us 2.23% 42.682us 7.114us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 3.91% 74.643us 3.91% 74.643us 8.294us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.96% 18.249us 1.26% 24.119us 2.680us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.50% 9.600us 0.50% 9.600us 0.640us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.51% 9.750us 0.51% 9.750us 3.250us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.51% 9.801us 0.51% 9.801us 3.267us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.31% 5.930us 0.39% 7.450us 2.483us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.910ms
Self CUDA time total: 19.521us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D64_S512_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 350.236us 1887.05% 350.236us 350.236us 1
torch_eager 6.95% 131.684us 99.72% 1.889ms 1.889ms 0.000us 0.00% 20.481us 20.481us 1
aten::to 0.32% 5.979us 84.80% 1.606ms 267.646us 0.000us 0.00% 13.570us 2.262us 6
aten::_to_copy 1.26% 23.830us 84.48% 1.600ms 266.649us 0.000us 0.00% 13.570us 2.262us 6
aten::copy_ 3.12% 59.102us 81.66% 1.546ms 257.734us 11.649us 62.76% 13.570us 2.262us 6
aten::conv1d 0.33% 6.189us 6.49% 122.822us 40.941us 0.000us 0.00% 6.911us 2.304us 3
aten::convolution 0.53% 10.011us 6.16% 116.633us 38.878us 0.000us 0.00% 6.911us 2.304us 3
aten::_convolution 1.28% 24.209us 5.63% 106.622us 35.541us 0.000us 0.00% 6.911us 2.304us 3
aten::_conv_depthwise2d 1.23% 23.239us 3.44% 65.172us 21.724us 6.911us 37.24% 6.911us 2.304us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 6.911us 37.24% 6.911us 2.304us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 5.921us 31.90% 5.921us 1.974us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.728us 30.86% 5.728us 1.909us 3
Activity Buffer Request 75.86% 1.437ms 75.86% 1.437ms 1.437ms 1.921us 10.35% 1.921us 1.921us 1
aten::empty_strided 1.57% 29.661us 1.57% 29.661us 4.944us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 3.93% 74.492us 3.93% 74.492us 8.277us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.98% 18.470us 1.28% 24.221us 2.691us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.50% 9.492us 0.50% 9.492us 0.633us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.52% 9.761us 0.52% 9.761us 3.254us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.44% 8.371us 0.44% 8.371us 2.790us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.31% 5.870us 0.39% 7.390us 2.463us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.894ms
Self CUDA time total: 18.560us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D64_S512_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 349.473us 1781.39% 349.473us 349.473us 1
torch_eager 6.11% 131.525us 99.75% 2.146ms 2.146ms 0.000us 0.00% 21.795us 21.795us 1
aten::to 0.31% 6.681us 86.66% 1.864ms 310.738us 0.000us 0.00% 14.148us 2.358us 6
aten::_to_copy 1.14% 24.510us 86.35% 1.858ms 309.625us 0.000us 0.00% 14.148us 2.358us 6
aten::copy_ 2.35% 50.532us 83.71% 1.801ms 300.153us 11.971us 61.02% 14.148us 2.358us 6
aten::conv1d 0.29% 6.159us 5.69% 122.482us 40.827us 0.000us 0.00% 7.647us 2.549us 3
aten::convolution 0.45% 9.650us 5.41% 116.323us 38.774us 0.000us 0.00% 7.647us 2.549us 3
aten::_convolution 1.16% 25.049us 4.96% 106.673us 35.558us 0.000us 0.00% 7.647us 2.549us 3
aten::_conv_depthwise2d 1.06% 22.843us 3.03% 65.182us 21.727us 7.647us 38.98% 7.647us 2.549us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.647us 38.98% 7.647us 2.549us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.211us 31.66% 6.211us 2.070us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.760us 29.36% 5.760us 1.920us 3
Activity Buffer Request 68.59% 1.476ms 68.59% 1.476ms 1.476ms 2.177us 11.10% 2.177us 2.177us 1
aten::empty_strided 1.50% 32.320us 1.50% 32.320us 5.387us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 13.84% 297.685us 13.84% 297.685us 33.076us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.81% 17.433us 1.07% 22.952us 2.550us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.42% 9.029us 0.42% 9.029us 0.602us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.47% 10.100us 0.47% 10.100us 3.367us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.44% 9.389us 0.44% 9.389us 3.130us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.30% 6.350us 0.36% 7.690us 2.563us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.151ms
Self CUDA time total: 19.618us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D64_S2048_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 348.958us 1429.22% 348.958us 348.958us 1
torch_eager 6.64% 139.433us 99.75% 2.094ms 2.094ms 0.000us 0.00% 26.656us 26.656us 1
aten::to 0.30% 6.349us 85.85% 1.803ms 300.420us 0.000us 0.00% 15.136us 2.523us 6
aten::_to_copy 1.17% 24.664us 85.55% 1.796ms 299.362us 0.000us 0.00% 15.136us 2.523us 6
aten::copy_ 2.46% 51.670us 82.81% 1.739ms 289.779us 12.896us 52.82% 15.136us 2.523us 6
aten::conv1d 0.30% 6.230us 5.89% 123.663us 41.221us 0.000us 0.00% 11.520us 3.840us 3
aten::convolution 0.49% 10.211us 5.59% 117.433us 39.144us 0.000us 0.00% 11.520us 3.840us 3
aten::_convolution 1.21% 25.350us 5.11% 107.222us 35.741us 0.000us 0.00% 11.520us 3.840us 3
aten::_conv_depthwise2d 1.07% 22.551us 3.09% 64.932us 21.644us 11.520us 47.18% 11.520us 3.840us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 11.520us 47.18% 11.520us 3.840us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.592us 27.00% 6.592us 2.197us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.304us 25.82% 6.304us 2.101us 3
Activity Buffer Request 67.91% 1.426ms 67.91% 1.426ms 1.426ms 2.240us 9.17% 2.240us 2.240us 1
aten::empty_strided 1.56% 32.829us 1.56% 32.829us 5.472us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 13.56% 284.686us 13.56% 284.686us 31.632us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.89% 18.631us 1.15% 24.041us 2.671us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.44% 9.180us 0.44% 9.180us 0.612us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.47% 9.871us 0.47% 9.871us 3.290us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.43% 8.930us 0.43% 8.930us 2.977us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.31% 6.480us 0.38% 7.900us 2.633us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.100ms
Self CUDA time total: 24.416us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D64_S2048_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 350.300us 1353.19% 350.300us 350.300us 1
torch_eager 6.16% 129.673us 99.74% 2.100ms 2.100ms 0.000us 0.00% 28.127us 28.127us 1
aten::to 0.30% 6.400us 86.28% 1.817ms 302.813us 0.000us 0.00% 15.135us 2.522us 6
aten::_to_copy 1.17% 24.572us 85.97% 1.810ms 301.746us 0.000us 0.00% 15.135us 2.522us 6
aten::copy_ 2.32% 48.831us 83.30% 1.754ms 292.358us 12.895us 49.81% 15.135us 2.522us 6
aten::conv1d 0.30% 6.370us 5.91% 124.553us 41.518us 0.000us 0.00% 12.992us 4.331us 3
aten::convolution 0.48% 10.021us 5.61% 118.183us 39.394us 0.000us 0.00% 12.992us 4.331us 3
aten::_convolution 1.13% 23.790us 5.14% 108.162us 36.054us 0.000us 0.00% 12.992us 4.331us 3
aten::_conv_depthwise2d 1.15% 24.221us 3.16% 66.582us 22.194us 12.992us 50.19% 12.992us 4.331us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 12.992us 50.19% 12.992us 4.331us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.592us 25.46% 6.592us 2.197us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.303us 24.35% 6.303us 2.101us 3
Activity Buffer Request 68.95% 1.452ms 68.95% 1.452ms 1.452ms 2.240us 8.65% 2.240us 2.240us 1
aten::empty_strided 1.51% 31.759us 1.51% 31.759us 5.293us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 13.13% 276.435us 13.13% 276.435us 30.715us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.91% 19.219us 1.21% 25.491us 2.832us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.48% 10.094us 0.48% 10.094us 0.673us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.47% 9.932us 0.47% 9.932us 3.311us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.44% 9.289us 0.44% 9.289us 3.096us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.33% 6.860us 0.40% 8.350us 2.783us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.106ms
Self CUDA time total: 25.887us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D2048_S128_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 344.670us 904.34% 344.670us 344.670us 1
torch_eager 6.27% 130.413us 99.73% 2.076ms 2.076ms 0.000us 0.00% 40.673us 40.673us 1
aten::conv1d 0.29% 6.011us 5.81% 120.902us 40.301us 0.000us 0.00% 22.369us 7.456us 3
aten::convolution 0.46% 9.579us 5.52% 114.891us 38.297us 0.000us 0.00% 22.369us 7.456us 3
aten::_convolution 1.17% 24.271us 5.06% 105.312us 35.104us 0.000us 0.00% 22.369us 7.456us 3
aten::_conv_depthwise2d 1.07% 22.281us 3.10% 64.540us 21.513us 22.369us 58.69% 22.369us 7.456us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 22.369us 58.69% 22.369us 7.456us 3
aten::to 0.30% 6.240us 86.29% 1.796ms 299.368us 0.000us 0.00% 18.304us 3.051us 6
aten::_to_copy 1.19% 24.702us 86.00% 1.790ms 298.328us 0.000us 0.00% 18.304us 3.051us 6
aten::copy_ 2.32% 48.271us 83.37% 1.735ms 289.226us 15.744us 41.31% 18.304us 3.051us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.352us 21.91% 8.352us 2.784us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.392us 19.39% 7.392us 2.464us 3
Activity Buffer Request 69.09% 1.438ms 69.09% 1.438ms 1.438ms 2.560us 6.72% 2.560us 2.560us 1
aten::empty_strided 1.44% 29.909us 1.44% 29.909us 4.985us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 13.10% 272.705us 13.10% 272.705us 30.301us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.90% 18.821us 1.17% 24.281us 2.698us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.43% 8.910us 0.43% 8.910us 0.594us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.47% 9.769us 0.47% 9.769us 3.256us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.42% 8.830us 0.42% 8.830us 2.943us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.30% 6.180us 0.36% 7.570us 2.523us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.081ms
Self CUDA time total: 38.113us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D2048_S128_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 339.324us 829.06% 339.324us 339.324us 1
torch_eager 6.37% 130.712us 99.71% 2.046ms 2.046ms 0.000us 0.00% 43.521us 43.521us 1
aten::conv1d 0.29% 5.880us 5.94% 121.953us 40.651us 0.000us 0.00% 25.216us 8.405us 3
aten::convolution 0.47% 9.711us 5.66% 116.073us 38.691us 0.000us 0.00% 25.216us 8.405us 3
aten::_convolution 1.26% 25.911us 5.18% 106.362us 35.454us 0.000us 0.00% 25.216us 8.405us 3
aten::_conv_depthwise2d 1.09% 22.379us 3.16% 64.832us 21.611us 25.216us 61.61% 25.216us 8.405us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 25.216us 61.61% 25.216us 8.405us 3
aten::to 0.29% 5.930us 86.07% 1.766ms 294.313us 0.000us 0.00% 18.305us 3.051us 6
aten::_to_copy 1.14% 23.292us 85.79% 1.760ms 293.325us 0.000us 0.00% 18.305us 3.051us 6
aten::copy_ 2.44% 50.149us 83.18% 1.707ms 284.430us 15.713us 38.39% 18.305us 3.051us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.320us 20.33% 8.320us 2.773us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.393us 18.06% 7.393us 2.464us 3
Activity Buffer Request 69.04% 1.416ms 69.04% 1.416ms 1.416ms 2.592us 6.33% 2.592us 2.592us 1
aten::empty_strided 1.47% 30.081us 1.47% 30.081us 5.013us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 12.82% 263.078us 12.82% 263.078us 29.231us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.84% 17.161us 1.08% 22.249us 2.472us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.43% 8.738us 0.43% 8.738us 0.583us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.52% 10.621us 0.52% 10.621us 3.540us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.43% 8.801us 0.43% 8.801us 2.934us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.28% 5.670us 0.35% 7.160us 2.387us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.052ms
Self CUDA time total: 40.929us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D2048_S512_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 346.587us 339.00% 346.587us 346.587us 1
torch_eager 6.07% 126.382us 99.75% 2.075ms 2.075ms 0.000us 0.00% 108.223us 108.223us 1
aten::conv1d 0.27% 5.689us 5.84% 121.563us 40.521us 0.000us 0.00% 70.111us 23.370us 3
aten::convolution 0.45% 9.432us 5.57% 115.874us 38.625us 0.000us 0.00% 70.111us 23.370us 3
aten::_convolution 1.15% 23.992us 5.12% 106.442us 35.481us 0.000us 0.00% 70.111us 23.370us 3
aten::_conv_depthwise2d 1.13% 23.510us 3.19% 66.451us 22.150us 70.111us 68.58% 70.111us 23.370us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 70.111us 68.58% 70.111us 23.370us 3
aten::to 0.33% 6.762us 86.49% 1.799ms 299.916us 0.000us 0.00% 38.112us 6.352us 6
aten::_to_copy 1.17% 24.419us 86.17% 1.793ms 298.789us 0.000us 0.00% 38.112us 6.352us 6
aten::copy_ 2.24% 46.671us 83.54% 1.738ms 289.665us 32.128us 31.42% 38.112us 6.352us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 17.568us 17.18% 17.568us 5.856us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 14.560us 14.24% 14.560us 4.853us 3
Activity Buffer Request 69.85% 1.453ms 69.85% 1.453ms 1.453ms 5.984us 5.85% 5.984us 5.984us 1
aten::empty_strided 1.46% 30.330us 1.46% 30.330us 5.055us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 12.58% 261.816us 12.58% 261.816us 29.091us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.87% 18.161us 1.14% 23.661us 2.629us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.44% 9.119us 0.44% 9.119us 0.608us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.48% 9.940us 0.48% 9.940us 3.313us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.45% 9.280us 0.45% 9.280us 3.093us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.27% 5.680us 0.34% 7.119us 2.373us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.081ms
Self CUDA time total: 102.239us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D2048_S512_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 340.895us 303.69% 340.895us 340.895us 1
torch_eager 5.96% 121.922us 99.72% 2.040ms 2.040ms 0.000us 0.00% 118.204us 118.204us 1
aten::conv1d 0.29% 5.851us 5.96% 121.923us 40.641us 0.000us 0.00% 80.190us 26.730us 3
aten::convolution 0.47% 9.659us 5.67% 116.072us 38.691us 0.000us 0.00% 80.190us 26.730us 3
aten::_convolution 1.15% 23.552us 5.20% 106.413us 35.471us 0.000us 0.00% 80.190us 26.730us 3
aten::_conv_depthwise2d 1.14% 23.240us 3.13% 64.041us 21.347us 80.190us 71.44% 80.190us 26.730us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 80.190us 71.44% 80.190us 26.730us 3
aten::to 0.30% 6.190us 86.45% 1.769ms 294.821us 0.000us 0.00% 38.014us 6.336us 6
aten::_to_copy 1.15% 23.531us 86.15% 1.763ms 293.790us 0.000us 0.00% 38.014us 6.336us 6
aten::copy_ 2.44% 49.841us 83.49% 1.708ms 284.726us 32.062us 28.56% 38.014us 6.336us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 17.503us 15.59% 17.503us 5.834us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 14.559us 12.97% 14.559us 4.853us 3
Activity Buffer Request 69.71% 1.426ms 69.71% 1.426ms 1.426ms 5.952us 5.30% 5.952us 5.952us 1
aten::empty_strided 1.51% 30.850us 1.51% 30.850us 5.142us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 12.43% 254.276us 12.43% 254.276us 28.253us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.92% 18.780us 1.20% 24.600us 2.733us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.47% 9.541us 0.47% 9.541us 0.636us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.49% 10.090us 0.49% 10.090us 3.363us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.42% 8.680us 0.42% 8.680us 2.893us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.34% 6.870us 0.41% 8.300us 2.767us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.046ms
Self CUDA time total: 112.252us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D2048_S2048_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 6.19% 128.263us 98.22% 2.035ms 2.035ms 0.000us 0.00% 432.800us 432.800us 1
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 422.880us 107.61% 422.880us 422.880us 1
aten::conv1d 0.31% 6.390us 5.89% 122.123us 40.708us 0.000us 0.00% 250.976us 83.659us 3
aten::convolution 0.46% 9.601us 5.59% 115.733us 38.578us 0.000us 0.00% 250.976us 83.659us 3
aten::_convolution 1.21% 25.079us 5.12% 106.132us 35.377us 0.000us 0.00% 250.976us 83.659us 3
aten::_conv_depthwise2d 1.14% 23.570us 3.11% 64.391us 21.464us 250.976us 63.87% 250.976us 83.659us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 250.976us 63.87% 250.976us 83.659us 3
aten::to 0.32% 6.560us 84.78% 1.757ms 292.786us 0.000us 0.00% 181.824us 30.304us 6
aten::_to_copy 1.18% 24.419us 84.47% 1.750ms 291.693us 0.000us 0.00% 181.824us 30.304us 6
aten::copy_ 2.44% 50.653us 81.84% 1.696ms 282.633us 141.984us 36.13% 181.824us 30.304us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 102.336us 26.04% 102.336us 34.112us 3
Activity Buffer Request 68.54% 1.420ms 68.54% 1.420ms 1.420ms 39.840us 10.14% 39.840us 39.840us 1
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 39.648us 10.09% 39.648us 13.216us 3
aten::empty_strided 1.44% 29.940us 1.44% 29.940us 4.990us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 11.90% 246.595us 11.90% 246.595us 27.399us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.85% 17.582us 1.11% 22.961us 2.551us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.44% 9.098us 0.44% 9.098us 0.607us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.47% 9.750us 0.47% 9.750us 3.250us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.45% 9.420us 0.45% 9.420us 3.140us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.32% 6.712us 0.39% 8.162us 2.721us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.072ms
Self CUDA time total: 392.960us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D2048_S2048_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 5.97% 128.995us 95.86% 2.073ms 2.073ms 0.000us 0.00% 487.835us 487.835us 1
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 481.723us 107.54% 481.723us 481.723us 1
aten::conv1d 0.29% 6.320us 5.75% 124.323us 41.441us 0.000us 0.00% 300.092us 100.031us 3
aten::convolution 0.47% 10.180us 5.46% 118.003us 39.334us 0.000us 0.00% 300.092us 100.031us 3
aten::_convolution 1.09% 23.583us 4.99% 107.823us 35.941us 0.000us 0.00% 300.092us 100.031us 3
aten::_conv_depthwise2d 1.05% 22.771us 3.07% 66.451us 22.150us 300.092us 67.00% 300.092us 100.031us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 300.092us 67.00% 300.092us 100.031us 3
aten::to 0.32% 6.900us 82.82% 1.791ms 298.496us 0.000us 0.00% 187.743us 31.290us 6
aten::_to_copy 1.13% 24.450us 82.50% 1.784ms 297.346us 0.000us 0.00% 187.743us 31.290us 6
aten::copy_ 2.37% 51.149us 79.94% 1.729ms 288.123us 147.839us 33.00% 187.743us 31.290us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 107.872us 24.08% 107.872us 35.957us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 39.967us 8.92% 39.967us 13.322us 3
Activity Buffer Request 67.34% 1.456ms 67.34% 1.456ms 1.456ms 39.904us 8.91% 39.904us 39.904us 1
aten::empty_strided 1.43% 30.891us 1.43% 30.891us 5.149us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 11.33% 245.015us 11.33% 245.015us 27.224us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.88% 18.930us 1.15% 24.910us 2.768us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.45% 9.739us 0.45% 9.739us 0.649us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.47% 10.080us 0.47% 10.080us 3.360us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.46% 9.870us 0.46% 9.870us 3.290us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.30% 6.460us 0.36% 7.889us 2.630us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.162ms
Self CUDA time total: 447.931us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D64_S128_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 336.383us 1799.90% 336.383us 336.383us 1
torch_eager 14.70% 125.222us 99.35% 846.539us 846.539us 0.000us 0.00% 20.577us 20.577us 1
aten::to 0.73% 6.218us 67.41% 574.433us 95.739us 0.000us 0.00% 13.344us 2.224us 6
aten::_to_copy 2.79% 23.792us 66.68% 568.215us 94.702us 0.000us 0.00% 13.344us 2.224us 6
aten::copy_ 6.26% 53.310us 60.38% 514.471us 85.745us 11.456us 61.30% 13.344us 2.224us 6
aten::conv1d 0.70% 5.960us 14.03% 119.583us 39.861us 0.000us 0.00% 7.233us 2.411us 3
aten::convolution 1.15% 9.760us 13.33% 113.623us 37.874us 0.000us 0.00% 7.233us 2.411us 3
aten::_convolution 2.80% 23.881us 12.19% 103.863us 34.621us 0.000us 0.00% 7.233us 2.411us 3
aten::_conv_depthwise2d 2.66% 22.671us 7.54% 64.252us 21.417us 7.233us 38.70% 7.233us 2.411us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.233us 38.70% 7.233us 2.411us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 5.856us 31.33% 5.856us 1.952us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.600us 29.96% 5.600us 1.867us 3
Activity Buffer Request 28.19% 240.205us 28.19% 240.205us 240.205us 1.888us 10.10% 1.888us 1.888us 1
aten::empty_strided 3.52% 29.952us 3.52% 29.952us 4.992us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 28.71% 244.666us 28.71% 244.666us 27.185us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 2.06% 17.560us 2.66% 22.660us 2.518us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 1.01% 8.590us 1.01% 8.590us 0.573us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 1.10% 9.390us 1.10% 9.390us 3.130us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 1.00% 8.481us 1.00% 8.481us 2.827us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.71% 6.030us 0.87% 7.390us 2.463us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 852.119us
Self CUDA time total: 18.689us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D64_S128_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 337.210us 1736.14% 337.210us 337.210us 1
torch_eager 14.23% 122.135us 99.33% 852.339us 852.339us 0.000us 0.00% 21.279us 21.279us 1
aten::to 0.69% 5.930us 67.81% 581.931us 96.989us 0.000us 0.00% 13.375us 2.229us 6
aten::_to_copy 2.77% 23.810us 67.12% 576.001us 96.000us 0.000us 0.00% 13.375us 2.229us 6
aten::copy_ 5.94% 51.010us 60.80% 521.760us 86.960us 11.519us 59.31% 13.375us 2.229us 6
aten::conv1d 0.66% 5.690us 14.16% 121.503us 40.501us 0.000us 0.00% 7.904us 2.635us 3
aten::convolution 1.11% 9.501us 13.50% 115.813us 38.604us 0.000us 0.00% 7.904us 2.635us 3
aten::_convolution 3.02% 25.902us 12.39% 106.312us 35.437us 0.000us 0.00% 7.904us 2.635us 3
aten::_conv_depthwise2d 2.72% 23.320us 7.45% 63.940us 21.313us 7.904us 40.69% 7.904us 2.635us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.904us 40.69% 7.904us 2.635us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 5.855us 30.14% 5.855us 1.952us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.664us 29.16% 5.664us 1.888us 3
Activity Buffer Request 28.95% 248.446us 28.95% 248.446us 248.446us 1.856us 9.56% 1.856us 1.856us 1
aten::empty_strided 3.55% 30.431us 3.55% 30.431us 5.072us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 28.34% 243.224us 28.34% 243.224us 27.025us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 1.98% 16.978us 2.58% 22.140us 2.460us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 1.00% 8.573us 1.00% 8.573us 0.572us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 1.13% 9.660us 1.13% 9.660us 3.220us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 1.17% 10.040us 1.17% 10.040us 3.347us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.78% 6.699us 0.93% 7.990us 2.663us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 858.129us
Self CUDA time total: 19.423us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D64_S512_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 340.632us 1762.47% 340.632us 340.632us 1
torch_eager 14.22% 122.814us 99.35% 857.879us 857.879us 0.000us 0.00% 21.503us 21.503us 1
aten::to 0.71% 6.160us 68.06% 587.732us 97.955us 0.000us 0.00% 14.304us 2.384us 6
aten::_to_copy 2.69% 23.228us 67.35% 581.572us 96.929us 0.000us 0.00% 14.304us 2.384us 6
aten::copy_ 5.95% 51.401us 60.88% 525.681us 87.614us 12.128us 62.75% 14.304us 2.384us 6
aten::conv1d 0.72% 6.190us 13.86% 119.652us 39.884us 0.000us 0.00% 7.199us 2.400us 3
aten::convolution 1.11% 9.620us 13.14% 113.462us 37.821us 0.000us 0.00% 7.199us 2.400us 3
aten::_convolution 2.71% 23.420us 12.03% 103.842us 34.614us 0.000us 0.00% 7.199us 2.400us 3
aten::_conv_depthwise2d 2.67% 23.041us 7.39% 63.831us 21.277us 7.199us 37.25% 7.199us 2.400us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.199us 37.25% 7.199us 2.400us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.272us 32.45% 6.272us 2.091us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.856us 30.30% 5.856us 1.952us 3
Activity Buffer Request 29.60% 255.626us 29.60% 255.626us 255.626us 2.176us 11.26% 2.176us 2.176us 1
aten::empty_strided 3.78% 32.663us 3.78% 32.663us 5.444us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 27.93% 241.174us 27.93% 241.174us 26.797us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 2.07% 17.891us 2.69% 23.211us 2.579us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 1.04% 8.951us 1.04% 8.951us 0.597us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 1.14% 9.880us 1.14% 9.880us 3.293us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.97% 8.390us 0.97% 8.390us 2.797us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.77% 6.630us 0.93% 8.000us 2.667us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 863.509us
Self CUDA time total: 19.327us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D64_S512_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 334.363us 1661.27% 334.363us 334.363us 1
torch_eager 14.65% 121.213us 99.39% 822.628us 822.628us 0.000us 0.00% 22.271us 22.271us 1
aten::to 0.73% 6.022us 66.87% 553.441us 92.240us 0.000us 0.00% 14.239us 2.373us 6
aten::_to_copy 2.76% 22.839us 66.14% 547.419us 91.237us 0.000us 0.00% 14.239us 2.373us 6
aten::copy_ 6.10% 50.480us 59.81% 495.040us 82.507us 12.095us 60.09% 14.239us 2.373us 6
aten::conv1d 0.71% 5.911us 14.57% 120.603us 40.201us 0.000us 0.00% 8.032us 2.677us 3
aten::convolution 1.15% 9.530us 13.86% 114.692us 38.231us 0.000us 0.00% 8.032us 2.677us 3
aten::_convolution 2.90% 23.998us 12.71% 105.162us 35.054us 0.000us 0.00% 8.032us 2.677us 3
aten::_conv_depthwise2d 2.69% 22.281us 7.85% 64.952us 21.651us 8.032us 39.91% 8.032us 2.677us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 8.032us 39.91% 8.032us 2.677us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.207us 30.84% 6.207us 2.069us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.888us 29.25% 5.888us 1.963us 3
Activity Buffer Request 27.45% 227.155us 27.45% 227.155us 227.155us 2.144us 10.65% 2.144us 2.144us 1
aten::empty_strided 3.57% 29.540us 3.57% 29.540us 4.923us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 29.06% 240.556us 29.06% 240.556us 26.728us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 2.13% 17.627us 2.77% 22.910us 2.546us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 1.09% 9.014us 1.09% 9.014us 0.601us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 1.17% 9.710us 1.17% 9.710us 3.237us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 1.19% 9.810us 1.19% 9.810us 3.270us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.72% 5.931us 0.90% 7.422us 2.474us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 827.669us
Self CUDA time total: 20.127us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D64_S2048_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 335.875us 933.82% 335.875us 335.875us 1
torch_eager 14.86% 122.213us 99.37% 816.978us 816.978us 0.000us 0.00% 38.560us 38.560us 1
aten::conv1d 0.73% 6.020us 14.48% 119.072us 39.691us 0.000us 0.00% 20.064us 6.688us 3
aten::convolution 1.17% 9.589us 13.75% 113.052us 37.684us 0.000us 0.00% 20.064us 6.688us 3
aten::_convolution 2.85% 23.419us 12.58% 103.463us 34.488us 0.000us 0.00% 20.064us 6.688us 3
aten::_conv_depthwise2d 2.73% 22.441us 7.81% 64.191us 21.397us 20.064us 55.78% 20.064us 6.688us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 20.064us 55.78% 20.064us 6.688us 3
aten::to 0.74% 6.089us 66.65% 547.971us 91.328us 0.000us 0.00% 18.496us 3.083us 6
aten::_to_copy 2.81% 23.090us 65.91% 541.882us 90.314us 0.000us 0.00% 18.496us 3.083us 6
aten::copy_ 6.02% 49.484us 59.30% 487.602us 81.267us 15.904us 44.22% 18.496us 3.083us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.576us 23.84% 8.576us 2.859us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.328us 20.37% 7.328us 2.443us 3
Activity Buffer Request 27.35% 224.865us 27.35% 224.865us 224.865us 2.592us 7.21% 2.592us 2.592us 1
aten::empty_strided 3.79% 31.190us 3.79% 31.190us 5.198us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 28.67% 235.693us 28.67% 235.693us 26.188us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 2.17% 17.820us 2.82% 23.212us 2.579us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 1.10% 9.012us 1.10% 9.012us 0.601us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 1.17% 9.620us 1.17% 9.620us 3.207us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 1.18% 9.690us 1.18% 9.690us 3.230us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.68% 5.572us 0.85% 6.952us 2.317us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 822.198us
Self CUDA time total: 35.968us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D64_S2048_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 370.623us 978.21% 370.623us 370.623us 1
torch_eager 6.18% 128.993us 99.74% 2.082ms 2.082ms 0.000us 0.00% 40.448us 40.448us 1
aten::conv1d 0.30% 6.311us 5.92% 123.493us 41.164us 0.000us 0.00% 22.177us 7.392us 3
aten::convolution 0.50% 10.340us 5.61% 117.182us 39.061us 0.000us 0.00% 22.177us 7.392us 3
aten::_convolution 1.15% 24.110us 5.12% 106.842us 35.614us 0.000us 0.00% 22.177us 7.392us 3
aten::_conv_depthwise2d 1.14% 23.742us 3.15% 65.742us 21.914us 22.177us 58.53% 22.177us 7.392us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 22.177us 58.53% 22.177us 7.392us 3
aten::to 1.13% 23.681us 86.31% 1.802ms 300.273us 0.000us 0.00% 18.271us 3.045us 6
aten::_to_copy 1.20% 24.951us 85.17% 1.778ms 296.326us 0.000us 0.00% 18.271us 3.045us 6
aten::copy_ 2.41% 50.250us 82.40% 1.720ms 286.684us 15.711us 41.47% 18.271us 3.045us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.321us 21.96% 8.321us 2.774us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.390us 19.50% 7.390us 2.463us 3
Activity Buffer Request 69.33% 1.447ms 69.33% 1.447ms 1.447ms 2.560us 6.76% 2.560us 2.560us 1
aten::empty_strided 1.58% 32.901us 1.58% 32.901us 5.484us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 11.73% 244.945us 11.73% 244.945us 27.216us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.87% 18.191us 1.14% 23.770us 2.641us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.44% 9.210us 0.44% 9.210us 0.614us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.48% 9.930us 0.48% 9.930us 3.310us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.46% 9.680us 0.46% 9.680us 3.227us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.32% 6.640us 0.38% 7.960us 2.653us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.088ms
Self CUDA time total: 37.888us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D2048_S128_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 339.324us 532.86% 339.324us 339.324us 1
torch_eager 14.96% 124.364us 99.38% 826.288us 826.288us 0.000us 0.00% 67.776us 67.776us 1
aten::conv1d 0.74% 6.121us 14.48% 120.383us 40.128us 0.000us 0.00% 41.409us 13.803us 3
aten::convolution 1.31% 10.850us 13.74% 114.262us 38.087us 0.000us 0.00% 41.409us 13.803us 3
aten::_convolution 2.80% 23.271us 12.44% 103.412us 34.471us 0.000us 0.00% 41.409us 13.803us 3
aten::_conv_depthwise2d 2.86% 23.801us 7.77% 64.610us 21.537us 41.409us 65.03% 41.409us 13.803us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 41.409us 65.03% 41.409us 13.803us 3
aten::to 0.72% 5.961us 66.69% 554.441us 92.407us 0.000us 0.00% 26.367us 4.395us 6
aten::_to_copy 2.84% 23.608us 65.97% 548.480us 91.413us 0.000us 0.00% 26.367us 4.395us 6
aten::copy_ 6.26% 52.010us 59.41% 493.909us 82.318us 22.271us 34.97% 26.367us 4.395us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 11.935us 18.74% 11.935us 3.978us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.336us 16.23% 10.336us 3.445us 3
Activity Buffer Request 27.47% 228.425us 27.47% 228.425us 228.425us 4.096us 6.43% 4.096us 4.096us 1
aten::empty_strided 3.72% 30.963us 3.72% 30.963us 5.160us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 28.31% 235.354us 28.31% 235.354us 26.150us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 2.06% 17.130us 2.64% 21.981us 2.442us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 1.00% 8.352us 1.00% 8.352us 0.557us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 1.18% 9.829us 1.18% 9.829us 3.276us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 1.09% 9.100us 1.09% 9.100us 3.033us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.71% 5.910us 0.88% 7.280us 2.427us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 831.408us
Self CUDA time total: 63.680us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D2048_S128_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 341.884us 492.57% 341.884us 341.884us 1
torch_eager 14.66% 124.263us 99.38% 842.608us 842.608us 0.000us 0.00% 73.472us 73.472us 1
aten::conv1d 0.69% 5.810us 14.06% 119.183us 39.728us 0.000us 0.00% 47.072us 15.691us 3
aten::convolution 1.10% 9.331us 13.37% 113.373us 37.791us 0.000us 0.00% 47.072us 15.691us 3
aten::_convolution 2.98% 25.231us 12.27% 104.042us 34.681us 0.000us 0.00% 47.072us 15.691us 3
aten::_conv_depthwise2d 2.57% 21.770us 7.47% 63.341us 21.114us 47.072us 67.82% 47.072us 15.691us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 47.072us 67.82% 47.072us 15.691us 3
aten::to 0.71% 6.042us 67.35% 571.062us 95.177us 0.000us 0.00% 26.400us 4.400us 6
aten::_to_copy 2.91% 24.658us 66.64% 565.020us 94.170us 0.000us 0.00% 26.400us 4.400us 6
aten::copy_ 5.98% 50.742us 60.23% 510.651us 85.108us 22.336us 32.18% 26.400us 4.400us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 11.967us 17.24% 11.967us 3.989us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.369us 14.94% 10.369us 3.456us 3
Activity Buffer Request 28.69% 243.255us 28.69% 243.255us 243.255us 4.064us 5.86% 4.064us 4.064us 1
aten::empty_strided 3.50% 29.711us 3.50% 29.711us 4.952us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 28.24% 239.475us 28.24% 239.475us 26.608us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 2.11% 17.861us 2.71% 22.969us 2.552us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 1.01% 8.598us 1.01% 8.598us 0.573us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 1.13% 9.580us 1.13% 9.580us 3.193us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 1.08% 9.170us 1.08% 9.170us 3.057us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.70% 5.911us 0.85% 7.210us 2.403us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 847.859us
Self CUDA time total: 69.408us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D2048_S512_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 350.870us 189.71% 350.870us 350.870us 1
torch_eager 14.64% 124.941us 99.33% 847.778us 847.778us 0.000us 0.00% 194.907us 194.907us 1
aten::conv1d 0.69% 5.931us 14.37% 122.643us 40.881us 0.000us 0.00% 132.732us 44.244us 3
aten::convolution 1.16% 9.910us 13.67% 116.712us 38.904us 0.000us 0.00% 132.732us 44.244us 3
aten::_convolution 2.94% 25.098us 12.51% 106.802us 35.601us 0.000us 0.00% 132.732us 44.244us 3
aten::_conv_depthwise2d 2.63% 22.470us 7.66% 65.342us 21.781us 132.732us 71.76% 132.732us 44.244us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 132.732us 71.76% 132.732us 44.244us 3
aten::to 0.71% 6.042us 67.13% 572.943us 95.490us 0.000us 0.00% 62.175us 10.362us 6
aten::_to_copy 2.75% 23.470us 66.42% 566.901us 94.484us 0.000us 0.00% 62.175us 10.362us 6
aten::copy_ 6.00% 51.182us 60.05% 512.571us 85.428us 52.223us 28.24% 62.175us 10.362us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 29.343us 15.86% 29.343us 9.781us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.880us 12.37% 22.880us 7.627us 3
Activity Buffer Request 29.33% 250.295us 29.33% 250.295us 250.295us 9.952us 5.38% 9.952us 9.952us 1
aten::empty_strided 3.62% 30.860us 3.62% 30.860us 5.143us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 27.39% 233.736us 27.39% 233.736us 25.971us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 2.08% 17.752us 2.70% 23.071us 2.563us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 1.06% 9.008us 1.06% 9.008us 0.601us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 1.27% 10.820us 1.27% 10.820us 3.607us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 1.10% 9.410us 1.10% 9.410us 3.137us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.72% 6.112us 0.88% 7.511us 2.504us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 853.509us
Self CUDA time total: 184.955us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D2048_S512_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 355.227us 169.45% 355.227us 355.227us 1
torch_eager 15.18% 123.692us 99.29% 808.918us 808.918us 0.000us 0.00% 223.518us 223.518us 1
aten::conv1d 0.72% 5.860us 14.71% 119.853us 39.951us 0.000us 0.00% 153.470us 51.157us 3
aten::convolution 1.17% 9.541us 13.99% 113.993us 37.998us 0.000us 0.00% 153.470us 51.157us 3
aten::_convolution 3.03% 24.710us 12.82% 104.452us 34.817us 0.000us 0.00% 153.470us 51.157us 3
aten::_conv_depthwise2d 2.76% 22.461us 7.85% 63.951us 21.317us 153.470us 73.21% 153.470us 51.157us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 153.470us 73.21% 153.470us 51.157us 3
aten::to 0.75% 6.140us 65.95% 537.281us 89.547us 0.000us 0.00% 70.048us 11.675us 6
aten::_to_copy 2.84% 23.150us 65.19% 531.141us 88.524us 0.000us 0.00% 70.048us 11.675us 6
aten::copy_ 6.47% 52.731us 58.48% 476.471us 79.412us 56.160us 26.79% 70.048us 11.675us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 33.184us 15.83% 33.184us 11.061us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.976us 10.96% 22.976us 7.659us 3
Activity Buffer Request 26.55% 216.325us 26.55% 216.325us 216.325us 13.888us 6.63% 13.888us 13.888us 1
aten::empty_strided 3.87% 31.520us 3.87% 31.520us 5.253us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 28.13% 229.215us 28.13% 229.215us 25.468us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 2.20% 17.931us 2.85% 23.181us 2.576us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 1.08% 8.800us 1.08% 8.800us 0.587us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 1.20% 9.790us 1.20% 9.790us 3.263us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 1.22% 9.900us 1.22% 9.900us 3.300us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.73% 5.980us 0.90% 7.360us 2.453us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 814.738us
Self CUDA time total: 209.630us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D2048_S2048_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 6.92% 128.362us 54.16% 1.005ms 1.005ms 0.000us 0.00% 1.522ms 1.522ms 1
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.421ms 100.42% 1.421ms 1.421ms 1
aten::to 0.36% 6.621us 38.84% 720.727us 120.121us 0.000us 0.00% 826.557us 137.760us 6
aten::_to_copy 1.58% 29.231us 38.49% 714.106us 119.018us 0.000us 0.00% 826.557us 137.760us 6
aten::copy_ 2.91% 54.020us 26.66% 494.611us 82.435us 719.869us 50.86% 826.557us 137.760us 6
aten::conv1d 0.33% 6.200us 6.83% 126.803us 42.268us 0.000us 0.00% 695.450us 231.817us 3
aten::convolution 0.54% 10.000us 6.50% 120.603us 40.201us 0.000us 0.00% 695.450us 231.817us 3
aten::_convolution 1.37% 25.370us 5.96% 110.603us 36.868us 0.000us 0.00% 695.450us 231.817us 3
aten::_conv_depthwise2d 1.27% 23.500us 3.66% 67.942us 22.647us 695.450us 49.14% 695.450us 231.817us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 695.450us 49.14% 695.450us 231.817us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 408.829us 28.89% 408.829us 136.276us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 311.040us 21.98% 311.040us 103.680us 3
Activity Buffer Request 12.21% 226.485us 12.21% 226.485us 226.485us 106.688us 7.54% 106.688us 106.688us 1
aten::empty_strided 2.00% 37.161us 10.25% 190.264us 31.711us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 12.87% 238.737us 12.87% 238.737us 26.526us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.97% 18.050us 1.30% 24.121us 2.680us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.54% 9.951us 0.54% 9.951us 0.663us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.54% 10.110us 0.54% 10.110us 3.370us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.52% 9.701us 0.52% 9.701us 3.234us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.37% 6.860us 0.45% 8.350us 2.783us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.855ms
Self CUDA time total: 1.415ms
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D2048_S2048_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 4.16% 128.483us 66.51% 2.056ms 2.056ms 0.000us 0.00% 1.499ms 1.499ms 1
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.430ms 100.42% 1.430ms 1.430ms 1
aten::to 0.21% 6.492us 57.40% 1.775ms 295.822us 0.000us 0.00% 760.863us 126.811us 6
aten::_to_copy 0.82% 25.449us 57.19% 1.768ms 294.739us 0.000us 0.00% 760.863us 126.811us 6
aten::copy_ 1.66% 51.471us 55.36% 1.712ms 285.278us 686.079us 48.17% 760.863us 126.811us 6
aten::conv1d 0.22% 6.820us 4.02% 124.423us 41.474us 0.000us 0.00% 738.336us 246.112us 3
aten::convolution 0.33% 10.111us 3.80% 117.603us 39.201us 0.000us 0.00% 738.336us 246.112us 3
aten::_convolution 0.82% 25.320us 3.48% 107.492us 35.831us 0.000us 0.00% 738.336us 246.112us 3
aten::_conv_depthwise2d 0.75% 23.320us 2.10% 65.022us 21.674us 738.336us 51.83% 738.336us 246.112us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 738.336us 51.83% 738.336us 246.112us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 395.071us 27.74% 395.071us 131.690us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 291.008us 20.43% 291.008us 97.003us 3
Activity Buffer Request 46.92% 1.451ms 46.92% 1.451ms 1.451ms 74.784us 5.25% 74.784us 74.784us 1
aten::empty_strided 1.01% 31.321us 1.01% 31.321us 5.220us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 7.49% 231.634us 7.49% 231.634us 25.737us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.61% 18.861us 0.79% 24.350us 2.706us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.29% 9.099us 0.29% 9.099us 0.607us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.32% 9.981us 0.32% 9.981us 3.327us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.31% 9.461us 0.31% 9.461us 3.154us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.20% 6.260us 0.25% 7.650us 2.550us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 3.092ms
Self CUDA time total: 1.424ms
impl wl p50(ms) ok
torch_eager cuda_B2_D2048_S128_W2 0.09 True
torch_eager cuda_B2_D2048_S128_W4 0.09 True
torch_eager cuda_B2_D2048_S2048_W2 0.14 True
torch_eager cuda_B2_D2048_S2048_W4 0.16 True
torch_eager cuda_B2_D2048_S512_W2 0.09 True
torch_eager cuda_B2_D2048_S512_W4 0.09 True
torch_eager cuda_B2_D64_S128_W2 0.07 True
torch_eager cuda_B2_D64_S128_W4 0.09 True
torch_eager cuda_B2_D64_S2048_W2 0.09 True
torch_eager cuda_B2_D64_S2048_W4 0.09 True
torch_eager cuda_B2_D64_S512_W2 0.09 True
torch_eager cuda_B2_D64_S512_W4 0.09 True
torch_eager cuda_B4_D2048_S128_W2 0.09 True
torch_eager cuda_B4_D2048_S128_W4 0.09 True
torch_eager cuda_B4_D2048_S2048_W2 0.49 True
torch_eager cuda_B4_D2048_S2048_W4 0.50 True
torch_eager cuda_B4_D2048_S512_W2 0.10 True
torch_eager cuda_B4_D2048_S512_W4 0.10 True
torch_eager cuda_B4_D64_S128_W2 0.09 True
torch_eager cuda_B4_D64_S128_W4 0.09 True
torch_eager cuda_B4_D64_S2048_W2 0.09 True
torch_eager cuda_B4_D64_S2048_W4 0.09 True
torch_eager cuda_B4_D64_S512_W2 0.09 True
torch_eager cuda_B4_D64_S512_W4 0.09 True