Running causal_conv1d benchmark on cuda with 24 workloads.
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D64_S128_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 436.163us 2260.26% 436.163us 436.163us 1
torch_eager 9.04% 219.426us 99.40% 2.414ms 2.414ms 0.000us 0.00% 21.633us 21.633us 1
aten::to 0.42% 10.088us 82.02% 1.992ms 331.955us 0.000us 0.00% 14.273us 2.379us 6
aten::_to_copy 1.72% 41.805us 81.60% 1.982ms 330.274us 0.000us 0.00% 14.273us 2.379us 6
aten::copy_ 2.55% 61.862us 77.44% 1.881ms 313.438us 11.937us 61.86% 14.273us 2.379us 6
aten::conv1d 0.37% 9.100us 6.66% 161.626us 53.875us 0.000us 0.00% 7.360us 2.453us 3
aten::convolution 0.65% 15.771us 6.28% 152.526us 50.842us 0.000us 0.00% 7.360us 2.453us 3
aten::_convolution 1.37% 33.342us 5.63% 136.755us 45.585us 0.000us 0.00% 7.360us 2.453us 3
aten::_conv_depthwise2d 1.40% 34.091us 3.35% 81.282us 27.094us 7.360us 38.14% 7.360us 2.453us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.360us 38.14% 7.360us 2.453us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.304us 32.67% 6.304us 2.101us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.633us 29.19% 5.633us 1.878us 3
Activity Buffer Request 72.13% 1.752ms 72.13% 1.752ms 1.752ms 2.336us 12.11% 2.336us 2.336us 1
aten::empty_strided 2.44% 59.211us 2.44% 59.211us 9.868us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 3.74% 90.852us 3.74% 90.852us 10.095us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 1.11% 27.021us 1.43% 34.781us 3.865us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.54% 13.120us 0.54% 13.120us 0.875us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.50% 12.090us 0.50% 12.090us 4.030us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.47% 11.450us 0.47% 11.450us 3.817us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.39% 9.560us 0.46% 11.070us 3.690us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.428ms
Self CUDA time total: 19.297us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D64_S128_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 323.230us 1639.85% 323.230us 323.230us 1
torch_eager 5.66% 125.716us 99.73% 2.215ms 2.215ms 0.000us 0.00% 21.919us 21.919us 1
aten::to 0.28% 6.120us 87.62% 1.946ms 324.308us 0.000us 0.00% 13.951us 2.325us 6
aten::_to_copy 1.09% 24.242us 87.34% 1.940ms 323.288us 0.000us 0.00% 13.951us 2.325us 6
aten::copy_ 2.08% 46.140us 84.89% 1.885ms 314.223us 11.743us 59.58% 13.951us 2.325us 6
aten::conv1d 0.26% 5.800us 5.28% 117.333us 39.111us 0.000us 0.00% 7.968us 2.656us 3
aten::convolution 0.51% 11.340us 5.02% 111.533us 37.178us 0.000us 0.00% 7.968us 2.656us 3
aten::_convolution 1.02% 22.760us 4.51% 100.193us 33.398us 0.000us 0.00% 7.968us 2.656us 3
aten::_conv_depthwise2d 0.99% 21.901us 2.76% 61.233us 20.411us 7.968us 40.42% 7.968us 2.656us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.968us 40.42% 7.968us 2.656us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.176us 31.33% 6.176us 2.059us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.567us 28.24% 5.567us 1.856us 3
Activity Buffer Request 80.76% 1.794ms 80.76% 1.794ms 1.794ms 2.208us 11.20% 2.208us 2.208us 1
aten::empty_strided 1.36% 30.150us 1.36% 30.150us 5.025us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 2.98% 66.273us 2.98% 66.273us 7.364us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.76% 16.860us 0.98% 21.769us 2.419us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.39% 8.679us 0.39% 8.679us 0.579us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.42% 9.331us 0.42% 9.331us 3.110us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.42% 9.410us 0.42% 9.410us 3.137us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.27% 6.040us 0.33% 7.320us 2.440us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.221ms
Self CUDA time total: 19.711us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D64_S512_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 318.337us 1694.72% 318.337us 318.337us 1
torch_eager 5.96% 125.666us 99.74% 2.103ms 2.103ms 0.000us 0.00% 20.799us 20.799us 1
aten::to 0.29% 6.100us 87.07% 1.836ms 305.953us 0.000us 0.00% 13.822us 2.304us 6
aten::_to_copy 1.12% 23.630us 86.78% 1.830ms 304.936us 0.000us 0.00% 13.822us 2.304us 6
aten::copy_ 2.25% 47.463us 84.23% 1.776ms 295.964us 11.807us 62.86% 13.822us 2.304us 6
aten::conv1d 0.29% 6.140us 5.41% 114.033us 38.011us 0.000us 0.00% 6.977us 2.326us 3
aten::convolution 0.47% 9.960us 5.12% 107.893us 35.964us 0.000us 0.00% 6.977us 2.326us 3
aten::_convolution 1.11% 23.480us 4.65% 97.933us 32.644us 0.000us 0.00% 6.977us 2.326us 3
aten::_conv_depthwise2d 1.00% 21.010us 2.82% 59.553us 19.851us 6.977us 37.14% 6.977us 2.326us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 6.977us 37.14% 6.977us 2.326us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.047us 32.19% 6.047us 2.016us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.760us 30.66% 5.760us 1.920us 3
Activity Buffer Request 79.84% 1.683ms 79.84% 1.683ms 1.683ms 2.015us 10.73% 2.015us 2.015us 1
aten::empty_strided 1.43% 30.201us 1.43% 30.201us 5.033us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 3.12% 65.751us 3.12% 65.751us 7.306us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.82% 17.281us 1.06% 22.310us 2.479us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.41% 8.589us 0.41% 8.589us 0.573us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.43% 9.162us 0.43% 9.162us 3.054us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.41% 8.730us 0.41% 8.730us 2.910us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.25% 5.310us 0.31% 6.480us 2.160us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.108ms
Self CUDA time total: 18.784us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D64_S512_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 331.012us 1687.37% 331.012us 331.012us 1
torch_eager 5.07% 123.976us 99.78% 2.439ms 2.439ms 0.000us 0.00% 21.729us 21.729us 1
aten::to 0.26% 6.330us 88.80% 2.171ms 361.846us 0.000us 0.00% 14.016us 2.336us 6
aten::_to_copy 0.96% 23.551us 88.54% 2.165ms 360.791us 0.000us 0.00% 14.016us 2.336us 6
aten::copy_ 1.90% 46.440us 86.33% 2.111ms 351.776us 11.904us 60.68% 14.016us 2.336us 6
aten::conv1d 0.24% 5.859us 4.84% 118.253us 39.418us 0.000us 0.00% 7.713us 2.571us 3
aten::convolution 0.37% 9.110us 4.60% 112.394us 37.465us 0.000us 0.00% 7.713us 2.571us 3
aten::_convolution 0.99% 24.110us 4.22% 103.284us 34.428us 0.000us 0.00% 7.713us 2.571us 3
aten::_conv_depthwise2d 0.86% 20.950us 2.48% 60.661us 20.220us 7.713us 39.32% 7.713us 2.571us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.713us 39.32% 7.713us 2.571us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.176us 31.48% 6.176us 2.059us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.728us 29.20% 5.728us 1.909us 3
Activity Buffer Request 74.74% 1.827ms 74.74% 1.827ms 1.827ms 2.112us 10.77% 2.112us 2.112us 1
aten::empty_strided 1.25% 30.541us 1.25% 30.541us 5.090us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 10.54% 257.579us 10.54% 257.579us 28.620us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.76% 18.570us 0.98% 23.841us 2.649us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.35% 8.661us 0.35% 8.661us 0.577us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.39% 9.560us 0.39% 9.560us 3.187us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.39% 9.470us 0.39% 9.470us 3.157us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.30% 7.292us 0.35% 8.562us 2.854us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.445ms
Self CUDA time total: 19.617us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D64_S2048_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 323.714us 1318.86% 323.714us 323.714us 1
torch_eager 5.46% 123.145us 99.75% 2.249ms 2.249ms 0.000us 0.00% 26.849us 26.849us 1
aten::to 0.25% 5.731us 88.05% 1.985ms 330.837us 0.000us 0.00% 15.297us 2.550us 6
aten::_to_copy 1.06% 23.869us 87.80% 1.979ms 329.882us 0.000us 0.00% 15.297us 2.550us 6
aten::copy_ 2.13% 47.950us 85.40% 1.925ms 320.895us 12.993us 52.94% 15.297us 2.550us 6
aten::conv1d 0.25% 5.619us 5.08% 114.483us 38.161us 0.000us 0.00% 11.552us 3.851us 3
aten::convolution 0.41% 9.331us 4.83% 108.864us 36.288us 0.000us 0.00% 11.552us 3.851us 3
aten::_convolution 0.98% 22.020us 4.42% 99.533us 33.178us 0.000us 0.00% 11.552us 3.851us 3
aten::_conv_depthwise2d 0.92% 20.831us 2.69% 60.562us 20.187us 11.552us 47.06% 11.552us 3.851us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 11.552us 47.06% 11.552us 3.851us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.688us 27.25% 6.688us 2.229us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.305us 25.69% 6.305us 2.102us 3
Activity Buffer Request 74.33% 1.676ms 74.33% 1.676ms 1.676ms 2.304us 9.39% 2.304us 2.304us 1
aten::empty_strided 1.33% 30.051us 1.33% 30.051us 5.008us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 9.91% 223.307us 9.91% 223.307us 24.812us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.79% 17.850us 1.04% 23.451us 2.606us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.40% 8.951us 0.40% 8.951us 0.597us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.40% 9.121us 0.40% 9.121us 3.040us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.40% 8.980us 0.40% 8.980us 2.993us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.27% 6.160us 0.32% 7.280us 2.427us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.254ms
Self CUDA time total: 24.545us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D64_S2048_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 332.293us 1275.69% 332.293us 332.293us 1
torch_eager 4.94% 125.275us 99.79% 2.531ms 2.531ms 0.000us 0.00% 28.288us 28.288us 1
aten::to 0.24% 6.110us 89.17% 2.261ms 376.887us 0.000us 0.00% 15.199us 2.533us 6
aten::_to_copy 0.97% 24.638us 88.93% 2.255ms 375.868us 0.000us 0.00% 15.199us 2.533us 6
aten::copy_ 1.89% 47.902us 86.77% 2.200ms 366.727us 12.959us 49.75% 15.199us 2.533us 6
aten::conv1d 0.23% 5.820us 4.59% 116.433us 38.811us 0.000us 0.00% 13.089us 4.363us 3
aten::convolution 0.40% 10.180us 4.36% 110.613us 36.871us 0.000us 0.00% 13.089us 4.363us 3
aten::_convolution 0.89% 22.520us 3.96% 100.433us 33.478us 0.000us 0.00% 13.089us 4.363us 3
aten::_conv_depthwise2d 0.83% 21.002us 2.40% 60.753us 20.251us 13.089us 50.25% 13.089us 4.363us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 13.089us 50.25% 13.089us 4.363us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.591us 25.30% 6.591us 2.197us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.368us 24.45% 6.368us 2.123us 3
Activity Buffer Request 77.38% 1.962ms 77.38% 1.962ms 1.962ms 2.240us 8.60% 2.240us 2.240us 1
aten::empty_strided 1.19% 30.211us 1.19% 30.211us 5.035us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 8.30% 210.437us 8.30% 210.437us 23.382us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.72% 18.342us 0.94% 23.791us 2.643us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.36% 9.059us 0.36% 9.059us 0.604us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.39% 9.940us 0.39% 9.940us 3.313us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.37% 9.490us 0.37% 9.490us 3.163us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.28% 7.000us 0.33% 8.470us 2.823us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.536ms
Self CUDA time total: 26.048us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D2048_S128_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 342.974us 895.45% 342.974us 342.974us 1
torch_eager 6.28% 155.441us 99.76% 2.468ms 2.468ms 0.000us 0.00% 40.893us 40.893us 1
aten::conv1d 0.26% 6.400us 4.96% 122.693us 40.898us 0.000us 0.00% 22.559us 7.520us 3
aten::convolution 0.44% 10.880us 4.70% 116.293us 38.764us 0.000us 0.00% 22.559us 7.520us 3
aten::_convolution 1.02% 25.290us 4.26% 105.413us 35.138us 0.000us 0.00% 22.559us 7.520us 3
aten::_conv_depthwise2d 0.93% 22.901us 2.53% 62.512us 20.837us 22.559us 58.90% 22.559us 7.520us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 22.559us 58.90% 22.559us 7.520us 3
aten::to 0.27% 6.673us 87.38% 2.161ms 360.194us 0.000us 0.00% 18.334us 3.056us 6
aten::_to_copy 1.14% 28.279us 87.11% 2.154ms 359.082us 0.000us 0.00% 18.334us 3.056us 6
aten::copy_ 2.07% 51.091us 84.63% 2.093ms 348.865us 15.743us 41.10% 18.334us 3.056us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.352us 21.81% 8.352us 2.784us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.391us 19.30% 7.391us 2.464us 3
Activity Buffer Request 70.52% 1.744ms 70.52% 1.744ms 1.744ms 2.591us 6.76% 2.591us 2.591us 1
aten::empty_strided 1.34% 33.023us 1.34% 33.023us 5.504us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 12.90% 319.187us 12.90% 319.187us 35.465us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.77% 18.952us 0.98% 24.192us 2.688us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.37% 9.200us 0.37% 9.200us 0.613us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.37% 9.140us 0.37% 9.140us 3.047us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.37% 9.211us 0.37% 9.211us 3.070us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.27% 6.591us 0.33% 8.191us 2.730us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.473ms
Self CUDA time total: 38.302us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D2048_S128_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 320.766us 781.90% 320.766us 320.766us 1
torch_eager 5.17% 120.612us 99.77% 2.326ms 2.326ms 0.000us 0.00% 43.616us 43.616us 1
aten::conv1d 0.25% 5.870us 4.90% 114.223us 38.074us 0.000us 0.00% 25.312us 8.437us 3
aten::convolution 0.44% 10.332us 4.65% 108.353us 36.118us 0.000us 0.00% 25.312us 8.437us 3
aten::_convolution 0.97% 22.638us 4.20% 98.021us 32.674us 0.000us 0.00% 25.312us 8.437us 3
aten::_conv_depthwise2d 0.87% 20.270us 2.54% 59.242us 19.747us 25.312us 61.70% 25.312us 8.437us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 25.312us 61.70% 25.312us 8.437us 3
aten::to 0.24% 5.591us 88.58% 2.066ms 344.257us 0.000us 0.00% 18.304us 3.051us 6
aten::_to_copy 1.00% 23.430us 88.34% 2.060ms 343.325us 0.000us 0.00% 18.304us 3.051us 6
aten::copy_ 2.09% 48.750us 86.05% 2.006ms 334.417us 15.712us 38.30% 18.304us 3.051us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.384us 20.44% 8.384us 2.795us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.328us 17.86% 7.328us 2.443us 3
Activity Buffer Request 74.74% 1.743ms 74.74% 1.743ms 1.743ms 2.592us 6.32% 2.592us 2.592us 1
aten::empty_strided 1.29% 30.021us 1.29% 30.021us 5.004us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 10.10% 235.617us 10.10% 235.617us 26.180us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.74% 17.250us 0.95% 22.130us 2.459us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.36% 8.290us 0.36% 8.290us 0.553us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.39% 9.191us 0.39% 9.191us 3.064us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.39% 8.990us 0.39% 8.990us 2.997us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.27% 6.321us 0.32% 7.551us 2.517us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.332ms
Self CUDA time total: 41.024us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D2048_S512_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 327.997us 319.51% 327.997us 327.997us 1
torch_eager 5.09% 118.124us 99.77% 2.314ms 2.314ms 0.000us 0.00% 108.574us 108.574us 1
aten::conv1d 0.23% 5.441us 4.97% 115.193us 38.398us 0.000us 0.00% 70.464us 23.488us 3
aten::convolution 0.40% 9.290us 4.73% 109.752us 36.584us 0.000us 0.00% 70.464us 23.488us 3
aten::_convolution 1.03% 23.820us 4.33% 100.462us 33.487us 0.000us 0.00% 70.464us 23.488us 3
aten::_conv_depthwise2d 0.94% 21.771us 2.65% 61.512us 20.504us 70.464us 68.64% 70.464us 23.488us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 70.464us 68.64% 70.464us 23.488us 3
aten::to 0.25% 5.729us 88.48% 2.052ms 341.956us 0.000us 0.00% 38.110us 6.352us 6
aten::_to_copy 1.03% 23.980us 88.23% 2.046ms 341.002us 0.000us 0.00% 38.110us 6.352us 6
aten::copy_ 2.02% 46.950us 85.91% 1.992ms 332.012us 32.191us 31.36% 38.110us 6.352us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 17.567us 17.11% 17.567us 5.856us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 14.624us 14.25% 14.624us 4.875us 3
Activity Buffer Request 76.17% 1.766ms 76.17% 1.766ms 1.766ms 5.919us 5.77% 5.919us 5.919us 1
aten::empty_strided 1.29% 29.961us 1.29% 29.961us 4.993us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 8.62% 199.926us 8.62% 199.926us 22.214us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.81% 18.769us 1.04% 24.230us 2.692us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.38% 8.852us 0.38% 8.852us 0.590us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.41% 9.440us 0.41% 9.440us 3.147us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.40% 9.240us 0.40% 9.240us 3.080us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.24% 5.490us 0.29% 6.730us 2.243us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.319ms
Self CUDA time total: 102.655us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D2048_S512_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 324.863us 288.98% 324.863us 324.863us 1
torch_eager 5.27% 120.370us 99.77% 2.281ms 2.281ms 0.000us 0.00% 118.337us 118.337us 1
aten::conv1d 0.25% 5.800us 4.98% 113.813us 37.938us 0.000us 0.00% 80.416us 26.805us 3
aten::convolution 0.39% 8.920us 4.72% 108.013us 36.004us 0.000us 0.00% 80.416us 26.805us 3
aten::_convolution 0.94% 21.602us 4.33% 99.093us 33.031us 0.000us 0.00% 80.416us 26.805us 3
aten::_conv_depthwise2d 0.98% 22.480us 2.74% 62.561us 20.854us 80.416us 71.53% 80.416us 26.805us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 80.416us 71.53% 80.416us 26.805us 3
aten::to 0.26% 6.002us 88.33% 2.019ms 336.570us 0.000us 0.00% 37.921us 6.320us 6
aten::_to_copy 0.97% 22.148us 88.07% 2.013ms 335.570us 0.000us 0.00% 37.921us 6.320us 6
aten::copy_ 2.05% 46.852us 85.75% 1.960ms 326.725us 32.001us 28.47% 37.921us 6.320us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 17.472us 15.54% 17.472us 5.824us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 14.529us 12.92% 14.529us 4.843us 3
Activity Buffer Request 76.07% 1.739ms 76.07% 1.739ms 1.739ms 5.920us 5.27% 5.920us 5.920us 1
aten::empty_strided 1.35% 30.921us 1.35% 30.921us 5.153us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 8.60% 196.555us 8.60% 196.555us 21.839us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.81% 18.430us 1.02% 23.240us 2.582us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.36% 8.150us 0.36% 8.150us 0.543us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.40% 9.159us 0.40% 9.159us 3.053us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.39% 8.901us 0.39% 8.901us 2.967us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.24% 5.410us 0.29% 6.670us 2.223us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.286ms
Self CUDA time total: 112.417us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D2048_S2048_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 4.92% 118.303us 96.45% 2.317ms 2.317ms 0.000us 0.00% 464.513us 464.513us 1
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 454.847us 106.94% 454.847us 454.847us 1
aten::conv1d 0.23% 5.570us 4.84% 116.393us 38.798us 0.000us 0.00% 280.159us 93.386us 3
aten::convolution 0.41% 9.781us 4.61% 110.823us 36.941us 0.000us 0.00% 280.159us 93.386us 3
aten::_convolution 0.97% 23.190us 4.21% 101.042us 33.681us 0.000us 0.00% 280.159us 93.386us 3
aten::_conv_depthwise2d 0.92% 22.172us 2.56% 61.572us 20.524us 280.159us 65.87% 280.159us 93.386us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 280.159us 65.87% 280.159us 93.386us 3
aten::to 0.25% 6.010us 85.54% 2.055ms 342.552us 0.000us 0.00% 184.354us 30.726us 6
aten::_to_copy 1.07% 25.702us 85.29% 2.049ms 341.550us 0.000us 0.00% 184.354us 30.726us 6
aten::copy_ 1.90% 45.633us 82.97% 1.993ms 332.247us 145.185us 34.13% 184.354us 30.726us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 104.993us 24.68% 104.993us 34.998us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.192us 9.45% 40.192us 13.397us 3
Activity Buffer Request 73.74% 1.772ms 73.74% 1.772ms 1.772ms 39.169us 9.21% 39.169us 39.169us 1
aten::empty_strided 1.25% 30.119us 1.25% 30.119us 5.020us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 8.18% 196.512us 8.18% 196.512us 21.835us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.74% 17.761us 0.97% 23.251us 2.583us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.39% 9.350us 0.39% 9.350us 0.623us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.39% 9.400us 0.39% 9.400us 3.133us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.40% 9.580us 0.40% 9.580us 3.193us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.27% 6.480us 0.34% 8.150us 2.717us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.403ms
Self CUDA time total: 425.344us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D2048_S2048_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 5.04% 119.094us 95.77% 2.263ms 2.263ms 0.000us 0.00% 471.612us 471.612us 1
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 465.149us 106.82% 465.149us 465.149us 1
aten::conv1d 0.25% 5.900us 4.82% 113.812us 37.937us 0.000us 0.00% 297.789us 99.263us 3
aten::convolution 0.42% 9.940us 4.57% 107.912us 35.971us 0.000us 0.00% 297.789us 99.263us 3
aten::_convolution 0.91% 21.591us 4.15% 97.972us 32.657us 0.000us 0.00% 297.789us 99.263us 3
aten::_conv_depthwise2d 0.88% 20.792us 2.56% 60.392us 20.131us 297.789us 68.39% 297.789us 99.263us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 297.789us 68.39% 297.789us 99.263us 3
aten::to 0.25% 5.882us 84.81% 2.004ms 334.050us 0.000us 0.00% 173.823us 28.970us 6
aten::_to_copy 1.03% 24.309us 84.56% 1.998ms 333.069us 0.000us 0.00% 173.823us 28.970us 6
aten::copy_ 1.99% 46.992us 82.30% 1.945ms 324.149us 137.663us 31.61% 173.823us 28.970us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 97.280us 22.34% 97.280us 32.427us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.383us 9.27% 40.383us 13.461us 3
Activity Buffer Request 72.83% 1.721ms 72.83% 1.721ms 1.721ms 36.160us 8.30% 36.160us 36.160us 1
aten::empty_strided 1.24% 29.211us 1.24% 29.211us 4.869us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 8.35% 197.423us 8.35% 197.423us 21.936us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.69% 16.359us 0.90% 21.340us 2.371us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.37% 8.812us 0.37% 8.812us 0.587us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.41% 9.620us 0.41% 9.620us 3.207us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.40% 9.410us 0.40% 9.410us 3.137us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.27% 6.299us 0.32% 7.670us 2.557us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.363ms
Self CUDA time total: 435.452us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D64_S128_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 313.695us 1667.17% 313.695us 313.695us 1
torch_eager 14.13% 113.614us 99.32% 798.339us 798.339us 0.000us 0.00% 20.800us 20.800us 1
aten::to 0.70% 5.659us 68.10% 547.391us 91.232us 0.000us 0.00% 13.664us 2.277us 6
aten::_to_copy 3.00% 24.130us 67.40% 541.732us 90.289us 0.000us 0.00% 13.664us 2.277us 6
aten::copy_ 5.94% 47.780us 60.67% 487.691us 81.282us 11.680us 62.07% 13.664us 2.277us 6
aten::conv1d 0.70% 5.600us 14.11% 113.433us 37.811us 0.000us 0.00% 7.136us 2.379us 3
aten::convolution 1.23% 9.922us 13.42% 107.833us 35.944us 0.000us 0.00% 7.136us 2.379us 3
aten::_convolution 2.79% 22.410us 12.18% 97.911us 32.637us 0.000us 0.00% 7.136us 2.379us 3
aten::_conv_depthwise2d 2.59% 20.852us 7.49% 60.242us 20.081us 7.136us 37.93% 7.136us 2.379us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.136us 37.93% 7.136us 2.379us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 5.984us 31.80% 5.984us 1.995us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.696us 30.27% 5.696us 1.899us 3
Activity Buffer Request 32.74% 263.147us 32.74% 263.147us 263.147us 1.984us 10.54% 1.984us 1.984us 1
aten::empty_strided 3.72% 29.911us 3.72% 29.911us 4.985us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 24.67% 198.304us 24.67% 198.304us 22.034us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 1.90% 15.298us 2.42% 19.460us 2.162us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.96% 7.694us 0.96% 7.694us 0.513us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 1.11% 8.950us 1.11% 8.950us 2.983us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 1.11% 8.900us 1.11% 8.900us 2.967us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.80% 6.399us 0.97% 7.810us 2.603us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 803.779us
Self CUDA time total: 18.816us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D64_S128_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 315.484us 1613.56% 315.484us 315.484us 1
torch_eager 14.44% 114.641us 99.35% 788.539us 788.539us 0.000us 0.00% 21.568us 21.568us 1
aten::to 0.74% 5.859us 67.65% 536.963us 89.494us 0.000us 0.00% 13.632us 2.272us 6
aten::_to_copy 3.05% 24.201us 66.91% 531.104us 88.517us 0.000us 0.00% 13.632us 2.272us 6
aten::copy_ 6.01% 47.671us 60.17% 477.592us 79.599us 11.616us 59.41% 13.632us 2.272us 6
aten::conv1d 0.73% 5.820us 14.10% 111.923us 37.308us 0.000us 0.00% 7.936us 2.645us 3
aten::convolution 1.13% 8.991us 13.37% 106.103us 35.368us 0.000us 0.00% 7.936us 2.645us 3
aten::_convolution 2.72% 21.550us 12.24% 97.112us 32.371us 0.000us 0.00% 7.936us 2.645us 3
aten::_conv_depthwise2d 2.66% 21.080us 7.67% 60.892us 20.297us 7.936us 40.59% 7.936us 2.645us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.936us 40.59% 7.936us 2.645us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 5.920us 30.28% 5.920us 1.973us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.696us 29.13% 5.696us 1.899us 3
Activity Buffer Request 32.17% 255.357us 32.17% 255.357us 255.357us 2.016us 10.31% 2.016us 2.016us 1
aten::empty_strided 3.69% 29.311us 3.69% 29.311us 4.885us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 24.68% 195.894us 24.68% 195.894us 21.766us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 2.03% 16.101us 2.61% 20.721us 2.302us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 1.02% 8.070us 1.02% 8.070us 0.538us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 1.14% 9.082us 1.14% 9.082us 3.027us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 1.18% 9.400us 1.18% 9.400us 3.133us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.66% 5.230us 0.82% 6.480us 2.160us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 793.700us
Self CUDA time total: 19.552us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D64_S512_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 319.227us 1646.26% 319.227us 319.227us 1
torch_eager 14.35% 115.202us 99.35% 797.750us 797.750us 0.000us 0.00% 21.567us 21.567us 1
aten::to 0.73% 5.879us 67.70% 543.593us 90.599us 0.000us 0.00% 14.367us 2.394us 6
aten::_to_copy 2.99% 23.982us 66.96% 537.714us 89.619us 0.000us 0.00% 14.367us 2.394us 6
aten::copy_ 5.85% 46.982us 60.24% 483.712us 80.619us 12.191us 62.87% 14.367us 2.394us 6
aten::conv1d 0.69% 5.560us 14.06% 112.933us 37.644us 0.000us 0.00% 7.200us 2.400us 3
aten::convolution 1.13% 9.060us 13.37% 107.373us 35.791us 0.000us 0.00% 7.200us 2.400us 3
aten::_convolution 2.73% 21.891us 12.24% 98.313us 32.771us 0.000us 0.00% 7.200us 2.400us 3
aten::_conv_depthwise2d 2.58% 20.719us 7.42% 59.602us 19.867us 7.200us 37.13% 7.200us 2.400us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.200us 37.13% 7.200us 2.400us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.272us 32.34% 6.272us 2.091us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.919us 30.52% 5.919us 1.973us 3
Activity Buffer Request 32.74% 262.897us 32.74% 262.897us 262.897us 2.176us 11.22% 2.176us 2.176us 1
aten::empty_strided 3.74% 30.020us 3.74% 30.020us 5.003us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 24.16% 194.015us 24.16% 194.015us 21.557us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 2.13% 17.072us 2.73% 21.901us 2.433us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 1.03% 8.259us 1.03% 8.259us 0.551us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 1.15% 9.210us 1.15% 9.210us 3.070us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 1.18% 9.491us 1.18% 9.491us 3.164us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.90% 7.200us 1.06% 8.540us 2.847us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 802.990us
Self CUDA time total: 19.391us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D64_S512_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 318.585us 1587.92% 318.585us 318.585us 1
torch_eager 14.32% 114.402us 99.36% 793.540us 793.540us 0.000us 0.00% 22.271us 22.271us 1
aten::to 0.74% 5.889us 67.47% 538.893us 89.816us 0.000us 0.00% 14.368us 2.395us 6
aten::_to_copy 2.86% 22.853us 66.74% 533.004us 88.834us 0.000us 0.00% 14.368us 2.395us 6
aten::copy_ 5.91% 47.181us 60.05% 479.601us 79.933us 12.160us 60.61% 14.368us 2.395us 6
aten::conv1d 0.72% 5.730us 14.38% 114.863us 38.288us 0.000us 0.00% 7.903us 2.634us 3
aten::convolution 1.15% 9.210us 13.66% 109.133us 36.378us 0.000us 0.00% 7.903us 2.634us 3
aten::_convolution 2.90% 23.191us 12.51% 99.923us 33.308us 0.000us 0.00% 7.903us 2.634us 3
aten::_conv_depthwise2d 2.58% 20.640us 7.49% 59.852us 19.951us 7.903us 39.39% 7.903us 2.634us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.903us 39.39% 7.903us 2.634us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.304us 31.42% 6.304us 2.101us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.856us 29.19% 5.856us 1.952us 3
Activity Buffer Request 32.57% 260.117us 32.57% 260.117us 260.117us 2.208us 11.01% 2.208us 2.208us 1
aten::empty_strided 3.83% 30.550us 3.83% 30.550us 5.092us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 24.14% 192.815us 24.14% 192.815us 21.424us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 2.21% 17.653us 2.81% 22.412us 2.490us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 1.03% 8.198us 1.03% 8.198us 0.547us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 1.14% 9.130us 1.14% 9.130us 3.043us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 1.20% 9.570us 1.20% 9.570us 3.190us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.77% 6.130us 0.93% 7.460us 2.487us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 798.670us
Self CUDA time total: 20.063us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D64_S2048_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 325.920us 904.53% 325.920us 325.920us 1
torch_eager 14.34% 116.794us 99.35% 809.360us 809.360us 0.000us 0.00% 38.624us 38.624us 1
aten::conv1d 0.72% 5.870us 14.19% 115.602us 38.534us 0.000us 0.00% 20.192us 6.731us 3
aten::convolution 1.16% 9.420us 13.47% 109.732us 36.577us 0.000us 0.00% 20.192us 6.731us 3
aten::_convolution 2.80% 22.850us 12.31% 100.312us 33.437us 0.000us 0.00% 20.192us 6.731us 3
aten::_conv_depthwise2d 2.58% 21.022us 7.49% 61.052us 20.351us 20.192us 56.04% 20.192us 6.731us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 20.192us 56.04% 20.192us 6.731us 3
aten::to 0.73% 5.980us 67.60% 550.703us 91.784us 0.000us 0.00% 18.432us 3.072us 6
aten::_to_copy 2.92% 23.789us 66.86% 544.723us 90.787us 0.000us 0.00% 18.432us 3.072us 6
aten::copy_ 5.93% 48.281us 60.23% 490.683us 81.781us 15.840us 43.96% 18.432us 3.072us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.448us 23.45% 8.448us 2.816us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.392us 20.52% 7.392us 2.464us 3
Activity Buffer Request 30.82% 251.076us 30.82% 251.076us 251.076us 2.592us 7.19% 2.592us 2.592us 1
aten::empty_strided 3.71% 30.251us 3.71% 30.251us 5.042us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 26.06% 212.336us 26.06% 212.336us 23.593us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 2.07% 16.901us 2.67% 21.731us 2.415us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 1.01% 8.191us 1.01% 8.191us 0.546us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 1.17% 9.540us 1.17% 9.540us 3.180us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 1.16% 9.480us 1.16% 9.480us 3.160us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.88% 7.140us 1.02% 8.340us 2.780us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 814.681us
Self CUDA time total: 36.032us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D64_S2048_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 322.527us 848.40% 322.527us 322.527us 1
torch_eager 5.15% 118.831us 99.77% 2.301ms 2.301ms 0.000us 0.00% 40.608us 40.608us 1
aten::conv1d 0.25% 5.790us 4.95% 114.162us 38.054us 0.000us 0.00% 22.272us 7.424us 3
aten::convolution 0.40% 9.280us 4.70% 108.372us 36.124us 0.000us 0.00% 22.272us 7.424us 3
aten::_convolution 0.97% 22.470us 4.30% 99.092us 33.031us 0.000us 0.00% 22.272us 7.424us 3
aten::_conv_depthwise2d 0.93% 21.479us 2.62% 60.532us 20.177us 22.272us 58.59% 22.272us 7.424us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 22.272us 58.59% 22.272us 7.424us 3
aten::to 0.26% 6.081us 88.49% 2.041ms 340.127us 0.000us 0.00% 18.336us 3.056us 6
aten::_to_copy 1.04% 23.880us 88.22% 2.035ms 339.113us 0.000us 0.00% 18.336us 3.056us 6
aten::copy_ 2.05% 47.372us 85.91% 1.981ms 330.228us 15.744us 41.41% 18.336us 3.056us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.416us 22.14% 8.416us 2.805us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.328us 19.28% 7.328us 2.443us 3
Activity Buffer Request 76.31% 1.760ms 76.31% 1.760ms 1.760ms 2.592us 6.82% 2.592us 2.592us 1
aten::empty_strided 1.28% 29.431us 1.28% 29.431us 4.905us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 8.43% 194.465us 8.43% 194.465us 21.607us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.76% 17.531us 0.99% 22.821us 2.536us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.40% 9.119us 0.40% 9.119us 0.608us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.40% 9.181us 0.40% 9.181us 3.060us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.41% 9.511us 0.41% 9.511us 3.170us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.28% 6.360us 0.35% 8.040us 2.680us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.306ms
Self CUDA time total: 38.016us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D2048_S128_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 334.615us 522.09% 334.615us 334.615us 1
torch_eager 14.27% 115.903us 99.37% 807.269us 807.269us 0.000us 0.00% 68.188us 68.188us 1
aten::conv1d 0.69% 5.580us 15.60% 126.753us 42.251us 0.000us 0.00% 41.662us 13.887us 3
aten::convolution 1.31% 10.620us 14.92% 121.173us 40.391us 0.000us 0.00% 41.662us 13.887us 3
aten::_convolution 3.95% 32.112us 13.61% 110.553us 36.851us 0.000us 0.00% 41.662us 13.887us 3
aten::_conv_depthwise2d 2.71% 21.990us 7.82% 63.492us 21.164us 41.662us 65.00% 41.662us 13.887us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 41.662us 65.00% 41.662us 13.887us 3
aten::to 0.71% 5.728us 66.32% 538.813us 89.802us 0.000us 0.00% 26.526us 4.421us 6
aten::_to_copy 2.82% 22.881us 65.62% 533.085us 88.848us 0.000us 0.00% 26.526us 4.421us 6
aten::copy_ 5.89% 47.814us 59.03% 479.514us 79.919us 22.430us 35.00% 26.526us 4.421us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 18.72% 12.000us 4.000us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.430us 16.27% 10.430us 3.477us 3
Activity Buffer Request 31.09% 252.576us 31.09% 252.576us 252.576us 4.096us 6.39% 4.096us 4.096us 1
aten::empty_strided 3.78% 30.690us 3.78% 30.690us 5.115us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 24.86% 201.964us 24.86% 201.964us 22.440us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 2.01% 16.320us 2.66% 21.600us 2.400us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 1.07% 8.700us 1.07% 8.700us 0.580us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 1.17% 9.541us 1.17% 9.541us 3.180us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 1.12% 9.121us 1.12% 9.121us 3.040us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.69% 5.569us 0.85% 6.929us 2.310us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 812.390us
Self CUDA time total: 64.092us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D2048_S128_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 333.628us 479.14% 333.628us 333.628us 1
torch_eager 20.40% 175.467us 99.34% 854.292us 854.292us 0.000us 0.00% 73.758us 73.758us 1
aten::conv1d 0.66% 5.659us 14.42% 124.043us 41.348us 0.000us 0.00% 47.231us 15.744us 3
aten::convolution 1.14% 9.780us 13.77% 118.384us 39.461us 0.000us 0.00% 47.231us 15.744us 3
aten::_convolution 2.67% 22.962us 12.63% 108.604us 36.201us 0.000us 0.00% 47.231us 15.744us 3
aten::_conv_depthwise2d 2.49% 21.432us 8.04% 69.183us 23.061us 47.231us 67.83% 47.231us 15.744us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 47.231us 67.83% 47.231us 15.744us 3
aten::to 0.69% 5.899us 61.45% 528.412us 88.069us 0.000us 0.00% 26.527us 4.421us 6
aten::_to_copy 2.66% 22.900us 60.76% 522.513us 87.086us 0.000us 0.00% 26.527us 4.421us 6
aten::copy_ 5.49% 47.171us 54.63% 469.832us 78.305us 22.399us 32.17% 26.527us 4.421us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 11.968us 17.19% 11.968us 3.989us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.431us 14.98% 10.431us 3.477us 3
Activity Buffer Request 28.72% 246.977us 28.72% 246.977us 246.977us 4.128us 5.93% 4.128us 4.128us 1
aten::empty_strided 3.46% 29.781us 3.46% 29.781us 4.964us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 22.90% 196.964us 22.90% 196.964us 21.885us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 1.98% 17.050us 2.57% 22.090us 2.454us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 1.01% 8.700us 1.01% 8.700us 0.580us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 1.95% 16.810us 1.95% 16.810us 5.603us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 1.12% 9.661us 1.12% 9.661us 3.220us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.74% 6.399us 0.92% 7.879us 2.626us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 859.952us
Self CUDA time total: 69.630us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D2048_S512_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 346.806us 186.86% 346.806us 346.806us 1
torch_eager 14.53% 117.325us 99.36% 802.090us 802.090us 0.000us 0.00% 195.516us 195.516us 1
aten::conv1d 0.70% 5.679us 15.36% 123.982us 41.327us 0.000us 0.00% 133.212us 44.404us 3
aten::convolution 1.23% 9.941us 14.65% 118.303us 39.434us 0.000us 0.00% 133.212us 44.404us 3
aten::_convolution 2.76% 22.310us 13.42% 108.362us 36.121us 0.000us 0.00% 133.212us 44.404us 3
aten::_conv_depthwise2d 3.54% 28.551us 8.44% 68.171us 22.724us 133.212us 71.78% 133.212us 44.404us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 133.212us 71.78% 133.212us 44.404us 3
aten::to 0.73% 5.930us 66.21% 534.532us 89.089us 0.000us 0.00% 62.304us 10.384us 6
aten::_to_copy 2.86% 23.081us 65.48% 528.602us 88.100us 0.000us 0.00% 62.304us 10.384us 6
aten::copy_ 5.96% 48.102us 58.83% 474.941us 79.157us 52.384us 28.22% 62.304us 10.384us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 29.504us 15.90% 29.504us 9.835us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.880us 12.33% 22.880us 7.627us 3
Activity Buffer Request 31.11% 251.176us 31.11% 251.176us 251.176us 9.920us 5.34% 9.920us 9.920us 1
aten::empty_strided 3.79% 30.580us 3.79% 30.580us 5.097us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 24.34% 196.463us 24.34% 196.463us 21.829us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 2.06% 16.611us 2.71% 21.892us 2.432us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 1.12% 9.041us 1.12% 9.041us 0.603us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 1.13% 9.090us 1.13% 9.090us 3.030us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 1.21% 9.730us 1.21% 9.730us 3.243us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.95% 7.689us 1.15% 9.269us 3.090us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 807.270us
Self CUDA time total: 185.596us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D2048_S512_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 339.579us 162.49% 339.579us 339.579us 1
torch_eager 14.47% 115.372us 99.34% 792.230us 792.230us 0.000us 0.00% 222.202us 222.202us 1
aten::conv1d 0.71% 5.650us 14.39% 114.783us 38.261us 0.000us 0.00% 153.629us 51.210us 3
aten::convolution 1.12% 8.960us 13.68% 109.133us 36.378us 0.000us 0.00% 153.629us 51.210us 3
aten::_convolution 2.81% 22.409us 12.56% 100.173us 33.391us 0.000us 0.00% 153.629us 51.210us 3
aten::_conv_depthwise2d 2.65% 21.140us 7.72% 61.571us 20.524us 153.629us 73.51% 153.629us 51.210us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 153.629us 73.51% 153.629us 51.210us 3
aten::to 0.74% 5.880us 67.17% 535.703us 89.284us 0.000us 0.00% 68.573us 11.429us 6
aten::_to_copy 2.96% 23.602us 66.44% 529.823us 88.304us 0.000us 0.00% 68.573us 11.429us 6
aten::copy_ 5.90% 47.080us 59.79% 476.780us 79.463us 55.357us 26.49% 68.573us 11.429us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 32.351us 15.48% 32.351us 10.784us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 23.006us 11.01% 23.006us 7.669us 3
Activity Buffer Request 31.17% 248.536us 31.17% 248.536us 248.536us 13.216us 6.32% 13.216us 13.216us 1
aten::empty_strided 3.69% 29.441us 3.69% 29.441us 4.907us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 25.34% 202.095us 25.34% 202.095us 22.455us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 2.16% 17.223us 2.82% 22.464us 2.496us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 1.10% 8.783us 1.10% 8.783us 0.586us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 1.15% 9.160us 1.15% 9.160us 3.053us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 1.30% 10.340us 1.30% 10.340us 3.447us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.78% 6.220us 0.94% 7.501us 2.500us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 797.480us
Self CUDA time total: 208.986us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D2048_S2048_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 6.78% 123.653us 52.87% 964.884us 964.884us 0.000us 0.00% 1.509ms 1.509ms 1
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.412ms 100.39% 1.412ms 1.412ms 1
aten::to 0.34% 6.170us 38.05% 694.387us 115.731us 0.000us 0.00% 816.698us 136.116us 6
aten::_to_copy 1.57% 28.621us 37.71% 688.217us 114.703us 0.000us 0.00% 816.698us 136.116us 6
aten::copy_ 2.79% 50.981us 25.74% 469.702us 78.284us 713.755us 50.75% 816.698us 136.116us 6
aten::conv1d 0.31% 5.748us 6.52% 119.002us 39.667us 0.000us 0.00% 692.571us 230.857us 3
aten::convolution 0.58% 10.581us 6.21% 113.254us 37.751us 0.000us 0.00% 692.571us 230.857us 3
aten::_convolution 1.23% 22.532us 5.63% 102.673us 34.224us 0.000us 0.00% 692.571us 230.857us 3
aten::_conv_depthwise2d 1.21% 22.101us 3.46% 63.221us 21.074us 692.571us 49.25% 692.571us 230.857us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 692.571us 49.25% 692.571us 230.857us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 406.878us 28.93% 406.878us 135.626us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 306.877us 21.82% 306.877us 102.292us 3
Activity Buffer Request 13.13% 239.566us 13.13% 239.566us 239.566us 102.943us 7.32% 102.943us 102.943us 1
aten::empty_strided 2.01% 36.730us 10.40% 189.894us 31.649us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 11.02% 201.035us 11.02% 201.035us 22.337us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.99% 18.011us 1.28% 23.402us 2.600us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.50% 9.042us 0.50% 9.042us 0.603us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.52% 9.410us 0.52% 9.410us 3.137us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.54% 9.830us 0.54% 9.830us 3.277us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.39% 7.200us 0.47% 8.510us 2.837us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.825ms
Self CUDA time total: 1.406ms
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D2048_S2048_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 3.67% 125.416us 69.04% 2.359ms 2.359ms 0.000us 0.00% 1.503ms 1.503ms 1
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.434ms 100.40% 1.434ms 1.434ms 1
aten::to 0.17% 5.940us 61.15% 2.089ms 348.142us 0.000us 0.00% 760.702us 126.784us 6
aten::_to_copy 0.72% 24.663us 60.97% 2.083ms 347.152us 0.000us 0.00% 760.702us 126.784us 6
aten::copy_ 1.35% 46.170us 59.36% 2.028ms 337.971us 686.110us 48.03% 760.702us 126.784us 6
aten::conv1d 0.17% 5.960us 3.43% 117.002us 39.001us 0.000us 0.00% 742.490us 247.497us 3
aten::convolution 0.28% 9.489us 3.25% 111.042us 37.014us 0.000us 0.00% 742.490us 247.497us 3
aten::_convolution 0.69% 23.709us 2.97% 101.553us 33.851us 0.000us 0.00% 742.490us 247.497us 3
aten::_conv_depthwise2d 0.61% 20.921us 1.79% 61.223us 20.408us 742.490us 51.97% 742.490us 247.497us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 742.490us 51.97% 742.490us 247.497us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 400.959us 28.07% 400.959us 133.653us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 285.151us 19.96% 285.151us 95.050us 3
Activity Buffer Request 52.88% 1.806ms 52.88% 1.806ms 1.806ms 74.592us 5.22% 74.592us 74.592us 1
aten::empty_strided 0.89% 30.420us 0.89% 30.420us 5.070us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 5.75% 196.514us 5.75% 196.514us 21.835us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.52% 17.690us 0.68% 23.179us 2.575us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.27% 9.290us 0.27% 9.290us 0.619us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.28% 9.521us 0.28% 9.521us 3.174us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.28% 9.690us 0.28% 9.690us 3.230us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.19% 6.410us 0.23% 7.991us 2.664us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 3.416ms
Self CUDA time total: 1.429ms
impl wl p50(ms) ok
torch_eager cuda_B2_D2048_S128_W2 0.08 True
torch_eager cuda_B2_D2048_S128_W4 0.08 True
torch_eager cuda_B2_D2048_S2048_W2 0.16 True
torch_eager cuda_B2_D2048_S2048_W4 0.16 True
torch_eager cuda_B2_D2048_S512_W2 0.08 True
torch_eager cuda_B2_D2048_S512_W4 0.08 True
torch_eager cuda_B2_D64_S128_W2 0.07 True
torch_eager cuda_B2_D64_S128_W4 0.08 True
torch_eager cuda_B2_D64_S2048_W2 0.08 True
torch_eager cuda_B2_D64_S2048_W4 0.08 True
torch_eager cuda_B2_D64_S512_W2 0.08 True
torch_eager cuda_B2_D64_S512_W4 0.08 True
torch_eager cuda_B4_D2048_S128_W2 0.08 True
torch_eager cuda_B4_D2048_S128_W4 0.08 True
torch_eager cuda_B4_D2048_S2048_W2 0.49 True
torch_eager cuda_B4_D2048_S2048_W4 0.50 True
torch_eager cuda_B4_D2048_S512_W2 0.09 True
torch_eager cuda_B4_D2048_S512_W4 0.10 True
torch_eager cuda_B4_D64_S128_W2 0.08 True
torch_eager cuda_B4_D64_S128_W4 0.08 True
torch_eager cuda_B4_D64_S2048_W2 0.08 True
torch_eager cuda_B4_D64_S2048_W4 0.08 True
torch_eager cuda_B4_D64_S512_W2 0.08 True
torch_eager cuda_B4_D64_S512_W4 0.08 True