Running causal_conv1d benchmark on cuda with 24 workloads.
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D64_S128_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 420.863us 2177.59% 420.863us 420.863us 1
torch_eager 8.44% 198.071us 99.39% 2.333ms 2.333ms 0.000us 0.00% 21.695us 21.695us 1
aten::to 0.42% 9.821us 82.68% 1.941ms 323.474us 0.000us 0.00% 14.367us 2.395us 6
aten::_to_copy 1.47% 34.509us 82.26% 1.931ms 321.837us 0.000us 0.00% 14.367us 2.395us 6
aten::copy_ 2.58% 60.614us 78.68% 1.847ms 307.834us 11.999us 62.08% 14.367us 2.395us 6
aten::conv1d 0.31% 7.291us 6.62% 155.424us 51.808us 0.000us 0.00% 7.328us 2.443us 3
aten::convolution 0.56% 13.171us 6.31% 148.133us 49.378us 0.000us 0.00% 7.328us 2.443us 3
aten::_convolution 1.45% 33.972us 5.75% 134.962us 44.987us 0.000us 0.00% 7.328us 2.443us 3
aten::_conv_depthwise2d 1.44% 33.841us 3.61% 84.652us 28.217us 7.328us 37.92% 7.328us 2.443us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.328us 37.92% 7.328us 2.443us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.335us 32.78% 6.335us 2.112us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.664us 29.31% 5.664us 1.888us 3
Activity Buffer Request 73.26% 1.720ms 73.26% 1.720ms 1.720ms 2.368us 12.25% 2.368us 2.368us 1
aten::empty_strided 2.11% 49.511us 2.11% 49.511us 8.252us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 3.97% 93.272us 3.97% 93.272us 10.364us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 1.01% 23.716us 1.33% 31.137us 3.460us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.50% 11.722us 0.50% 11.722us 0.781us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.50% 11.630us 0.50% 11.630us 3.877us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.53% 12.550us 0.53% 12.550us 4.183us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.29% 6.740us 0.34% 8.071us 2.690us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.348ms
Self CUDA time total: 19.327us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D64_S128_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 313.250us 1596.75% 313.250us 313.250us 1
torch_eager 6.72% 145.881us 99.75% 2.164ms 2.164ms 0.000us 0.00% 21.826us 21.826us 1
aten::to 0.27% 5.881us 86.54% 1.878ms 312.937us 0.000us 0.00% 13.921us 2.320us 6
aten::_to_copy 1.01% 21.932us 86.27% 1.872ms 311.957us 0.000us 0.00% 13.921us 2.320us 6
aten::copy_ 2.19% 47.440us 83.99% 1.822ms 303.712us 11.713us 59.71% 13.921us 2.320us 6
aten::conv1d 0.28% 6.160us 5.31% 115.143us 38.381us 0.000us 0.00% 7.905us 2.635us 3
aten::convolution 0.41% 8.901us 5.02% 108.983us 36.328us 0.000us 0.00% 7.905us 2.635us 3
aten::_convolution 1.01% 21.821us 4.61% 100.082us 33.361us 0.000us 0.00% 7.905us 2.635us 3
aten::_conv_depthwise2d 0.97% 21.100us 2.86% 62.101us 20.700us 7.905us 40.29% 7.905us 2.635us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.905us 40.29% 7.905us 2.635us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.113us 31.16% 6.113us 2.038us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.600us 28.55% 5.600us 1.867us 3
Activity Buffer Request 79.65% 1.728ms 79.65% 1.728ms 1.728ms 2.208us 11.25% 2.208us 2.208us 1
aten::empty_strided 1.27% 27.539us 1.27% 27.539us 4.590us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 3.21% 69.681us 3.21% 69.681us 7.742us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.78% 16.828us 1.04% 22.610us 2.512us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.44% 9.512us 0.44% 9.512us 0.634us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.43% 9.331us 0.43% 9.331us 3.110us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.41% 8.800us 0.41% 8.800us 2.933us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.26% 5.580us 0.33% 7.220us 2.407us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.170ms
Self CUDA time total: 19.618us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D64_S512_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 311.776us 1662.54% 311.776us 311.776us 1
torch_eager 6.35% 136.063us 99.73% 2.136ms 2.136ms 0.000us 0.00% 20.769us 20.769us 1
aten::to 0.26% 5.629us 86.85% 1.860ms 310.039us 0.000us 0.00% 13.823us 2.304us 6
aten::_to_copy 0.98% 21.032us 86.59% 1.855ms 309.100us 0.000us 0.00% 13.823us 2.304us 6
aten::copy_ 2.10% 44.987us 84.33% 1.806ms 301.043us 11.807us 62.96% 13.823us 2.304us 6
aten::conv1d 0.27% 5.831us 5.33% 114.232us 38.077us 0.000us 0.00% 6.946us 2.315us 3
aten::convolution 0.40% 8.671us 5.06% 108.401us 36.134us 0.000us 0.00% 6.946us 2.315us 3
aten::_convolution 0.98% 20.999us 4.66% 99.730us 33.243us 0.000us 0.00% 6.946us 2.315us 3
aten::_conv_depthwise2d 0.97% 20.860us 2.94% 63.051us 21.017us 6.946us 37.04% 6.946us 2.315us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 6.946us 37.04% 6.946us 2.315us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.047us 32.25% 6.047us 2.016us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.760us 30.72% 5.760us 1.920us 3
Activity Buffer Request 79.94% 1.712ms 79.94% 1.712ms 1.712ms 2.016us 10.75% 2.016us 2.016us 1
aten::empty_strided 1.28% 27.310us 1.28% 27.310us 4.552us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 3.44% 73.575us 3.44% 73.575us 8.175us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.79% 16.901us 1.05% 22.540us 2.504us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.42% 9.019us 0.42% 9.019us 0.601us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.43% 9.230us 0.43% 9.230us 3.077us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.40% 8.580us 0.40% 8.580us 2.860us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.25% 5.250us 0.31% 6.610us 2.203us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.142ms
Self CUDA time total: 18.753us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D64_S512_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 315.647us 1619.78% 315.647us 315.647us 1
torch_eager 6.63% 154.700us 99.79% 2.328ms 2.328ms 0.000us 0.00% 21.599us 21.599us 1
aten::to 0.26% 6.031us 87.19% 2.034ms 338.971us 0.000us 0.00% 13.983us 2.331us 6
aten::_to_copy 0.93% 21.742us 86.93% 2.028ms 337.966us 0.000us 0.00% 13.983us 2.331us 6
aten::copy_ 2.02% 47.211us 84.82% 1.979ms 329.757us 11.871us 60.92% 13.983us 2.331us 6
aten::conv1d 0.25% 5.920us 4.90% 114.312us 38.104us 0.000us 0.00% 7.616us 2.539us 3
aten::convolution 0.39% 8.981us 4.65% 108.392us 36.131us 0.000us 0.00% 7.616us 2.539us 3
aten::_convolution 0.89% 20.821us 4.26% 99.411us 33.137us 0.000us 0.00% 7.616us 2.539us 3
aten::_conv_depthwise2d 0.89% 20.790us 2.71% 63.250us 21.083us 7.616us 39.08% 7.616us 2.539us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.616us 39.08% 7.616us 2.539us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.143us 31.52% 6.143us 2.048us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.728us 29.39% 5.728us 1.909us 3
Activity Buffer Request 71.63% 1.671ms 71.63% 1.671ms 1.671ms 2.112us 10.84% 2.112us 2.112us 1
aten::empty_strided 1.18% 27.509us 1.18% 27.509us 4.585us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 12.15% 283.514us 12.15% 283.514us 31.502us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.70% 16.421us 0.91% 21.332us 2.370us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.37% 8.571us 0.37% 8.571us 0.571us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.43% 10.140us 0.43% 10.140us 3.380us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.39% 9.150us 0.39% 9.150us 3.050us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.23% 5.450us 0.30% 7.100us 2.367us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.333ms
Self CUDA time total: 19.487us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D64_S2048_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 316.162us 1286.41% 316.162us 316.162us 1
torch_eager 6.08% 142.813us 99.78% 2.344ms 2.344ms 0.000us 0.00% 26.881us 26.881us 1
aten::to 0.26% 6.099us 87.82% 2.063ms 343.816us 0.000us 0.00% 15.329us 2.555us 6
aten::_to_copy 0.96% 22.501us 87.56% 2.057ms 342.799us 0.000us 0.00% 15.329us 2.555us 6
aten::copy_ 2.00% 46.941us 85.41% 2.006ms 334.376us 13.025us 53.00% 15.329us 2.555us 6
aten::conv1d 0.27% 6.279us 4.78% 112.281us 37.427us 0.000us 0.00% 11.552us 3.851us 3
aten::convolution 0.37% 8.701us 4.51% 106.002us 35.334us 0.000us 0.00% 11.552us 3.851us 3
aten::_convolution 0.88% 20.659us 4.14% 97.301us 32.434us 0.000us 0.00% 11.552us 3.851us 3
aten::_conv_depthwise2d 0.89% 20.940us 2.63% 61.791us 20.597us 11.552us 47.00% 11.552us 3.851us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 11.552us 47.00% 11.552us 3.851us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.689us 27.22% 6.689us 2.230us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.336us 25.78% 6.336us 2.112us 3
Activity Buffer Request 74.92% 1.760ms 74.92% 1.760ms 1.760ms 2.304us 9.37% 2.304us 2.304us 1
aten::empty_strided 1.19% 28.040us 1.19% 28.040us 4.673us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 9.45% 222.075us 9.45% 222.075us 24.675us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.70% 16.419us 0.91% 21.470us 2.386us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.36% 8.480us 0.36% 8.480us 0.565us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.40% 9.310us 0.40% 9.310us 3.103us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.38% 8.880us 0.38% 8.880us 2.960us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.24% 5.612us 0.29% 6.911us 2.304us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.349ms
Self CUDA time total: 24.577us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D64_S2048_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 315.548us 1212.95% 315.548us 315.548us 1
torch_eager 6.12% 137.996us 99.74% 2.248ms 2.248ms 0.000us 0.00% 28.287us 28.287us 1
aten::to 0.25% 5.659us 87.43% 1.970ms 328.377us 0.000us 0.00% 15.295us 2.549us 6
aten::_to_copy 0.93% 21.030us 87.17% 1.965ms 327.434us 0.000us 0.00% 15.295us 2.549us 6
aten::copy_ 2.11% 47.618us 85.03% 1.916ms 319.389us 13.023us 50.06% 15.295us 2.549us 6
aten::conv1d 0.25% 5.679us 5.07% 114.160us 38.053us 0.000us 0.00% 12.992us 4.331us 3
aten::convolution 0.38% 8.670us 4.81% 108.481us 36.160us 0.000us 0.00% 12.992us 4.331us 3
aten::_convolution 0.97% 21.770us 4.43% 99.811us 33.270us 0.000us 0.00% 12.992us 4.331us 3
aten::_conv_depthwise2d 0.95% 21.450us 2.80% 63.011us 21.004us 12.992us 49.94% 12.992us 4.331us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 12.992us 49.94% 12.992us 4.331us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.687us 25.70% 6.687us 2.229us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.336us 24.36% 6.336us 2.112us 3
Activity Buffer Request 74.56% 1.680ms 74.56% 1.680ms 1.680ms 2.272us 8.73% 2.272us 2.272us 1
aten::empty_strided 1.21% 27.241us 1.21% 27.241us 4.540us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 9.39% 211.596us 9.39% 211.596us 23.511us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.73% 16.539us 0.97% 21.840us 2.427us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.39% 8.730us 0.39% 8.730us 0.582us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.41% 9.190us 0.41% 9.190us 3.063us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.40% 9.090us 0.40% 9.090us 3.030us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.24% 5.411us 0.30% 6.770us 2.257us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.254ms
Self CUDA time total: 26.015us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D2048_S128_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 309.088us 806.93% 309.088us 309.088us 1
torch_eager 6.25% 144.584us 99.79% 2.308ms 2.308ms 0.000us 0.00% 40.896us 40.896us 1
aten::conv1d 0.25% 5.670us 4.80% 111.051us 37.017us 0.000us 0.00% 22.560us 7.520us 3
aten::convolution 0.37% 8.509us 4.56% 105.381us 35.127us 0.000us 0.00% 22.560us 7.520us 3
aten::_convolution 0.91% 21.140us 4.19% 96.872us 32.291us 0.000us 0.00% 22.560us 7.520us 3
aten::_conv_depthwise2d 0.90% 20.721us 2.62% 60.611us 20.204us 22.560us 58.90% 22.560us 7.520us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 22.560us 58.90% 22.560us 7.520us 3
aten::to 0.25% 5.820us 87.66% 2.027ms 337.907us 0.000us 0.00% 18.336us 3.056us 6
aten::_to_copy 0.93% 21.482us 87.41% 2.022ms 336.937us 0.000us 0.00% 18.336us 3.056us 6
aten::copy_ 1.96% 45.321us 85.30% 1.973ms 328.809us 15.744us 41.10% 18.336us 3.056us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.416us 21.97% 8.416us 2.805us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.328us 19.13% 7.328us 2.443us 3
Activity Buffer Request 75.52% 1.747ms 75.52% 1.747ms 1.747ms 2.592us 6.77% 2.592us 2.592us 1
aten::empty_strided 1.18% 27.288us 1.18% 27.288us 4.548us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 8.77% 202.733us 8.77% 202.733us 22.526us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.70% 16.129us 0.91% 20.960us 2.329us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.36% 8.361us 0.36% 8.361us 0.557us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.40% 9.150us 0.40% 9.150us 3.050us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.39% 8.930us 0.39% 8.930us 2.977us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.24% 5.551us 0.30% 6.831us 2.277us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.313ms
Self CUDA time total: 38.304us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D2048_S128_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 314.049us 763.16% 314.049us 314.049us 1
torch_eager 6.27% 140.693us 99.78% 2.240ms 2.240ms 0.000us 0.00% 43.775us 43.775us 1
aten::conv1d 0.27% 6.020us 5.00% 112.161us 37.387us 0.000us 0.00% 25.408us 8.469us 3
aten::convolution 0.39% 8.730us 4.73% 106.141us 35.380us 0.000us 0.00% 25.408us 8.469us 3
aten::_convolution 0.96% 21.462us 4.34% 97.411us 32.470us 0.000us 0.00% 25.408us 8.469us 3
aten::_conv_depthwise2d 0.92% 20.739us 2.73% 61.250us 20.417us 25.408us 61.74% 25.408us 8.469us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 25.408us 61.74% 25.408us 8.469us 3
aten::to 0.26% 5.842us 87.39% 1.962ms 327.034us 0.000us 0.00% 18.367us 3.061us 6
aten::_to_copy 0.95% 21.318us 87.13% 1.956ms 326.060us 0.000us 0.00% 18.367us 3.061us 6
aten::copy_ 2.12% 47.620us 84.95% 1.908ms 317.917us 15.743us 38.26% 18.367us 3.061us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.384us 20.37% 8.384us 2.795us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.359us 17.88% 7.359us 2.453us 3
Activity Buffer Request 74.98% 1.684ms 74.98% 1.684ms 1.684ms 2.624us 6.38% 2.624us 2.624us 1
aten::empty_strided 1.23% 27.541us 1.23% 27.541us 4.590us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 8.81% 197.723us 8.81% 197.723us 21.969us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.72% 16.172us 0.94% 21.010us 2.334us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.37% 8.238us 0.37% 8.238us 0.549us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.46% 10.261us 0.46% 10.261us 3.420us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.39% 8.800us 0.39% 8.800us 2.933us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.24% 5.440us 0.30% 6.680us 2.227us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.245ms
Self CUDA time total: 41.151us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D2048_S512_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 321.281us 312.48% 321.281us 321.281us 1
torch_eager 5.86% 138.423us 99.79% 2.355ms 2.355ms 0.000us 0.00% 108.833us 108.833us 1
aten::conv1d 0.25% 5.850us 4.76% 112.341us 37.447us 0.000us 0.00% 70.720us 23.573us 3
aten::convolution 0.36% 8.520us 4.51% 106.491us 35.497us 0.000us 0.00% 70.720us 23.573us 3
aten::_convolution 0.88% 20.802us 4.15% 97.971us 32.657us 0.000us 0.00% 70.720us 23.573us 3
aten::_conv_depthwise2d 0.89% 20.951us 2.63% 62.101us 20.700us 70.720us 68.78% 70.720us 23.573us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 70.720us 68.78% 70.720us 23.573us 3
aten::to 0.25% 6.012us 88.03% 2.078ms 346.293us 0.000us 0.00% 38.113us 6.352us 6
aten::_to_copy 0.93% 22.001us 87.78% 2.072ms 345.291us 0.000us 0.00% 38.113us 6.352us 6
aten::copy_ 1.93% 45.601us 85.70% 2.023ms 337.137us 32.097us 31.22% 38.113us 6.352us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 17.601us 17.12% 17.601us 5.867us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 14.496us 14.10% 14.496us 4.832us 3
Activity Buffer Request 76.10% 1.796ms 76.10% 1.796ms 1.796ms 6.016us 5.85% 6.016us 6.016us 1
aten::empty_strided 1.14% 26.919us 1.14% 26.919us 4.487us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 8.62% 203.563us 8.62% 203.563us 22.618us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.74% 17.469us 0.97% 22.809us 2.534us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.37% 8.760us 0.37% 8.760us 0.584us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.40% 9.490us 0.40% 9.490us 3.163us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.39% 9.140us 0.39% 9.140us 3.047us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.23% 5.370us 0.28% 6.650us 2.217us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.360ms
Self CUDA time total: 102.817us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D2048_S512_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 314.234us 278.42% 314.234us 314.234us 1
torch_eager 14.56% 110.034us 99.36% 750.943us 750.943us 0.000us 0.00% 118.847us 118.847us 1
aten::conv1d 0.76% 5.759us 14.83% 112.060us 37.353us 0.000us 0.00% 80.768us 26.923us 3
aten::convolution 1.14% 8.591us 14.06% 106.301us 35.434us 0.000us 0.00% 80.768us 26.923us 3
aten::_convolution 2.73% 20.639us 12.93% 97.710us 32.570us 0.000us 0.00% 80.768us 26.923us 3
aten::_conv_depthwise2d 2.77% 20.950us 8.22% 62.160us 20.720us 80.768us 71.56% 80.768us 26.923us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 80.768us 71.56% 80.768us 26.923us 3
aten::to 0.75% 5.692us 66.62% 503.480us 83.913us 0.000us 0.00% 38.079us 6.346us 6
aten::_to_copy 2.84% 21.440us 65.86% 497.788us 82.965us 0.000us 0.00% 38.079us 6.346us 6
aten::copy_ 6.27% 47.370us 59.47% 449.478us 74.913us 32.095us 28.44% 38.079us 6.346us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 17.567us 15.56% 17.567us 5.856us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 14.528us 12.87% 14.528us 4.843us 3
Activity Buffer Request 28.71% 217.014us 28.71% 217.014us 217.014us 5.984us 5.30% 5.984us 5.984us 1
aten::empty_strided 3.56% 26.870us 3.56% 26.870us 4.478us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 27.58% 208.434us 27.58% 208.434us 23.159us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 2.15% 16.260us 2.82% 21.342us 2.371us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 1.12% 8.483us 1.12% 8.483us 0.566us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 1.19% 8.980us 1.19% 8.980us 2.993us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 1.18% 8.890us 1.18% 8.890us 2.963us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.73% 5.530us 0.90% 6.840us 2.280us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 755.803us
Self CUDA time total: 112.863us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D2048_S2048_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 13.44% 111.290us 87.96% 728.462us 728.462us 0.000us 0.00% 465.598us 465.598us 1
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 451.198us 105.76% 451.198us 451.198us 1
aten::conv1d 0.71% 5.852us 13.50% 111.822us 37.274us 0.000us 0.00% 279.070us 93.023us 3
aten::convolution 1.02% 8.470us 12.80% 105.970us 35.323us 0.000us 0.00% 279.070us 93.023us 3
aten::_convolution 2.46% 20.349us 11.77% 97.500us 32.500us 0.000us 0.00% 279.070us 93.023us 3
aten::_conv_depthwise2d 2.50% 20.731us 7.53% 62.371us 20.790us 279.070us 65.41% 279.070us 93.023us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 279.070us 65.41% 279.070us 93.023us 3
aten::to 0.69% 5.693us 58.10% 481.149us 80.192us 0.000us 0.00% 186.528us 31.088us 6
aten::_to_copy 2.52% 20.868us 57.41% 475.456us 79.243us 0.000us 0.00% 186.528us 31.088us 6
aten::copy_ 5.92% 49.010us 51.66% 427.857us 71.310us 147.552us 34.59% 186.528us 31.088us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 107.296us 25.15% 107.296us 35.765us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.256us 9.44% 40.256us 13.419us 3
Activity Buffer Request 24.18% 200.274us 24.18% 200.274us 200.274us 38.976us 9.14% 38.976us 38.976us 1
aten::empty_strided 3.23% 26.731us 3.23% 26.731us 4.455us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 24.27% 200.993us 24.27% 200.993us 22.333us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 1.86% 15.430us 2.41% 19.970us 2.219us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 1.00% 8.282us 1.00% 8.282us 0.552us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 1.22% 10.070us 1.22% 10.070us 3.357us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 1.10% 9.150us 1.10% 9.150us 3.050us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.65% 5.408us 0.83% 6.890us 2.297us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 828.193us
Self CUDA time total: 426.622us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D2048_S2048_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 12.75% 112.781us 87.94% 777.793us 777.793us 0.000us 0.00% 472.535us 472.535us 1
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 465.081us 106.66% 465.081us 465.081us 1
aten::conv1d 0.64% 5.670us 12.88% 113.921us 37.974us 0.000us 0.00% 298.235us 99.412us 3
aten::convolution 0.99% 8.800us 12.24% 108.251us 36.084us 0.000us 0.00% 298.235us 99.412us 3
aten::_convolution 2.35% 20.829us 11.24% 99.451us 33.150us 0.000us 0.00% 298.235us 99.412us 3
aten::_conv_depthwise2d 2.33% 20.611us 7.13% 63.071us 21.024us 298.235us 68.40% 298.235us 99.412us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 298.235us 68.40% 298.235us 99.412us 3
aten::to 0.69% 6.129us 59.45% 525.809us 87.635us 0.000us 0.00% 174.300us 29.050us 6
aten::_to_copy 2.38% 21.040us 58.76% 519.680us 86.613us 0.000us 0.00% 174.300us 29.050us 6
aten::copy_ 5.38% 47.582us 53.27% 471.169us 78.528us 137.789us 31.60% 174.300us 29.050us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 97.534us 22.37% 97.534us 32.511us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.255us 9.23% 40.255us 13.418us 3
Activity Buffer Request 27.57% 243.834us 27.57% 243.834us 243.834us 36.511us 8.37% 36.511us 36.511us 1
aten::empty_strided 3.11% 27.471us 3.11% 27.471us 4.579us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 23.03% 203.723us 23.03% 203.723us 22.636us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 1.93% 17.041us 2.45% 21.672us 2.408us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.93% 8.218us 0.93% 8.218us 0.548us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 1.01% 8.950us 1.01% 8.950us 2.983us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 1.08% 9.540us 1.08% 9.540us 3.180us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.64% 5.653us 0.81% 7.121us 2.374us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 884.464us
Self CUDA time total: 436.024us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D64_S128_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 322.331us 1707.26% 322.331us 322.331us 1
torch_eager 13.79% 111.240us 99.37% 801.573us 801.573us 0.000us 0.00% 20.864us 20.864us 1
aten::to 0.71% 5.762us 67.35% 543.311us 90.552us 0.000us 0.00% 13.696us 2.283us 6
aten::_to_copy 2.83% 22.852us 66.64% 537.549us 89.592us 0.000us 0.00% 13.696us 2.283us 6
aten::copy_ 5.86% 47.269us 60.36% 486.896us 81.149us 11.712us 62.03% 13.696us 2.283us 6
aten::conv1d 0.71% 5.710us 14.24% 114.861us 38.287us 0.000us 0.00% 7.168us 2.389us 3
aten::convolution 1.09% 8.781us 13.53% 109.151us 36.384us 0.000us 0.00% 7.168us 2.389us 3
aten::_convolution 2.85% 22.969us 12.44% 100.370us 33.457us 0.000us 0.00% 7.168us 2.389us 3
aten::_conv_depthwise2d 2.60% 20.991us 7.73% 62.361us 20.787us 7.168us 37.97% 7.168us 2.389us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.168us 37.97% 7.168us 2.389us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.016us 31.86% 6.016us 2.005us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.696us 30.17% 5.696us 1.899us 3
Activity Buffer Request 32.81% 264.695us 32.81% 264.695us 264.695us 1.984us 10.51% 1.984us 1.984us 1
aten::empty_strided 3.45% 27.801us 3.45% 27.801us 4.633us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 24.60% 198.402us 24.60% 198.402us 22.045us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 2.08% 16.740us 3.51% 28.341us 3.149us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 1.86% 15.030us 1.86% 15.030us 1.002us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 1.16% 9.330us 1.16% 9.330us 3.110us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 1.06% 8.570us 1.06% 8.570us 2.857us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.67% 5.391us 0.83% 6.660us 2.220us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 806.653us
Self CUDA time total: 18.880us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D64_S128_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 310.011us 1593.40% 310.011us 310.011us 1
torch_eager 14.19% 114.063us 99.41% 798.944us 798.944us 0.000us 0.00% 21.440us 21.440us 1
aten::to 0.69% 5.570us 68.28% 548.719us 91.453us 0.000us 0.00% 13.535us 2.256us 6
aten::_to_copy 2.58% 20.749us 67.58% 543.149us 90.525us 0.000us 0.00% 13.535us 2.256us 6
aten::copy_ 5.87% 47.160us 61.68% 495.718us 82.620us 11.551us 59.37% 13.535us 2.256us 6
aten::conv1d 0.70% 5.640us 13.72% 110.252us 36.751us 0.000us 0.00% 7.905us 2.635us 3
aten::convolution 1.10% 8.879us 13.02% 104.612us 34.871us 0.000us 0.00% 7.905us 2.635us 3
aten::_convolution 2.60% 20.881us 11.91% 95.733us 31.911us 0.000us 0.00% 7.905us 2.635us 3
aten::_conv_depthwise2d 2.57% 20.619us 7.48% 60.111us 20.037us 7.905us 40.63% 7.905us 2.635us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.905us 40.63% 7.905us 2.635us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 5.888us 30.26% 5.888us 1.963us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.663us 29.11% 5.663us 1.888us 3
Activity Buffer Request 34.15% 274.474us 34.15% 274.474us 274.474us 1.984us 10.20% 1.984us 1.984us 1
aten::empty_strided 3.32% 26.682us 3.32% 26.682us 4.447us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 24.32% 195.444us 24.32% 195.444us 21.716us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 2.12% 17.050us 2.77% 22.260us 2.473us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 1.06% 8.520us 1.06% 8.520us 0.568us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 1.13% 9.091us 1.13% 9.091us 3.030us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 1.12% 9.041us 1.12% 9.041us 3.014us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.64% 5.131us 0.80% 6.391us 2.130us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 803.684us
Self CUDA time total: 19.456us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D64_S512_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 314.048us 1627.61% 314.048us 314.048us 1
torch_eager 6.09% 141.071us 99.78% 2.312ms 2.312ms 0.000us 0.00% 21.471us 21.471us 1
aten::to 0.25% 5.868us 87.74% 2.033ms 338.847us 0.000us 0.00% 14.239us 2.373us 6
aten::_to_copy 0.91% 21.009us 87.49% 2.027ms 337.869us 0.000us 0.00% 14.239us 2.373us 6
aten::copy_ 2.05% 47.413us 85.30% 1.977ms 329.427us 12.063us 62.52% 14.239us 2.373us 6
aten::conv1d 0.24% 5.651us 4.86% 112.612us 37.537us 0.000us 0.00% 7.232us 2.411us 3
aten::convolution 0.38% 8.789us 4.62% 106.961us 35.654us 0.000us 0.00% 7.232us 2.411us 3
aten::_convolution 0.89% 20.661us 4.24% 98.172us 32.724us 0.000us 0.00% 7.232us 2.411us 3
aten::_conv_depthwise2d 0.90% 20.862us 2.69% 62.371us 20.790us 7.232us 37.48% 7.232us 2.411us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.232us 37.48% 7.232us 2.411us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.208us 32.17% 6.208us 2.069us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.855us 30.34% 5.855us 1.952us 3
Activity Buffer Request 75.73% 1.755ms 75.73% 1.755ms 1.755ms 2.176us 11.28% 2.176us 2.176us 1
aten::empty_strided 1.28% 29.642us 1.28% 29.642us 4.940us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 8.54% 197.852us 8.54% 197.852us 21.984us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.71% 16.502us 0.92% 21.412us 2.379us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.36% 8.400us 0.36% 8.400us 0.560us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.40% 9.250us 0.40% 9.250us 3.083us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.37% 8.629us 0.37% 8.629us 2.876us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.24% 5.530us 0.30% 6.920us 2.307us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.317ms
Self CUDA time total: 19.295us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D64_S512_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 312.955us 1552.43% 312.955us 312.955us 1
torch_eager 14.84% 110.223us 99.32% 737.732us 737.732us 0.000us 0.00% 22.399us 22.399us 1
aten::to 0.81% 6.020us 65.77% 488.537us 81.423us 0.000us 0.00% 14.464us 2.411us 6
aten::_to_copy 2.95% 21.919us 64.96% 482.517us 80.420us 0.000us 0.00% 14.464us 2.411us 6
aten::copy_ 6.27% 46.582us 58.16% 432.008us 72.001us 12.224us 60.64% 14.464us 2.411us 6
aten::conv1d 0.73% 5.430us 15.33% 113.861us 37.954us 0.000us 0.00% 7.935us 2.645us 3
aten::convolution 1.23% 9.170us 14.60% 108.431us 36.144us 0.000us 0.00% 7.935us 2.645us 3
aten::_convolution 2.83% 21.009us 13.36% 99.261us 33.087us 0.000us 0.00% 7.935us 2.645us 3
aten::_conv_depthwise2d 2.83% 21.021us 8.51% 63.211us 21.070us 7.935us 39.36% 7.935us 2.645us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.935us 39.36% 7.935us 2.645us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.304us 31.27% 6.304us 2.101us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.920us 29.37% 5.920us 1.973us 3
Activity Buffer Request 28.04% 208.284us 28.04% 208.284us 208.284us 2.240us 11.11% 2.240us 2.240us 1
aten::empty_strided 3.85% 28.590us 3.85% 28.590us 4.765us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 26.94% 200.092us 26.94% 200.092us 22.232us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 2.19% 16.299us 2.86% 21.260us 2.362us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 1.14% 8.481us 1.14% 8.481us 0.565us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 1.35% 10.000us 1.35% 10.000us 3.333us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 1.24% 9.240us 1.24% 9.240us 3.080us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.74% 5.520us 0.94% 6.960us 2.320us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 742.812us
Self CUDA time total: 20.159us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D64_S2048_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 313.245us 871.70% 313.245us 313.245us 1
torch_eager 14.61% 110.582us 99.35% 752.253us 752.253us 0.000us 0.00% 38.527us 38.527us 1
aten::conv1d 0.75% 5.651us 14.90% 112.841us 37.614us 0.000us 0.00% 20.160us 6.720us 3
aten::convolution 1.16% 8.759us 14.16% 107.190us 35.730us 0.000us 0.00% 20.160us 6.720us 3
aten::_convolution 2.72% 20.612us 13.00% 98.431us 32.810us 0.000us 0.00% 20.160us 6.720us 3
aten::_conv_depthwise2d 2.88% 21.780us 8.26% 62.540us 20.847us 20.160us 56.10% 20.160us 6.720us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 20.160us 56.10% 20.160us 6.720us 3
aten::to 0.75% 5.701us 66.54% 503.830us 83.972us 0.000us 0.00% 18.367us 3.061us 6
aten::_to_copy 2.88% 21.780us 65.79% 498.129us 83.021us 0.000us 0.00% 18.367us 3.061us 6
aten::copy_ 6.12% 46.334us 59.23% 448.449us 74.742us 15.775us 43.90% 18.367us 3.061us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.415us 23.42% 8.415us 2.805us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.360us 20.48% 7.360us 2.453us 3
Activity Buffer Request 29.82% 225.793us 29.82% 225.793us 225.793us 2.592us 7.21% 2.592us 2.592us 1
aten::empty_strided 3.68% 27.900us 3.68% 27.900us 4.650us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 26.28% 198.992us 26.28% 198.992us 22.110us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 2.15% 16.291us 2.79% 21.110us 2.346us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 1.09% 8.258us 1.09% 8.258us 0.551us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 1.19% 9.040us 1.19% 9.040us 3.013us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 1.20% 9.050us 1.20% 9.050us 3.017us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.73% 5.530us 0.90% 6.810us 2.270us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 757.144us
Self CUDA time total: 35.935us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D64_S2048_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 311.265us 814.68% 311.265us 311.265us 1
torch_eager 15.57% 112.202us 99.29% 715.482us 715.482us 0.000us 0.00% 40.798us 40.798us 1
aten::conv1d 0.79% 5.710us 15.58% 112.241us 37.414us 0.000us 0.00% 22.336us 7.445us 3
aten::convolution 1.20% 8.631us 14.78% 106.531us 35.510us 0.000us 0.00% 22.336us 7.445us 3
aten::_convolution 2.96% 21.310us 13.59% 97.900us 32.633us 0.000us 0.00% 22.336us 7.445us 3
aten::_conv_depthwise2d 2.84% 20.470us 8.57% 61.720us 20.573us 22.336us 58.46% 22.336us 7.445us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 22.336us 58.46% 22.336us 7.445us 3
aten::to 0.77% 5.551us 64.63% 465.749us 77.625us 0.000us 0.00% 18.462us 3.077us 6
aten::_to_copy 2.88% 20.749us 63.86% 460.198us 76.700us 0.000us 0.00% 18.462us 3.077us 6
aten::copy_ 6.44% 46.412us 57.18% 412.048us 68.675us 15.871us 41.54% 18.462us 3.077us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.511us 22.28% 8.511us 2.837us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.360us 19.26% 7.360us 2.453us 3
Activity Buffer Request 26.39% 190.134us 26.39% 190.134us 190.134us 2.591us 6.78% 2.591us 2.591us 1
aten::empty_strided 3.80% 27.401us 3.80% 27.401us 4.567us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 27.38% 197.312us 27.38% 197.312us 21.924us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 2.26% 16.250us 2.97% 21.400us 2.378us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 1.19% 8.579us 1.19% 8.579us 0.572us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 1.47% 10.600us 1.47% 10.600us 3.533us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 1.23% 8.840us 1.23% 8.840us 2.947us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.77% 5.530us 0.95% 6.840us 2.280us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 720.602us
Self CUDA time total: 38.207us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D2048_S128_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 314.752us 491.08% 314.752us 314.752us 1
torch_eager 14.41% 112.890us 99.37% 778.613us 778.613us 0.000us 0.00% 68.189us 68.189us 1
aten::conv1d 0.75% 5.840us 14.26% 111.761us 37.254us 0.000us 0.00% 41.759us 13.920us 3
aten::convolution 1.10% 8.641us 13.52% 105.921us 35.307us 0.000us 0.00% 41.759us 13.920us 3
aten::_convolution 2.66% 20.830us 12.41% 97.280us 32.427us 0.000us 0.00% 41.759us 13.920us 3
aten::_conv_depthwise2d 2.71% 21.201us 7.84% 61.431us 20.477us 41.759us 65.15% 41.759us 13.920us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 41.759us 65.15% 41.759us 13.920us 3
aten::to 0.73% 5.701us 67.43% 528.381us 88.063us 0.000us 0.00% 26.430us 4.405us 6
aten::_to_copy 2.70% 21.142us 66.70% 522.680us 87.113us 0.000us 0.00% 26.430us 4.405us 6
aten::copy_ 6.06% 47.459us 59.50% 466.248us 77.708us 22.335us 34.85% 26.430us 4.405us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 11.903us 18.57% 11.903us 3.968us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.432us 16.28% 10.432us 3.477us 3
Activity Buffer Request 30.62% 239.904us 30.62% 239.904us 239.904us 4.095us 6.39% 4.095us 4.095us 1
aten::empty_strided 4.50% 35.290us 4.50% 35.290us 5.882us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 25.58% 200.475us 25.58% 200.475us 22.275us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 2.10% 16.441us 2.75% 21.561us 2.396us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 1.10% 8.620us 1.10% 8.620us 0.575us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 1.18% 9.220us 1.18% 9.220us 3.073us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 1.20% 9.420us 1.20% 9.420us 3.140us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.71% 5.569us 0.87% 6.849us 2.283us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 783.583us
Self CUDA time total: 64.094us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D2048_S128_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 341.050us 488.00% 341.050us 341.050us 1
torch_eager 14.90% 121.512us 99.29% 809.864us 809.864us 0.000us 0.00% 73.952us 73.952us 1
aten::conv1d 0.74% 6.030us 14.63% 119.332us 39.777us 0.000us 0.00% 47.456us 15.819us 3
aten::convolution 1.12% 9.160us 13.89% 113.302us 37.767us 0.000us 0.00% 47.456us 15.819us 3
aten::_convolution 2.63% 21.441us 12.77% 104.142us 34.714us 0.000us 0.00% 47.456us 15.819us 3
aten::_conv_depthwise2d 2.71% 22.133us 8.21% 66.923us 22.308us 47.456us 67.90% 47.456us 15.819us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 47.456us 67.90% 47.456us 15.819us 3
aten::to 0.75% 6.151us 66.55% 542.789us 90.465us 0.000us 0.00% 26.496us 4.416us 6
aten::_to_copy 2.90% 23.661us 65.79% 536.638us 89.440us 0.000us 0.00% 26.496us 4.416us 6
aten::copy_ 6.09% 49.711us 59.32% 483.818us 80.636us 22.432us 32.10% 26.496us 4.416us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 11.904us 17.03% 11.904us 3.968us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.528us 15.06% 10.528us 3.509us 3
Activity Buffer Request 30.67% 250.175us 30.67% 250.175us 250.175us 4.064us 5.82% 4.064us 4.064us 1
aten::empty_strided 3.58% 29.159us 3.58% 29.159us 4.860us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 25.38% 207.002us 25.38% 207.002us 23.000us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 2.12% 17.271us 2.72% 22.171us 2.463us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 1.06% 8.630us 1.06% 8.630us 0.575us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 1.48% 12.040us 1.48% 12.040us 4.013us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 1.19% 9.680us 1.19% 9.680us 3.227us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.69% 5.638us 0.87% 7.098us 2.366us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 815.634us
Self CUDA time total: 69.888us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D2048_S512_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 333.665us 179.68% 333.665us 333.665us 1
torch_eager 6.31% 142.852us 99.78% 2.260ms 2.260ms 0.000us 0.00% 195.711us 195.711us 1
aten::conv1d 0.27% 6.061us 5.13% 116.202us 38.734us 0.000us 0.00% 133.407us 44.469us 3
aten::convolution 0.39% 8.731us 4.86% 110.141us 36.714us 0.000us 0.00% 133.407us 44.469us 3
aten::_convolution 0.93% 21.019us 4.48% 101.410us 33.803us 0.000us 0.00% 133.407us 44.469us 3
aten::_conv_depthwise2d 0.95% 21.630us 2.86% 64.671us 21.557us 133.407us 71.84% 133.407us 44.469us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 133.407us 71.84% 133.407us 44.469us 3
aten::to 0.26% 5.950us 87.17% 1.974ms 329.070us 0.000us 0.00% 62.304us 10.384us 6
aten::_to_copy 0.93% 21.080us 86.91% 1.968ms 328.079us 0.000us 0.00% 62.304us 10.384us 6
aten::copy_ 2.06% 46.600us 84.72% 1.919ms 319.815us 52.288us 28.16% 62.304us 10.384us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 29.503us 15.89% 29.503us 9.834us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.785us 12.27% 22.785us 7.595us 3
Activity Buffer Request 74.89% 1.696ms 74.89% 1.696ms 1.696ms 10.016us 5.39% 10.016us 10.016us 1
aten::empty_strided 1.26% 28.502us 1.26% 28.502us 4.750us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 8.73% 197.643us 8.73% 197.643us 21.960us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.77% 17.390us 1.01% 22.821us 2.536us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.41% 9.201us 0.41% 9.201us 0.613us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.51% 11.561us 0.51% 11.561us 3.854us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.43% 9.850us 0.43% 9.850us 3.283us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.25% 5.550us 0.31% 7.130us 2.377us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.265ms
Self CUDA time total: 185.695us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D2048_S512_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 336.127us 160.20% 336.127us 336.127us 1
torch_eager 15.02% 112.950us 99.30% 746.873us 746.873us 0.000us 0.00% 223.453us 223.453us 1
aten::conv1d 0.82% 6.200us 15.13% 113.793us 37.931us 0.000us 0.00% 154.078us 51.359us 3
aten::convolution 1.21% 9.111us 14.30% 107.593us 35.864us 0.000us 0.00% 154.078us 51.359us 3
aten::_convolution 2.76% 20.789us 13.09% 98.482us 32.827us 0.000us 0.00% 154.078us 51.359us 3
aten::_conv_depthwise2d 2.85% 21.433us 8.36% 62.872us 20.957us 154.078us 73.43% 154.078us 51.359us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 154.078us 73.43% 154.078us 51.359us 3
aten::to 0.77% 5.760us 65.90% 495.650us 82.608us 0.000us 0.00% 69.375us 11.562us 6
aten::_to_copy 2.76% 20.790us 65.13% 489.890us 81.648us 0.000us 0.00% 69.375us 11.562us 6
aten::copy_ 6.23% 46.852us 58.63% 441.010us 73.502us 55.743us 26.57% 69.375us 11.562us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 32.736us 15.60% 32.736us 10.912us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 23.007us 10.97% 23.007us 7.669us 3
Activity Buffer Request 28.60% 215.113us 28.60% 215.113us 215.113us 13.632us 6.50% 13.632us 13.632us 1
aten::empty_strided 3.73% 28.090us 3.73% 28.090us 4.682us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 26.86% 202.035us 26.86% 202.035us 22.448us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 2.10% 15.770us 2.68% 20.191us 2.243us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 1.08% 8.131us 1.08% 8.131us 0.542us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 1.19% 8.950us 1.19% 8.950us 2.983us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 1.26% 9.499us 1.26% 9.499us 3.166us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.71% 5.360us 0.95% 7.120us 2.373us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 752.153us
Self CUDA time total: 209.821us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D2048_S2048_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 6.57% 117.922us 51.18% 918.465us 918.465us 0.000us 0.00% 1.509ms 1.509ms 1
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.411ms 100.40% 1.411ms 1.411ms 1
aten::to 0.34% 6.101us 36.65% 657.591us 109.598us 0.000us 0.00% 815.747us 135.958us 6
aten::_to_copy 1.52% 27.229us 36.31% 651.490us 108.582us 0.000us 0.00% 815.747us 135.958us 6
aten::copy_ 2.73% 48.941us 24.59% 441.187us 73.531us 712.610us 50.71% 815.747us 135.958us 6
aten::conv1d 0.34% 6.120us 6.48% 116.312us 38.771us 0.000us 0.00% 692.768us 230.923us 3
aten::convolution 0.51% 9.129us 6.14% 110.192us 36.731us 0.000us 0.00% 692.768us 230.923us 3
aten::_convolution 1.19% 21.333us 5.63% 101.063us 33.688us 0.000us 0.00% 692.768us 230.923us 3
aten::_conv_depthwise2d 1.24% 22.308us 3.59% 64.481us 21.494us 692.768us 49.29% 692.768us 230.923us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 692.768us 49.29% 692.768us 230.923us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 407.041us 28.96% 407.041us 135.680us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 305.569us 21.74% 305.569us 101.856us 3
Activity Buffer Request 11.66% 209.253us 11.66% 209.253us 209.253us 103.137us 7.34% 103.137us 103.137us 1
aten::empty_strided 1.92% 34.411us 10.20% 183.074us 30.512us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 11.50% 206.424us 11.50% 206.424us 22.936us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.94% 16.930us 1.24% 22.220us 2.469us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.50% 8.941us 0.50% 8.941us 0.596us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.52% 9.381us 0.52% 9.381us 3.127us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.52% 9.361us 0.52% 9.361us 3.120us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.31% 5.508us 0.38% 6.869us 2.290us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.794ms
Self CUDA time total: 1.405ms
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D2048_S2048_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 6.19% 116.332us 43.22% 812.174us 812.174us 0.000us 0.00% 1.498ms 1.498ms 1
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.429ms 100.40% 1.429ms 1.429ms 1
aten::to 0.31% 5.839us 29.49% 554.079us 92.346us 0.000us 0.00% 756.667us 126.111us 6
aten::_to_copy 1.13% 21.183us 29.18% 548.240us 91.373us 0.000us 0.00% 756.667us 126.111us 6
aten::copy_ 2.52% 47.321us 26.61% 500.037us 83.340us 681.756us 47.91% 756.667us 126.111us 6
aten::conv1d 0.30% 5.691us 6.12% 114.963us 38.321us 0.000us 0.00% 741.276us 247.092us 3
aten::convolution 0.46% 8.681us 5.82% 109.272us 36.424us 0.000us 0.00% 741.276us 247.092us 3
aten::_convolution 1.12% 21.121us 5.35% 100.591us 33.530us 0.000us 0.00% 741.276us 247.092us 3
aten::_conv_depthwise2d 1.12% 21.039us 3.43% 64.490us 21.497us 741.276us 52.09% 741.276us 247.092us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 741.276us 52.09% 741.276us 247.092us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 397.598us 27.94% 397.598us 132.533us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 284.158us 19.97% 284.158us 94.719us 3
Activity Buffer Request 11.11% 208.773us 11.11% 208.773us 208.773us 74.911us 5.26% 74.911us 74.911us 1
aten::empty_strided 1.44% 27.020us 1.44% 27.020us 4.503us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 14.26% 268.023us 14.26% 268.023us 29.780us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.93% 17.452us 1.22% 23.001us 2.556us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.48% 8.988us 0.48% 8.988us 0.599us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.52% 9.701us 0.52% 9.701us 3.234us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.51% 9.670us 0.51% 9.670us 3.223us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.29% 5.400us 0.35% 6.640us 2.213us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.879ms
Self CUDA time total: 1.423ms
impl wl p50(ms) ok
torch_eager cuda_B2_D2048_S128_W2 0.08 True
torch_eager cuda_B2_D2048_S128_W4 0.08 True
torch_eager cuda_B2_D2048_S2048_W2 0.16 True
torch_eager cuda_B2_D2048_S2048_W4 0.16 True
torch_eager cuda_B2_D2048_S512_W2 0.08 True
torch_eager cuda_B2_D2048_S512_W4 0.08 True
torch_eager cuda_B2_D64_S128_W2 0.07 True
torch_eager cuda_B2_D64_S128_W4 0.08 True
torch_eager cuda_B2_D64_S2048_W2 0.08 True
torch_eager cuda_B2_D64_S2048_W4 0.08 True
torch_eager cuda_B2_D64_S512_W2 0.08 True
torch_eager cuda_B2_D64_S512_W4 0.08 True
torch_eager cuda_B4_D2048_S128_W2 0.08 True
torch_eager cuda_B4_D2048_S128_W4 0.08 True
torch_eager cuda_B4_D2048_S2048_W2 0.48 True
torch_eager cuda_B4_D2048_S2048_W4 0.50 True
torch_eager cuda_B4_D2048_S512_W2 0.09 True
torch_eager cuda_B4_D2048_S512_W4 0.10 True
torch_eager cuda_B4_D64_S128_W2 0.08 True
torch_eager cuda_B4_D64_S128_W4 0.08 True
torch_eager cuda_B4_D64_S2048_W2 0.08 True
torch_eager cuda_B4_D64_S2048_W4 0.08 True
torch_eager cuda_B4_D64_S512_W2 0.08 True
torch_eager cuda_B4_D64_S512_W4 0.08 True