diff --git "a/causal_conv1d/impls/torch_causal_conv1d.html" "b/causal_conv1d/impls/torch_causal_conv1d.html" --- "a/causal_conv1d/impls/torch_causal_conv1d.html" +++ "b/causal_conv1d/impls/torch_causal_conv1d.html" @@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content { ▼ output ▶ uv-logs | -Cell: nv | 0.25s +Cell: nv | 0.24s | Raw @@ -3904,7 +3904,7 @@ Cell: nv | 0.25s
Fri Dec 19 19:54:47 2025 +Fri Dec 19 22:48:25 2025 +-----------------------------------------------------------------------------------------+ | NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 | +-----------------------------------------+------------------------+----------------------+ @@ -3913,7 +3913,7 @@ Cell: nv | 0.25s | | | MIG M. | |=========================================+========================+======================| | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 | -| N/A 37C P0 82W / 350W | 0MiB / 46068MiB | 14% Default | +| N/A 28C P0 90W / 350W | 0MiB / 46068MiB | 10% Default | | | | N/A | +-----------------------------------------+------------------------+----------------------+ @@ -3937,7 +3937,7 @@ Cell: nv | 0.25s ▼ output ▶ uv-logs | -Cell: benchmark | 7.79s +Cell: benchmark | 33.94s | Raw @@ -3999,29 +3999,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S128_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 436.163us 2260.26% 436.163us 436.163us 1 - torch_eager 9.04% 219.426us 99.40% 2.414ms 2.414ms 0.000us 0.00% 21.633us 21.633us 1 - aten::to 0.42% 10.088us 82.02% 1.992ms 331.955us 0.000us 0.00% 14.273us 2.379us 6 - aten::_to_copy 1.72% 41.805us 81.60% 1.982ms 330.274us 0.000us 0.00% 14.273us 2.379us 6 - aten::copy_ 2.55% 61.862us 77.44% 1.881ms 313.438us 11.937us 61.86% 14.273us 2.379us 6 - aten::conv1d 0.37% 9.100us 6.66% 161.626us 53.875us 0.000us 0.00% 7.360us 2.453us 3 - aten::convolution 0.65% 15.771us 6.28% 152.526us 50.842us 0.000us 0.00% 7.360us 2.453us 3 - aten::_convolution 1.37% 33.342us 5.63% 136.755us 45.585us 0.000us 0.00% 7.360us 2.453us 3 - aten::_conv_depthwise2d 1.40% 34.091us 3.35% 81.282us 27.094us 7.360us 38.14% 7.360us 2.453us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.360us 38.14% 7.360us 2.453us 3 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.304us 32.67% 6.304us 2.101us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.633us 29.19% 5.633us 1.878us 3 - Activity Buffer Request 72.13% 1.752ms 72.13% 1.752ms 1.752ms 2.336us 12.11% 2.336us 2.336us 1 - aten::empty_strided 2.44% 59.211us 2.44% 59.211us 9.868us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 3.74% 90.852us 3.74% 90.852us 10.095us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 1.11% 27.021us 1.43% 34.781us 3.865us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.54% 13.120us 0.54% 13.120us 0.875us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.50% 12.090us 0.50% 12.090us 4.030us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.47% 11.450us 0.47% 11.450us 3.817us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.39% 9.560us 0.46% 11.070us 3.690us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 443.583us 2294.79% 443.583us 443.583us 1 + torch_eager 5.05% 224.934us 75.09% 3.343ms 3.343ms 0.000us 0.00% 21.698us 21.698us 1 + aten::to 0.24% 10.861us 65.47% 2.914ms 485.726us 0.000us 0.00% 14.370us 2.395us 6 + aten::_to_copy 1.00% 44.312us 65.22% 2.903ms 483.916us 0.000us 0.00% 14.370us 2.395us 6 + aten::copy_ 1.44% 63.961us 42.41% 1.888ms 314.646us 12.002us 62.09% 14.370us 2.395us 6 + aten::conv1d 0.19% 8.510us 3.64% 161.973us 53.991us 0.000us 0.00% 7.328us 2.443us 3 + aten::convolution 0.39% 17.300us 3.45% 153.463us 51.154us 0.000us 0.00% 7.328us 2.443us 3 + aten::_convolution 0.75% 33.601us 3.06% 136.163us 45.388us 0.000us 0.00% 7.328us 2.443us 3 + aten::_conv_depthwise2d 0.77% 34.142us 1.86% 82.892us 27.631us 7.328us 37.91% 7.328us 2.443us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.328us 37.91% 7.328us 2.443us 3 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.337us 32.78% 6.337us 2.112us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.665us 29.31% 5.665us 1.888us 3 + Activity Buffer Request 39.41% 1.754ms 39.41% 1.754ms 1.754ms 2.368us 12.25% 2.368us 2.368us 1 + aten::empty_strided 21.82% 971.311us 21.82% 971.311us 161.885us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 2.12% 94.383us 2.12% 94.383us 10.487us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.60% 26.802us 0.78% 34.811us 3.868us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.29% 12.970us 0.29% 12.970us 0.865us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.27% 12.049us 0.27% 12.049us 4.016us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.27% 12.050us 0.27% 12.050us 4.017us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.16% 7.339us 0.20% 8.900us 2.967us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.428ms -Self CUDA time total: 19.297us +Self CPU time total: 4.452ms +Self CUDA time total: 19.330us @@ -4031,29 +4031,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S128_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 323.230us 1639.85% 323.230us 323.230us 1 - torch_eager 5.66% 125.716us 99.73% 2.215ms 2.215ms 0.000us 0.00% 21.919us 21.919us 1 - aten::to 0.28% 6.120us 87.62% 1.946ms 324.308us 0.000us 0.00% 13.951us 2.325us 6 - aten::_to_copy 1.09% 24.242us 87.34% 1.940ms 323.288us 0.000us 0.00% 13.951us 2.325us 6 - aten::copy_ 2.08% 46.140us 84.89% 1.885ms 314.223us 11.743us 59.58% 13.951us 2.325us 6 - aten::conv1d 0.26% 5.800us 5.28% 117.333us 39.111us 0.000us 0.00% 7.968us 2.656us 3 - aten::convolution 0.51% 11.340us 5.02% 111.533us 37.178us 0.000us 0.00% 7.968us 2.656us 3 - aten::_convolution 1.02% 22.760us 4.51% 100.193us 33.398us 0.000us 0.00% 7.968us 2.656us 3 - aten::_conv_depthwise2d 0.99% 21.901us 2.76% 61.233us 20.411us 7.968us 40.42% 7.968us 2.656us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.968us 40.42% 7.968us 2.656us 3 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.176us 31.33% 6.176us 2.059us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.567us 28.24% 5.567us 1.856us 3 - Activity Buffer Request 80.76% 1.794ms 80.76% 1.794ms 1.794ms 2.208us 11.20% 2.208us 2.208us 1 - aten::empty_strided 1.36% 30.150us 1.36% 30.150us 5.025us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 2.98% 66.273us 2.98% 66.273us 7.364us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 0.76% 16.860us 0.98% 21.769us 2.419us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.39% 8.679us 0.39% 8.679us 0.579us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.42% 9.331us 0.42% 9.331us 3.110us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.42% 9.410us 0.42% 9.410us 3.137us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.27% 6.040us 0.33% 7.320us 2.440us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 332.318us 1699.66% 332.318us 332.318us 1 + torch_eager 5.93% 124.693us 99.74% 2.097ms 2.097ms 0.000us 0.00% 21.696us 21.696us 1 + aten::to 0.26% 5.560us 86.81% 1.825ms 304.128us 0.000us 0.00% 13.792us 2.299us 6 + aten::_to_copy 1.04% 21.892us 86.55% 1.819ms 303.202us 0.000us 0.00% 13.792us 2.299us 6 + aten::copy_ 2.25% 47.330us 83.99% 1.765ms 294.235us 11.648us 59.57% 13.792us 2.299us 6 + aten::conv1d 0.33% 6.849us 5.77% 121.312us 40.437us 0.000us 0.00% 7.904us 2.635us 3 + aten::convolution 0.42% 8.801us 5.45% 114.463us 38.154us 0.000us 0.00% 7.904us 2.635us 3 + aten::_convolution 1.06% 22.231us 5.03% 105.662us 35.221us 0.000us 0.00% 7.904us 2.635us 3 + aten::_conv_depthwise2d 1.31% 27.600us 3.25% 68.311us 22.770us 7.904us 40.43% 7.904us 2.635us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.904us 40.43% 7.904us 2.635us 3 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.080us 31.10% 6.080us 2.027us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.568us 28.48% 5.568us 1.856us 3 + Activity Buffer Request 79.40% 1.669ms 79.40% 1.669ms 1.669ms 2.144us 10.97% 2.144us 2.144us 1 + aten::empty_strided 1.52% 31.910us 1.52% 31.910us 5.318us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 3.37% 70.742us 3.37% 70.742us 7.860us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.77% 16.170us 1.03% 21.551us 2.395us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.43% 9.002us 0.43% 9.002us 0.600us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.47% 9.820us 0.47% 9.820us 3.273us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.44% 9.180us 0.44% 9.180us 3.060us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.25% 5.320us 0.31% 6.610us 2.203us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.221ms -Self CUDA time total: 19.711us +Self CPU time total: 2.102ms +Self CUDA time total: 19.552us @@ -4063,29 +4063,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S512_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 318.337us 1694.72% 318.337us 318.337us 1 - torch_eager 5.96% 125.666us 99.74% 2.103ms 2.103ms 0.000us 0.00% 20.799us 20.799us 1 - aten::to 0.29% 6.100us 87.07% 1.836ms 305.953us 0.000us 0.00% 13.822us 2.304us 6 - aten::_to_copy 1.12% 23.630us 86.78% 1.830ms 304.936us 0.000us 0.00% 13.822us 2.304us 6 - aten::copy_ 2.25% 47.463us 84.23% 1.776ms 295.964us 11.807us 62.86% 13.822us 2.304us 6 - aten::conv1d 0.29% 6.140us 5.41% 114.033us 38.011us 0.000us 0.00% 6.977us 2.326us 3 - aten::convolution 0.47% 9.960us 5.12% 107.893us 35.964us 0.000us 0.00% 6.977us 2.326us 3 - aten::_convolution 1.11% 23.480us 4.65% 97.933us 32.644us 0.000us 0.00% 6.977us 2.326us 3 - aten::_conv_depthwise2d 1.00% 21.010us 2.82% 59.553us 19.851us 6.977us 37.14% 6.977us 2.326us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 6.977us 37.14% 6.977us 2.326us 3 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.047us 32.19% 6.047us 2.016us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.760us 30.66% 5.760us 1.920us 3 - Activity Buffer Request 79.84% 1.683ms 79.84% 1.683ms 1.683ms 2.015us 10.73% 2.015us 2.015us 1 - aten::empty_strided 1.43% 30.201us 1.43% 30.201us 5.033us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 3.12% 65.751us 3.12% 65.751us 7.306us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 0.82% 17.281us 1.06% 22.310us 2.479us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.41% 8.589us 0.41% 8.589us 0.573us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.43% 9.162us 0.43% 9.162us 3.054us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.41% 8.730us 0.41% 8.730us 2.910us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.25% 5.310us 0.31% 6.480us 2.160us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 321.886us 1719.48% 321.886us 321.886us 1 + torch_eager 5.64% 122.557us 99.74% 2.168ms 2.168ms 0.000us 0.00% 20.704us 20.704us 1 + aten::to 0.28% 6.020us 87.70% 1.906ms 317.672us 0.000us 0.00% 13.728us 2.288us 6 + aten::_to_copy 1.02% 22.261us 87.43% 1.900ms 316.669us 0.000us 0.00% 13.728us 2.288us 6 + aten::copy_ 2.09% 45.470us 85.00% 1.847ms 307.868us 11.744us 62.74% 13.728us 2.288us 6 + aten::conv1d 0.30% 6.500us 5.24% 113.822us 37.941us 0.000us 0.00% 6.976us 2.325us 3 + aten::convolution 0.46% 10.091us 4.94% 107.322us 35.774us 0.000us 0.00% 6.976us 2.325us 3 + aten::_convolution 0.98% 21.191us 4.47% 97.231us 32.410us 0.000us 0.00% 6.976us 2.325us 3 + aten::_conv_depthwise2d 1.00% 21.810us 2.83% 61.480us 20.493us 6.976us 37.26% 6.976us 2.325us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 6.976us 37.26% 6.976us 2.325us 3 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.016us 32.14% 6.016us 2.005us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.728us 30.60% 5.728us 1.909us 3 + Activity Buffer Request 80.69% 1.754ms 80.69% 1.754ms 1.754ms 1.984us 10.60% 1.984us 1.984us 1 + aten::empty_strided 1.41% 30.541us 1.41% 30.541us 5.090us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 3.18% 69.051us 3.18% 69.051us 7.672us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.74% 16.087us 0.96% 20.947us 2.327us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.39% 8.541us 0.39% 8.541us 0.569us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.46% 9.970us 0.46% 9.970us 3.323us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.40% 8.750us 0.40% 8.750us 2.917us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.23% 5.051us 0.29% 6.381us 2.127us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.108ms -Self CUDA time total: 18.784us +Self CPU time total: 2.173ms +Self CUDA time total: 18.720us @@ -4095,29 +4095,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S512_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 331.012us 1687.37% 331.012us 331.012us 1 - torch_eager 5.07% 123.976us 99.78% 2.439ms 2.439ms 0.000us 0.00% 21.729us 21.729us 1 - aten::to 0.26% 6.330us 88.80% 2.171ms 361.846us 0.000us 0.00% 14.016us 2.336us 6 - aten::_to_copy 0.96% 23.551us 88.54% 2.165ms 360.791us 0.000us 0.00% 14.016us 2.336us 6 - aten::copy_ 1.90% 46.440us 86.33% 2.111ms 351.776us 11.904us 60.68% 14.016us 2.336us 6 - aten::conv1d 0.24% 5.859us 4.84% 118.253us 39.418us 0.000us 0.00% 7.713us 2.571us 3 - aten::convolution 0.37% 9.110us 4.60% 112.394us 37.465us 0.000us 0.00% 7.713us 2.571us 3 - aten::_convolution 0.99% 24.110us 4.22% 103.284us 34.428us 0.000us 0.00% 7.713us 2.571us 3 - aten::_conv_depthwise2d 0.86% 20.950us 2.48% 60.661us 20.220us 7.713us 39.32% 7.713us 2.571us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.713us 39.32% 7.713us 2.571us 3 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.176us 31.48% 6.176us 2.059us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.728us 29.20% 5.728us 1.909us 3 - Activity Buffer Request 74.74% 1.827ms 74.74% 1.827ms 1.827ms 2.112us 10.77% 2.112us 2.112us 1 - aten::empty_strided 1.25% 30.541us 1.25% 30.541us 5.090us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 10.54% 257.579us 10.54% 257.579us 28.620us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 0.76% 18.570us 0.98% 23.841us 2.649us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.35% 8.661us 0.35% 8.661us 0.577us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.39% 9.560us 0.39% 9.560us 3.187us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.39% 9.470us 0.39% 9.470us 3.157us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.30% 7.292us 0.35% 8.562us 2.854us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 323.964us 1656.85% 323.964us 323.964us 1 + torch_eager 5.53% 125.552us 99.77% 2.267ms 2.267ms 0.000us 0.00% 21.665us 21.665us 1 + aten::to 0.26% 5.900us 88.05% 2.001ms 333.419us 0.000us 0.00% 13.985us 2.331us 6 + aten::_to_copy 0.97% 21.980us 87.79% 1.995ms 332.436us 0.000us 0.00% 13.985us 2.331us 6 + aten::copy_ 2.06% 46.813us 85.51% 1.943ms 323.777us 11.873us 60.72% 13.985us 2.331us 6 + aten::conv1d 0.29% 6.601us 5.07% 115.272us 38.424us 0.000us 0.00% 7.680us 2.560us 3 + aten::convolution 0.40% 9.150us 4.78% 108.671us 36.224us 0.000us 0.00% 7.680us 2.560us 3 + aten::_convolution 0.96% 21.770us 4.38% 99.521us 33.174us 0.000us 0.00% 7.680us 2.560us 3 + aten::_conv_depthwise2d 0.97% 21.939us 2.77% 62.961us 20.987us 7.680us 39.28% 7.680us 2.560us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.680us 39.28% 7.680us 2.560us 3 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.177us 31.59% 6.177us 2.059us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.696us 29.13% 5.696us 1.899us 3 + Activity Buffer Request 73.99% 1.681ms 73.99% 1.681ms 1.681ms 2.112us 10.80% 2.112us 2.112us 1 + aten::empty_strided 1.32% 29.970us 1.32% 29.970us 4.995us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 10.42% 236.845us 10.42% 236.845us 26.316us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.72% 16.321us 0.94% 21.351us 2.372us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.38% 8.680us 0.38% 8.680us 0.579us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.43% 9.870us 0.43% 9.870us 3.290us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.41% 9.231us 0.41% 9.231us 3.077us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.22% 5.110us 0.28% 6.420us 2.140us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.445ms -Self CUDA time total: 19.617us +Self CPU time total: 2.272ms +Self CUDA time total: 19.553us @@ -4127,29 +4127,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S2048_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 323.714us 1318.86% 323.714us 323.714us 1 - torch_eager 5.46% 123.145us 99.75% 2.249ms 2.249ms 0.000us 0.00% 26.849us 26.849us 1 - aten::to 0.25% 5.731us 88.05% 1.985ms 330.837us 0.000us 0.00% 15.297us 2.550us 6 - aten::_to_copy 1.06% 23.869us 87.80% 1.979ms 329.882us 0.000us 0.00% 15.297us 2.550us 6 - aten::copy_ 2.13% 47.950us 85.40% 1.925ms 320.895us 12.993us 52.94% 15.297us 2.550us 6 - aten::conv1d 0.25% 5.619us 5.08% 114.483us 38.161us 0.000us 0.00% 11.552us 3.851us 3 - aten::convolution 0.41% 9.331us 4.83% 108.864us 36.288us 0.000us 0.00% 11.552us 3.851us 3 - aten::_convolution 0.98% 22.020us 4.42% 99.533us 33.178us 0.000us 0.00% 11.552us 3.851us 3 - aten::_conv_depthwise2d 0.92% 20.831us 2.69% 60.562us 20.187us 11.552us 47.06% 11.552us 3.851us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 11.552us 47.06% 11.552us 3.851us 3 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.688us 27.25% 6.688us 2.229us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.305us 25.69% 6.305us 2.102us 3 - Activity Buffer Request 74.33% 1.676ms 74.33% 1.676ms 1.676ms 2.304us 9.39% 2.304us 2.304us 1 - aten::empty_strided 1.33% 30.051us 1.33% 30.051us 5.008us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 9.91% 223.307us 9.91% 223.307us 24.812us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 0.79% 17.850us 1.04% 23.451us 2.606us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.40% 8.951us 0.40% 8.951us 0.597us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.40% 9.121us 0.40% 9.121us 3.040us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.40% 8.980us 0.40% 8.980us 2.993us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.27% 6.160us 0.32% 7.280us 2.427us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 323.612us 1313.41% 323.612us 323.612us 1 + torch_eager 5.38% 121.196us 99.77% 2.248ms 2.248ms 0.000us 0.00% 26.943us 26.943us 1 + aten::to 0.27% 6.030us 88.19% 1.987ms 331.149us 0.000us 0.00% 15.327us 2.555us 6 + aten::_to_copy 1.03% 23.263us 87.92% 1.981ms 330.144us 0.000us 0.00% 15.327us 2.555us 6 + aten::copy_ 2.13% 48.030us 85.56% 1.928ms 321.255us 13.023us 52.86% 15.327us 2.555us 6 + aten::conv1d 0.25% 5.560us 5.05% 113.792us 37.931us 0.000us 0.00% 11.616us 3.872us 3 + aten::convolution 0.43% 9.790us 4.80% 108.232us 36.077us 0.000us 0.00% 11.616us 3.872us 3 + aten::_convolution 0.97% 21.854us 4.37% 98.442us 32.814us 0.000us 0.00% 11.616us 3.872us 3 + aten::_conv_depthwise2d 0.93% 20.910us 2.73% 61.610us 20.537us 11.616us 47.14% 11.616us 3.872us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 11.616us 47.14% 11.616us 3.872us 3 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.687us 27.14% 6.687us 2.229us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.336us 25.72% 6.336us 2.112us 3 + Activity Buffer Request 75.30% 1.697ms 75.30% 1.697ms 1.697ms 2.304us 9.35% 2.304us 2.304us 1 + aten::empty_strided 1.33% 30.069us 1.33% 30.069us 5.011us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 9.09% 204.783us 9.09% 204.783us 22.754us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.75% 16.840us 0.97% 21.869us 2.430us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.39% 8.780us 0.39% 8.780us 0.585us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.43% 9.680us 0.43% 9.680us 3.227us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.41% 9.200us 0.41% 9.200us 3.067us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.23% 5.139us 0.29% 6.519us 2.173us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.254ms -Self CUDA time total: 24.545us +Self CPU time total: 2.253ms +Self CUDA time total: 24.639us @@ -4159,29 +4159,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S2048_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 332.293us 1275.69% 332.293us 332.293us 1 - torch_eager 4.94% 125.275us 99.79% 2.531ms 2.531ms 0.000us 0.00% 28.288us 28.288us 1 - aten::to 0.24% 6.110us 89.17% 2.261ms 376.887us 0.000us 0.00% 15.199us 2.533us 6 - aten::_to_copy 0.97% 24.638us 88.93% 2.255ms 375.868us 0.000us 0.00% 15.199us 2.533us 6 - aten::copy_ 1.89% 47.902us 86.77% 2.200ms 366.727us 12.959us 49.75% 15.199us 2.533us 6 - aten::conv1d 0.23% 5.820us 4.59% 116.433us 38.811us 0.000us 0.00% 13.089us 4.363us 3 - aten::convolution 0.40% 10.180us 4.36% 110.613us 36.871us 0.000us 0.00% 13.089us 4.363us 3 - aten::_convolution 0.89% 22.520us 3.96% 100.433us 33.478us 0.000us 0.00% 13.089us 4.363us 3 - aten::_conv_depthwise2d 0.83% 21.002us 2.40% 60.753us 20.251us 13.089us 50.25% 13.089us 4.363us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 13.089us 50.25% 13.089us 4.363us 3 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.591us 25.30% 6.591us 2.197us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.368us 24.45% 6.368us 2.123us 3 - Activity Buffer Request 77.38% 1.962ms 77.38% 1.962ms 1.962ms 2.240us 8.60% 2.240us 2.240us 1 - aten::empty_strided 1.19% 30.211us 1.19% 30.211us 5.035us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 8.30% 210.437us 8.30% 210.437us 23.382us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 0.72% 18.342us 0.94% 23.791us 2.643us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.36% 9.059us 0.36% 9.059us 0.604us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.39% 9.940us 0.39% 9.940us 3.313us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.37% 9.490us 0.37% 9.490us 3.163us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.28% 7.000us 0.33% 8.470us 2.823us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 323.741us 1242.82% 323.741us 323.741us 1 + torch_eager 5.26% 120.985us 99.76% 2.294ms 2.294ms 0.000us 0.00% 28.321us 28.321us 1 + aten::to 0.24% 5.569us 88.38% 2.032ms 338.735us 0.000us 0.00% 15.233us 2.539us 6 + aten::_to_copy 0.96% 21.983us 88.14% 2.027ms 337.807us 0.000us 0.00% 15.233us 2.539us 6 + aten::copy_ 2.04% 46.990us 85.81% 1.973ms 328.880us 12.961us 49.76% 15.233us 2.539us 6 + aten::conv1d 0.28% 6.482us 4.99% 114.833us 38.278us 0.000us 0.00% 13.088us 4.363us 3 + aten::convolution 0.42% 9.729us 4.71% 108.351us 36.117us 0.000us 0.00% 13.088us 4.363us 3 + aten::_convolution 0.95% 21.778us 4.29% 98.622us 32.874us 0.000us 0.00% 13.088us 4.363us 3 + aten::_conv_depthwise2d 0.90% 20.730us 2.70% 62.081us 20.694us 13.088us 50.24% 13.088us 4.363us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 13.088us 50.24% 13.088us 4.363us 3 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.657us 25.56% 6.657us 2.219us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.304us 24.20% 6.304us 2.101us 3 + Activity Buffer Request 76.06% 1.749ms 76.06% 1.749ms 1.749ms 2.272us 8.72% 2.272us 2.272us 1 + aten::empty_strided 1.37% 31.579us 1.37% 31.579us 5.263us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 8.65% 198.944us 8.65% 198.944us 22.105us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.72% 16.561us 0.94% 21.621us 2.402us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.38% 8.738us 0.38% 8.738us 0.583us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.43% 9.970us 0.43% 9.970us 3.323us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.42% 9.600us 0.42% 9.600us 3.200us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.23% 5.282us 0.28% 6.541us 2.180us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.536ms -Self CUDA time total: 26.048us +Self CPU time total: 2.300ms +Self CUDA time total: 26.049us @@ -4191,29 +4191,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S128_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 342.974us 895.45% 342.974us 342.974us 1 - torch_eager 6.28% 155.441us 99.76% 2.468ms 2.468ms 0.000us 0.00% 40.893us 40.893us 1 - aten::conv1d 0.26% 6.400us 4.96% 122.693us 40.898us 0.000us 0.00% 22.559us 7.520us 3 - aten::convolution 0.44% 10.880us 4.70% 116.293us 38.764us 0.000us 0.00% 22.559us 7.520us 3 - aten::_convolution 1.02% 25.290us 4.26% 105.413us 35.138us 0.000us 0.00% 22.559us 7.520us 3 - aten::_conv_depthwise2d 0.93% 22.901us 2.53% 62.512us 20.837us 22.559us 58.90% 22.559us 7.520us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 22.559us 58.90% 22.559us 7.520us 3 - aten::to 0.27% 6.673us 87.38% 2.161ms 360.194us 0.000us 0.00% 18.334us 3.056us 6 - aten::_to_copy 1.14% 28.279us 87.11% 2.154ms 359.082us 0.000us 0.00% 18.334us 3.056us 6 - aten::copy_ 2.07% 51.091us 84.63% 2.093ms 348.865us 15.743us 41.10% 18.334us 3.056us 6 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.352us 21.81% 8.352us 2.784us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.391us 19.30% 7.391us 2.464us 3 - Activity Buffer Request 70.52% 1.744ms 70.52% 1.744ms 1.744ms 2.591us 6.76% 2.591us 2.591us 1 - aten::empty_strided 1.34% 33.023us 1.34% 33.023us 5.504us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 12.90% 319.187us 12.90% 319.187us 35.465us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 0.77% 18.952us 0.98% 24.192us 2.688us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.37% 9.200us 0.37% 9.200us 0.613us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.37% 9.140us 0.37% 9.140us 3.047us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.37% 9.211us 0.37% 9.211us 3.070us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.27% 6.591us 0.33% 8.191us 2.730us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 353.533us 917.60% 353.533us 353.533us 1 + torch_eager 6.39% 158.244us 99.77% 2.472ms 2.472ms 0.000us 0.00% 41.120us 41.120us 1 + aten::conv1d 0.31% 7.740us 5.07% 125.493us 41.831us 0.000us 0.00% 22.624us 7.541us 3 + aten::convolution 0.49% 12.210us 4.75% 117.753us 39.251us 0.000us 0.00% 22.624us 7.541us 3 + aten::_convolution 0.97% 24.030us 4.26% 105.543us 35.181us 0.000us 0.00% 22.624us 7.541us 3 + aten::_conv_depthwise2d 0.92% 22.692us 2.61% 64.593us 21.531us 22.624us 58.72% 22.624us 7.541us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 22.624us 58.72% 22.624us 7.541us 3 + aten::to 0.28% 6.850us 87.14% 2.159ms 359.798us 0.000us 0.00% 18.496us 3.083us 6 + aten::_to_copy 1.20% 29.847us 86.86% 2.152ms 358.656us 0.000us 0.00% 18.496us 3.083us 6 + aten::copy_ 2.06% 51.053us 84.23% 2.087ms 347.776us 15.904us 41.28% 18.496us 3.083us 6 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.544us 22.18% 8.544us 2.848us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.360us 19.10% 7.360us 2.453us 3 + Activity Buffer Request 69.84% 1.730ms 69.84% 1.730ms 1.730ms 2.592us 6.73% 2.592us 2.592us 1 + aten::empty_strided 1.43% 35.432us 1.43% 35.432us 5.905us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 13.25% 328.377us 13.25% 328.377us 36.486us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.82% 20.390us 1.01% 25.110us 2.790us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.33% 8.220us 0.33% 8.220us 0.548us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.40% 9.850us 0.40% 9.850us 3.283us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.37% 9.091us 0.37% 9.091us 3.030us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.23% 5.680us 0.28% 7.040us 2.347us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.473ms -Self CUDA time total: 38.302us +Self CPU time total: 2.477ms +Self CUDA time total: 38.528us @@ -4223,29 +4223,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S128_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 320.766us 781.90% 320.766us 320.766us 1 - torch_eager 5.17% 120.612us 99.77% 2.326ms 2.326ms 0.000us 0.00% 43.616us 43.616us 1 - aten::conv1d 0.25% 5.870us 4.90% 114.223us 38.074us 0.000us 0.00% 25.312us 8.437us 3 - aten::convolution 0.44% 10.332us 4.65% 108.353us 36.118us 0.000us 0.00% 25.312us 8.437us 3 - aten::_convolution 0.97% 22.638us 4.20% 98.021us 32.674us 0.000us 0.00% 25.312us 8.437us 3 - aten::_conv_depthwise2d 0.87% 20.270us 2.54% 59.242us 19.747us 25.312us 61.70% 25.312us 8.437us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 25.312us 61.70% 25.312us 8.437us 3 - aten::to 0.24% 5.591us 88.58% 2.066ms 344.257us 0.000us 0.00% 18.304us 3.051us 6 - aten::_to_copy 1.00% 23.430us 88.34% 2.060ms 343.325us 0.000us 0.00% 18.304us 3.051us 6 - aten::copy_ 2.09% 48.750us 86.05% 2.006ms 334.417us 15.712us 38.30% 18.304us 3.051us 6 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.384us 20.44% 8.384us 2.795us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.328us 17.86% 7.328us 2.443us 3 - Activity Buffer Request 74.74% 1.743ms 74.74% 1.743ms 1.743ms 2.592us 6.32% 2.592us 2.592us 1 - aten::empty_strided 1.29% 30.021us 1.29% 30.021us 5.004us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 10.10% 235.617us 10.10% 235.617us 26.180us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 0.74% 17.250us 0.95% 22.130us 2.459us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.36% 8.290us 0.36% 8.290us 0.553us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.39% 9.191us 0.39% 9.191us 3.064us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.39% 8.990us 0.39% 8.990us 2.997us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.27% 6.321us 0.32% 7.551us 2.517us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 324.796us 788.65% 324.796us 324.796us 1 + torch_eager 5.71% 139.250us 99.77% 2.432ms 2.432ms 0.000us 0.00% 43.776us 43.776us 1 + aten::conv1d 0.23% 5.613us 4.72% 115.084us 38.361us 0.000us 0.00% 25.408us 8.469us 3 + aten::convolution 0.40% 9.718us 4.49% 109.471us 36.490us 0.000us 0.00% 25.408us 8.469us 3 + aten::_convolution 0.88% 21.472us 4.09% 99.753us 33.251us 0.000us 0.00% 25.408us 8.469us 3 + aten::_conv_depthwise2d 0.87% 21.279us 2.58% 62.930us 20.977us 25.408us 61.69% 25.408us 8.469us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 25.408us 61.69% 25.408us 8.469us 3 + aten::to 0.27% 6.521us 88.29% 2.152ms 358.726us 0.000us 0.00% 18.368us 3.061us 6 + aten::_to_copy 0.94% 22.910us 88.02% 2.146ms 357.639us 0.000us 0.00% 18.368us 3.061us 6 + aten::copy_ 1.92% 46.901us 85.84% 2.093ms 348.779us 15.776us 38.31% 18.368us 3.061us 6 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.384us 20.36% 8.384us 2.795us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.392us 17.95% 7.392us 2.464us 3 + Activity Buffer Request 72.16% 1.759ms 72.16% 1.759ms 1.759ms 2.592us 6.29% 2.592us 2.592us 1 + aten::empty_strided 1.24% 30.251us 1.24% 30.251us 5.042us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 12.68% 309.107us 12.68% 309.107us 34.345us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.70% 16.981us 0.91% 22.141us 2.460us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.36% 8.721us 0.36% 8.721us 0.581us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.41% 10.030us 0.41% 10.030us 3.343us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.38% 9.261us 0.38% 9.261us 3.087us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.21% 5.070us 0.26% 6.391us 2.130us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.332ms -Self CUDA time total: 41.024us +Self CPU time total: 2.438ms +Self CUDA time total: 41.184us @@ -4255,29 +4255,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S512_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 327.997us 319.51% 327.997us 327.997us 1 - torch_eager 5.09% 118.124us 99.77% 2.314ms 2.314ms 0.000us 0.00% 108.574us 108.574us 1 - aten::conv1d 0.23% 5.441us 4.97% 115.193us 38.398us 0.000us 0.00% 70.464us 23.488us 3 - aten::convolution 0.40% 9.290us 4.73% 109.752us 36.584us 0.000us 0.00% 70.464us 23.488us 3 - aten::_convolution 1.03% 23.820us 4.33% 100.462us 33.487us 0.000us 0.00% 70.464us 23.488us 3 - aten::_conv_depthwise2d 0.94% 21.771us 2.65% 61.512us 20.504us 70.464us 68.64% 70.464us 23.488us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 70.464us 68.64% 70.464us 23.488us 3 - aten::to 0.25% 5.729us 88.48% 2.052ms 341.956us 0.000us 0.00% 38.110us 6.352us 6 - aten::_to_copy 1.03% 23.980us 88.23% 2.046ms 341.002us 0.000us 0.00% 38.110us 6.352us 6 - aten::copy_ 2.02% 46.950us 85.91% 1.992ms 332.012us 32.191us 31.36% 38.110us 6.352us 6 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 17.567us 17.11% 17.567us 5.856us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 14.624us 14.25% 14.624us 4.875us 3 - Activity Buffer Request 76.17% 1.766ms 76.17% 1.766ms 1.766ms 5.919us 5.77% 5.919us 5.919us 1 - aten::empty_strided 1.29% 29.961us 1.29% 29.961us 4.993us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 8.62% 199.926us 8.62% 199.926us 22.214us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 0.81% 18.769us 1.04% 24.230us 2.692us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.38% 8.852us 0.38% 8.852us 0.590us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.41% 9.440us 0.41% 9.440us 3.147us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.40% 9.240us 0.40% 9.240us 3.080us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.24% 5.490us 0.29% 6.730us 2.243us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 323.325us 314.08% 323.325us 323.325us 1 + torch_eager 5.12% 121.144us 99.77% 2.360ms 2.360ms 0.000us 0.00% 108.958us 108.958us 1 + aten::conv1d 0.23% 5.500us 4.82% 114.012us 38.004us 0.000us 0.00% 70.719us 23.573us 3 + aten::convolution 0.39% 9.200us 4.59% 108.512us 36.171us 0.000us 0.00% 70.719us 23.573us 3 + aten::_convolution 0.91% 21.421us 4.20% 99.312us 33.104us 0.000us 0.00% 70.719us 23.573us 3 + aten::_conv_depthwise2d 0.94% 22.190us 2.67% 63.201us 21.067us 70.719us 68.70% 70.719us 23.573us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 70.719us 68.70% 70.719us 23.573us 3 + aten::to 0.25% 5.870us 88.73% 2.099ms 349.767us 0.000us 0.00% 38.239us 6.373us 6 + aten::_to_copy 0.93% 21.960us 88.49% 2.093ms 348.789us 0.000us 0.00% 38.239us 6.373us 6 + aten::copy_ 1.93% 45.542us 86.30% 2.041ms 340.189us 32.223us 31.30% 38.239us 6.373us 6 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 17.664us 17.16% 17.664us 5.888us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 14.559us 14.14% 14.559us 4.853us 3 + Activity Buffer Request 73.13% 1.729ms 73.13% 1.729ms 1.729ms 6.016us 5.84% 6.016us 6.016us 1 + aten::empty_strided 1.25% 29.640us 1.25% 29.640us 4.940us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 12.19% 288.286us 12.19% 288.286us 32.032us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.69% 16.350us 0.91% 21.571us 2.397us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.37% 8.791us 0.37% 8.791us 0.586us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.41% 9.750us 0.41% 9.750us 3.250us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.39% 9.120us 0.39% 9.120us 3.040us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.22% 5.150us 0.27% 6.400us 2.133us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.319ms -Self CUDA time total: 102.655us +Self CPU time total: 2.365ms +Self CUDA time total: 102.942us @@ -4287,29 +4287,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S512_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 324.863us 288.98% 324.863us 324.863us 1 - torch_eager 5.27% 120.370us 99.77% 2.281ms 2.281ms 0.000us 0.00% 118.337us 118.337us 1 - aten::conv1d 0.25% 5.800us 4.98% 113.813us 37.938us 0.000us 0.00% 80.416us 26.805us 3 - aten::convolution 0.39% 8.920us 4.72% 108.013us 36.004us 0.000us 0.00% 80.416us 26.805us 3 - aten::_convolution 0.94% 21.602us 4.33% 99.093us 33.031us 0.000us 0.00% 80.416us 26.805us 3 - aten::_conv_depthwise2d 0.98% 22.480us 2.74% 62.561us 20.854us 80.416us 71.53% 80.416us 26.805us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 80.416us 71.53% 80.416us 26.805us 3 - aten::to 0.26% 6.002us 88.33% 2.019ms 336.570us 0.000us 0.00% 37.921us 6.320us 6 - aten::_to_copy 0.97% 22.148us 88.07% 2.013ms 335.570us 0.000us 0.00% 37.921us 6.320us 6 - aten::copy_ 2.05% 46.852us 85.75% 1.960ms 326.725us 32.001us 28.47% 37.921us 6.320us 6 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 17.472us 15.54% 17.472us 5.824us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 14.529us 12.92% 14.529us 4.843us 3 - Activity Buffer Request 76.07% 1.739ms 76.07% 1.739ms 1.739ms 5.920us 5.27% 5.920us 5.920us 1 - aten::empty_strided 1.35% 30.921us 1.35% 30.921us 5.153us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 8.60% 196.555us 8.60% 196.555us 21.839us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 0.81% 18.430us 1.02% 23.240us 2.582us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.36% 8.150us 0.36% 8.150us 0.543us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.40% 9.159us 0.40% 9.159us 3.053us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.39% 8.901us 0.39% 8.901us 2.967us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.24% 5.410us 0.29% 6.670us 2.223us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 319.583us 282.68% 319.583us 319.583us 1 + torch_eager 5.26% 120.047us 99.77% 2.277ms 2.277ms 0.000us 0.00% 119.039us 119.039us 1 + aten::conv1d 0.28% 6.329us 5.02% 114.481us 38.160us 0.000us 0.00% 80.863us 26.954us 3 + aten::convolution 0.40% 9.201us 4.74% 108.152us 36.051us 0.000us 0.00% 80.863us 26.954us 3 + aten::_convolution 0.92% 21.079us 4.34% 98.951us 32.984us 0.000us 0.00% 80.863us 26.954us 3 + aten::_conv_depthwise2d 0.91% 20.811us 2.77% 63.141us 21.047us 80.863us 71.53% 80.863us 26.954us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 80.863us 71.53% 80.863us 26.954us 3 + aten::to 0.25% 5.630us 88.39% 2.017ms 336.237us 0.000us 0.00% 38.176us 6.363us 6 + aten::_to_copy 0.94% 21.550us 88.14% 2.012ms 335.299us 0.000us 0.00% 38.176us 6.363us 6 + aten::copy_ 1.99% 45.392us 85.93% 1.961ms 326.887us 32.192us 28.47% 38.176us 6.363us 6 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 17.600us 15.57% 17.600us 5.867us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 14.592us 12.91% 14.592us 4.864us 3 + Activity Buffer Request 75.53% 1.724ms 75.53% 1.724ms 1.724ms 5.984us 5.29% 5.984us 5.984us 1 + aten::empty_strided 1.27% 28.920us 1.27% 28.920us 4.820us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 9.42% 215.013us 9.42% 215.013us 23.890us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.71% 16.099us 0.92% 21.000us 2.333us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.37% 8.402us 0.37% 8.402us 0.560us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.44% 10.070us 0.44% 10.070us 3.357us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.40% 9.140us 0.40% 9.140us 3.047us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.23% 5.200us 0.28% 6.490us 2.163us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.286ms -Self CUDA time total: 112.417us +Self CPU time total: 2.283ms +Self CUDA time total: 113.055us @@ -4319,29 +4319,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S2048_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 4.92% 118.303us 96.45% 2.317ms 2.317ms 0.000us 0.00% 464.513us 464.513us 1 - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 454.847us 106.94% 454.847us 454.847us 1 - aten::conv1d 0.23% 5.570us 4.84% 116.393us 38.798us 0.000us 0.00% 280.159us 93.386us 3 - aten::convolution 0.41% 9.781us 4.61% 110.823us 36.941us 0.000us 0.00% 280.159us 93.386us 3 - aten::_convolution 0.97% 23.190us 4.21% 101.042us 33.681us 0.000us 0.00% 280.159us 93.386us 3 - aten::_conv_depthwise2d 0.92% 22.172us 2.56% 61.572us 20.524us 280.159us 65.87% 280.159us 93.386us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 280.159us 65.87% 280.159us 93.386us 3 - aten::to 0.25% 6.010us 85.54% 2.055ms 342.552us 0.000us 0.00% 184.354us 30.726us 6 - aten::_to_copy 1.07% 25.702us 85.29% 2.049ms 341.550us 0.000us 0.00% 184.354us 30.726us 6 - aten::copy_ 1.90% 45.633us 82.97% 1.993ms 332.247us 145.185us 34.13% 184.354us 30.726us 6 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 104.993us 24.68% 104.993us 34.998us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.192us 9.45% 40.192us 13.397us 3 - Activity Buffer Request 73.74% 1.772ms 73.74% 1.772ms 1.772ms 39.169us 9.21% 39.169us 39.169us 1 - aten::empty_strided 1.25% 30.119us 1.25% 30.119us 5.020us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 8.18% 196.512us 8.18% 196.512us 21.835us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 0.74% 17.761us 0.97% 23.251us 2.583us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.39% 9.350us 0.39% 9.350us 0.623us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.39% 9.400us 0.39% 9.400us 3.133us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.40% 9.580us 0.40% 9.580us 3.193us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.27% 6.480us 0.34% 8.150us 2.717us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 4.95% 116.473us 96.30% 2.266ms 2.266ms 0.000us 0.00% 463.098us 463.098us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 450.875us 106.36% 450.875us 450.875us 1 + aten::conv1d 0.27% 6.419us 4.91% 115.592us 38.531us 0.000us 0.00% 279.421us 93.140us 3 + aten::convolution 0.39% 9.200us 4.64% 109.173us 36.391us 0.000us 0.00% 279.421us 93.140us 3 + aten::_convolution 0.90% 21.142us 4.25% 99.973us 33.324us 0.000us 0.00% 279.421us 93.140us 3 + aten::_conv_depthwise2d 0.92% 21.621us 2.71% 63.861us 21.287us 279.421us 65.92% 279.421us 93.140us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 279.421us 65.92% 279.421us 93.140us 3 + aten::to 0.25% 5.769us 85.33% 2.008ms 334.609us 0.000us 0.00% 183.677us 30.613us 6 + aten::_to_copy 1.28% 30.122us 85.09% 2.002ms 333.647us 0.000us 0.00% 183.677us 30.613us 6 + aten::copy_ 1.97% 46.430us 82.58% 1.943ms 323.810us 144.478us 34.08% 183.677us 30.613us 6 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 104.191us 24.58% 104.191us 34.730us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.287us 9.50% 40.287us 13.429us 3 + Activity Buffer Request 73.06% 1.719ms 73.06% 1.719ms 1.719ms 39.199us 9.25% 39.199us 39.199us 1 + aten::empty_strided 1.23% 28.900us 1.23% 28.900us 4.817us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 8.51% 200.175us 8.51% 200.175us 22.242us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.69% 16.350us 0.92% 21.710us 2.412us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.38% 8.919us 0.38% 8.919us 0.595us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.42% 9.910us 0.42% 9.910us 3.303us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.41% 9.610us 0.41% 9.610us 3.203us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.22% 5.091us 0.27% 6.340us 2.113us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.403ms -Self CUDA time total: 425.344us +Self CPU time total: 2.353ms +Self CUDA time total: 423.899us @@ -4351,29 +4351,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S2048_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 5.04% 119.094us 95.77% 2.263ms 2.263ms 0.000us 0.00% 471.612us 471.612us 1 - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 465.149us 106.82% 465.149us 465.149us 1 - aten::conv1d 0.25% 5.900us 4.82% 113.812us 37.937us 0.000us 0.00% 297.789us 99.263us 3 - aten::convolution 0.42% 9.940us 4.57% 107.912us 35.971us 0.000us 0.00% 297.789us 99.263us 3 - aten::_convolution 0.91% 21.591us 4.15% 97.972us 32.657us 0.000us 0.00% 297.789us 99.263us 3 - aten::_conv_depthwise2d 0.88% 20.792us 2.56% 60.392us 20.131us 297.789us 68.39% 297.789us 99.263us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 297.789us 68.39% 297.789us 99.263us 3 - aten::to 0.25% 5.882us 84.81% 2.004ms 334.050us 0.000us 0.00% 173.823us 28.970us 6 - aten::_to_copy 1.03% 24.309us 84.56% 1.998ms 333.069us 0.000us 0.00% 173.823us 28.970us 6 - aten::copy_ 1.99% 46.992us 82.30% 1.945ms 324.149us 137.663us 31.61% 173.823us 28.970us 6 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 97.280us 22.34% 97.280us 32.427us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.383us 9.27% 40.383us 13.461us 3 - Activity Buffer Request 72.83% 1.721ms 72.83% 1.721ms 1.721ms 36.160us 8.30% 36.160us 36.160us 1 - aten::empty_strided 1.24% 29.211us 1.24% 29.211us 4.869us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 8.35% 197.423us 8.35% 197.423us 21.936us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 0.69% 16.359us 0.90% 21.340us 2.371us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.37% 8.812us 0.37% 8.812us 0.587us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.41% 9.620us 0.41% 9.620us 3.207us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.40% 9.410us 0.40% 9.410us 3.137us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.27% 6.299us 0.32% 7.670us 2.557us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 485.403us 111.78% 485.403us 485.403us 1 + torch_eager 5.11% 119.851us 95.84% 2.247ms 2.247ms 0.000us 0.00% 469.691us 469.691us 1 + aten::conv1d 0.25% 5.911us 5.75% 134.703us 44.901us 0.000us 0.00% 298.875us 99.625us 3 + aten::convolution 0.39% 9.088us 5.49% 128.792us 42.931us 0.000us 0.00% 298.875us 99.625us 3 + aten::_convolution 0.90% 20.984us 5.11% 119.704us 39.901us 0.000us 0.00% 298.875us 99.625us 3 + aten::_conv_depthwise2d 0.91% 21.360us 3.56% 83.490us 27.830us 298.875us 68.83% 298.875us 99.625us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 298.875us 68.83% 298.875us 99.625us 3 + aten::to 0.24% 5.731us 83.88% 1.966ms 327.731us 0.000us 0.00% 170.816us 28.469us 6 + aten::_to_copy 0.95% 22.370us 83.64% 1.961ms 326.776us 0.000us 0.00% 170.816us 28.469us 6 + aten::copy_ 2.00% 46.872us 81.44% 1.909ms 318.188us 135.360us 31.17% 170.816us 28.469us 6 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 95.008us 21.88% 95.008us 31.669us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.352us 9.29% 40.352us 13.451us 3 + Activity Buffer Request 71.79% 1.683ms 71.79% 1.683ms 1.683ms 35.456us 8.17% 35.456us 35.456us 1 + aten::empty_strided 1.24% 29.152us 1.24% 29.152us 4.859us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 8.71% 204.083us 8.71% 204.083us 22.676us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.70% 16.420us 0.92% 21.581us 2.398us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.38% 8.941us 0.38% 8.941us 0.596us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 1.19% 27.990us 1.19% 27.990us 9.330us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.41% 9.520us 0.41% 9.520us 3.173us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.23% 5.350us 0.29% 6.870us 2.290us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.363ms -Self CUDA time total: 435.452us +Self CPU time total: 2.344ms +Self CUDA time total: 434.235us @@ -4383,29 +4383,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S128_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 313.695us 1667.17% 313.695us 313.695us 1 - torch_eager 14.13% 113.614us 99.32% 798.339us 798.339us 0.000us 0.00% 20.800us 20.800us 1 - aten::to 0.70% 5.659us 68.10% 547.391us 91.232us 0.000us 0.00% 13.664us 2.277us 6 - aten::_to_copy 3.00% 24.130us 67.40% 541.732us 90.289us 0.000us 0.00% 13.664us 2.277us 6 - aten::copy_ 5.94% 47.780us 60.67% 487.691us 81.282us 11.680us 62.07% 13.664us 2.277us 6 - aten::conv1d 0.70% 5.600us 14.11% 113.433us 37.811us 0.000us 0.00% 7.136us 2.379us 3 - aten::convolution 1.23% 9.922us 13.42% 107.833us 35.944us 0.000us 0.00% 7.136us 2.379us 3 - aten::_convolution 2.79% 22.410us 12.18% 97.911us 32.637us 0.000us 0.00% 7.136us 2.379us 3 - aten::_conv_depthwise2d 2.59% 20.852us 7.49% 60.242us 20.081us 7.136us 37.93% 7.136us 2.379us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.136us 37.93% 7.136us 2.379us 3 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 5.984us 31.80% 5.984us 1.995us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.696us 30.27% 5.696us 1.899us 3 - Activity Buffer Request 32.74% 263.147us 32.74% 263.147us 263.147us 1.984us 10.54% 1.984us 1.984us 1 - aten::empty_strided 3.72% 29.911us 3.72% 29.911us 4.985us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 24.67% 198.304us 24.67% 198.304us 22.034us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 1.90% 15.298us 2.42% 19.460us 2.162us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.96% 7.694us 0.96% 7.694us 0.513us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 1.11% 8.950us 1.11% 8.950us 2.983us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 1.11% 8.900us 1.11% 8.900us 2.967us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.80% 6.399us 0.97% 7.810us 2.603us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 312.893us 1660.09% 312.893us 312.893us 1 + torch_eager 14.08% 114.574us 99.37% 808.348us 808.348us 0.000us 0.00% 20.800us 20.800us 1 + aten::to 0.69% 5.578us 68.43% 556.700us 92.783us 0.000us 0.00% 13.600us 2.267us 6 + aten::_to_copy 2.67% 21.680us 67.75% 551.122us 91.854us 0.000us 0.00% 13.600us 2.267us 6 + aten::copy_ 5.64% 45.852us 61.26% 498.351us 83.058us 11.648us 61.80% 13.600us 2.267us 6 + aten::conv1d 0.80% 6.521us 13.85% 112.642us 37.547us 0.000us 0.00% 7.200us 2.400us 3 + aten::convolution 1.21% 9.859us 13.05% 106.121us 35.374us 0.000us 0.00% 7.200us 2.400us 3 + aten::_convolution 2.61% 21.222us 11.83% 96.262us 32.087us 0.000us 0.00% 7.200us 2.400us 3 + aten::_conv_depthwise2d 2.67% 21.701us 7.50% 61.001us 20.334us 7.200us 38.20% 7.200us 2.400us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.200us 38.20% 7.200us 2.400us 3 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 5.984us 31.75% 5.984us 1.995us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.664us 30.05% 5.664us 1.888us 3 + Activity Buffer Request 33.80% 274.946us 33.80% 274.946us 274.946us 1.952us 10.36% 1.952us 1.952us 1 + aten::empty_strided 3.82% 31.091us 3.82% 31.091us 5.182us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 24.38% 198.353us 24.38% 198.353us 22.039us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 1.88% 15.291us 2.44% 19.860us 2.207us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 1.00% 8.160us 1.00% 8.160us 0.544us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 1.19% 9.720us 1.19% 9.720us 3.240us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 1.08% 8.780us 1.08% 8.780us 2.927us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.64% 5.210us 0.80% 6.500us 2.167us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 803.779us -Self CUDA time total: 18.816us +Self CPU time total: 813.488us +Self CUDA time total: 18.848us @@ -4415,29 +4415,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S128_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 315.484us 1613.56% 315.484us 315.484us 1 - torch_eager 14.44% 114.641us 99.35% 788.539us 788.539us 0.000us 0.00% 21.568us 21.568us 1 - aten::to 0.74% 5.859us 67.65% 536.963us 89.494us 0.000us 0.00% 13.632us 2.272us 6 - aten::_to_copy 3.05% 24.201us 66.91% 531.104us 88.517us 0.000us 0.00% 13.632us 2.272us 6 - aten::copy_ 6.01% 47.671us 60.17% 477.592us 79.599us 11.616us 59.41% 13.632us 2.272us 6 - aten::conv1d 0.73% 5.820us 14.10% 111.923us 37.308us 0.000us 0.00% 7.936us 2.645us 3 - aten::convolution 1.13% 8.991us 13.37% 106.103us 35.368us 0.000us 0.00% 7.936us 2.645us 3 - aten::_convolution 2.72% 21.550us 12.24% 97.112us 32.371us 0.000us 0.00% 7.936us 2.645us 3 - aten::_conv_depthwise2d 2.66% 21.080us 7.67% 60.892us 20.297us 7.936us 40.59% 7.936us 2.645us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.936us 40.59% 7.936us 2.645us 3 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 5.920us 30.28% 5.920us 1.973us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.696us 29.13% 5.696us 1.899us 3 - Activity Buffer Request 32.17% 255.357us 32.17% 255.357us 255.357us 2.016us 10.31% 2.016us 2.016us 1 - aten::empty_strided 3.69% 29.311us 3.69% 29.311us 4.885us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 24.68% 195.894us 24.68% 195.894us 21.766us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 2.03% 16.101us 2.61% 20.721us 2.302us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 1.02% 8.070us 1.02% 8.070us 0.538us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 1.14% 9.082us 1.14% 9.082us 3.027us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 1.18% 9.400us 1.18% 9.400us 3.133us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.66% 5.230us 0.82% 6.480us 2.160us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 316.950us 1626.55% 316.950us 316.950us 1 + torch_eager 14.00% 113.783us 99.35% 807.628us 807.628us 0.000us 0.00% 21.437us 21.437us 1 + aten::to 0.71% 5.740us 68.34% 555.502us 92.584us 0.000us 0.00% 13.502us 2.250us 6 + aten::_to_copy 2.66% 21.649us 67.63% 549.762us 91.627us 0.000us 0.00% 13.502us 2.250us 6 + aten::copy_ 5.63% 45.791us 61.33% 498.512us 83.085us 11.551us 59.28% 13.502us 2.250us 6 + aten::conv1d 0.67% 5.481us 13.97% 113.553us 37.851us 0.000us 0.00% 7.935us 2.645us 3 + aten::convolution 1.11% 9.000us 13.29% 108.072us 36.024us 0.000us 0.00% 7.935us 2.645us 3 + aten::_convolution 2.66% 21.620us 12.19% 99.072us 33.024us 0.000us 0.00% 7.935us 2.645us 3 + aten::_conv_depthwise2d 2.55% 20.761us 7.74% 62.951us 20.984us 7.935us 40.72% 7.935us 2.645us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.935us 40.72% 7.935us 2.645us 3 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 5.887us 30.21% 5.887us 1.962us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.664us 29.07% 5.664us 1.888us 3 + Activity Buffer Request 33.87% 275.306us 33.87% 275.306us 275.306us 1.951us 10.01% 1.951us 1.951us 1 + aten::empty_strided 3.64% 29.601us 3.64% 29.601us 4.934us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 24.66% 200.485us 24.66% 200.485us 22.276us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 1.94% 15.809us 2.55% 20.761us 2.307us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 1.03% 8.362us 1.03% 8.362us 0.557us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 1.19% 9.700us 1.19% 9.700us 3.233us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 1.16% 9.420us 1.16% 9.420us 3.140us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.65% 5.250us 0.78% 6.380us 2.127us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 793.700us -Self CUDA time total: 19.552us +Self CPU time total: 812.878us +Self CUDA time total: 19.486us @@ -4447,29 +4447,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S512_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 319.227us 1646.26% 319.227us 319.227us 1 - torch_eager 14.35% 115.202us 99.35% 797.750us 797.750us 0.000us 0.00% 21.567us 21.567us 1 - aten::to 0.73% 5.879us 67.70% 543.593us 90.599us 0.000us 0.00% 14.367us 2.394us 6 - aten::_to_copy 2.99% 23.982us 66.96% 537.714us 89.619us 0.000us 0.00% 14.367us 2.394us 6 - aten::copy_ 5.85% 46.982us 60.24% 483.712us 80.619us 12.191us 62.87% 14.367us 2.394us 6 - aten::conv1d 0.69% 5.560us 14.06% 112.933us 37.644us 0.000us 0.00% 7.200us 2.400us 3 - aten::convolution 1.13% 9.060us 13.37% 107.373us 35.791us 0.000us 0.00% 7.200us 2.400us 3 - aten::_convolution 2.73% 21.891us 12.24% 98.313us 32.771us 0.000us 0.00% 7.200us 2.400us 3 - aten::_conv_depthwise2d 2.58% 20.719us 7.42% 59.602us 19.867us 7.200us 37.13% 7.200us 2.400us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.200us 37.13% 7.200us 2.400us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 316.954us 1634.46% 316.954us 316.954us 1 + torch_eager 14.55% 117.095us 99.34% 799.337us 799.337us 0.000us 0.00% 21.600us 21.600us 1 + aten::to 0.68% 5.480us 67.45% 542.761us 90.460us 0.000us 0.00% 14.432us 2.405us 6 + aten::_to_copy 2.68% 21.530us 66.77% 537.281us 89.547us 0.000us 0.00% 14.432us 2.405us 6 + aten::copy_ 5.75% 46.293us 60.38% 485.871us 80.978us 12.224us 63.04% 14.432us 2.405us 6 + aten::conv1d 0.85% 6.860us 14.15% 113.831us 37.944us 0.000us 0.00% 7.168us 2.389us 3 + aten::convolution 1.15% 9.280us 13.29% 106.971us 35.657us 0.000us 0.00% 7.168us 2.389us 3 + aten::_convolution 2.68% 21.591us 12.14% 97.691us 32.564us 0.000us 0.00% 7.168us 2.389us 3 + aten::_conv_depthwise2d 2.56% 20.631us 7.61% 61.261us 20.420us 7.168us 36.96% 7.168us 2.389us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.168us 36.96% 7.168us 2.389us 3 void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.272us 32.34% 6.272us 2.091us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.919us 30.52% 5.919us 1.973us 3 - Activity Buffer Request 32.74% 262.897us 32.74% 262.897us 262.897us 2.176us 11.22% 2.176us 2.176us 1 - aten::empty_strided 3.74% 30.020us 3.74% 30.020us 5.003us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 24.16% 194.015us 24.16% 194.015us 21.557us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 2.13% 17.072us 2.73% 21.901us 2.433us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 1.03% 8.259us 1.03% 8.259us 0.551us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 1.15% 9.210us 1.15% 9.210us 3.070us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 1.18% 9.491us 1.18% 9.491us 3.164us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.90% 7.200us 1.06% 8.540us 2.847us 0.000us 0.00% 0.000us 0.000us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.952us 30.69% 5.952us 1.984us 3 + Activity Buffer Request 32.96% 265.175us 32.96% 265.175us 265.175us 2.208us 11.39% 2.208us 2.208us 1 + aten::empty_strided 3.71% 29.880us 3.71% 29.880us 4.980us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 24.44% 196.663us 24.44% 196.663us 21.851us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 2.01% 16.189us 2.67% 21.519us 2.391us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 1.09% 8.750us 1.09% 8.750us 0.583us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 1.23% 9.890us 1.23% 9.890us 3.297us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 1.05% 8.480us 1.05% 8.480us 2.827us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.65% 5.220us 0.80% 6.399us 2.133us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 802.990us -Self CUDA time total: 19.391us +Self CPU time total: 804.637us +Self CUDA time total: 19.392us @@ -4479,29 +4479,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S512_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 318.585us 1587.92% 318.585us 318.585us 1 - torch_eager 14.32% 114.402us 99.36% 793.540us 793.540us 0.000us 0.00% 22.271us 22.271us 1 - aten::to 0.74% 5.889us 67.47% 538.893us 89.816us 0.000us 0.00% 14.368us 2.395us 6 - aten::_to_copy 2.86% 22.853us 66.74% 533.004us 88.834us 0.000us 0.00% 14.368us 2.395us 6 - aten::copy_ 5.91% 47.181us 60.05% 479.601us 79.933us 12.160us 60.61% 14.368us 2.395us 6 - aten::conv1d 0.72% 5.730us 14.38% 114.863us 38.288us 0.000us 0.00% 7.903us 2.634us 3 - aten::convolution 1.15% 9.210us 13.66% 109.133us 36.378us 0.000us 0.00% 7.903us 2.634us 3 - aten::_convolution 2.90% 23.191us 12.51% 99.923us 33.308us 0.000us 0.00% 7.903us 2.634us 3 - aten::_conv_depthwise2d 2.58% 20.640us 7.49% 59.852us 19.951us 7.903us 39.39% 7.903us 2.634us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.903us 39.39% 7.903us 2.634us 3 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.304us 31.42% 6.304us 2.101us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.856us 29.19% 5.856us 1.952us 3 - Activity Buffer Request 32.57% 260.117us 32.57% 260.117us 260.117us 2.208us 11.01% 2.208us 2.208us 1 - aten::empty_strided 3.83% 30.550us 3.83% 30.550us 5.092us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 24.14% 192.815us 24.14% 192.815us 21.424us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 2.21% 17.653us 2.81% 22.412us 2.490us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 1.03% 8.198us 1.03% 8.198us 0.547us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 1.14% 9.130us 1.14% 9.130us 3.043us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 1.20% 9.570us 1.20% 9.570us 3.190us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.77% 6.130us 0.93% 7.460us 2.487us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 317.850us 1579.14% 317.850us 317.850us 1 + torch_eager 14.46% 115.693us 99.28% 794.297us 794.297us 0.000us 0.00% 22.336us 22.336us 1 + aten::to 0.72% 5.761us 67.50% 540.021us 90.003us 0.000us 0.00% 14.368us 2.395us 6 + aten::_to_copy 2.69% 21.489us 66.78% 534.260us 89.043us 0.000us 0.00% 14.368us 2.395us 6 + aten::copy_ 5.80% 46.383us 60.34% 482.741us 80.457us 12.160us 60.41% 14.368us 2.395us 6 + aten::conv1d 0.71% 5.650us 14.21% 113.673us 37.891us 0.000us 0.00% 7.968us 2.656us 3 + aten::convolution 1.34% 10.740us 13.50% 108.023us 36.008us 0.000us 0.00% 7.968us 2.656us 3 + aten::_convolution 2.73% 21.881us 12.16% 97.283us 32.428us 0.000us 0.00% 7.968us 2.656us 3 + aten::_conv_depthwise2d 2.59% 20.703us 7.59% 60.702us 20.234us 7.968us 39.59% 7.968us 2.656us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.968us 39.59% 7.968us 2.656us 3 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.240us 31.00% 6.240us 2.080us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.920us 29.41% 5.920us 1.973us 3 + Activity Buffer Request 32.71% 261.746us 32.71% 261.746us 261.746us 2.208us 10.97% 2.208us 2.208us 1 + aten::empty_strided 3.75% 30.030us 3.75% 30.030us 5.005us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 24.46% 195.702us 24.46% 195.702us 21.745us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 1.97% 15.730us 2.60% 20.791us 2.310us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 1.06% 8.481us 1.06% 8.481us 0.565us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 1.20% 9.570us 1.20% 9.570us 3.190us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 1.17% 9.339us 1.17% 9.339us 3.113us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.65% 5.220us 0.79% 6.359us 2.120us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 798.670us -Self CUDA time total: 20.063us +Self CPU time total: 800.086us +Self CUDA time total: 20.128us @@ -4511,29 +4511,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S2048_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 325.920us 904.53% 325.920us 325.920us 1 - torch_eager 14.34% 116.794us 99.35% 809.360us 809.360us 0.000us 0.00% 38.624us 38.624us 1 - aten::conv1d 0.72% 5.870us 14.19% 115.602us 38.534us 0.000us 0.00% 20.192us 6.731us 3 - aten::convolution 1.16% 9.420us 13.47% 109.732us 36.577us 0.000us 0.00% 20.192us 6.731us 3 - aten::_convolution 2.80% 22.850us 12.31% 100.312us 33.437us 0.000us 0.00% 20.192us 6.731us 3 - aten::_conv_depthwise2d 2.58% 21.022us 7.49% 61.052us 20.351us 20.192us 56.04% 20.192us 6.731us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 20.192us 56.04% 20.192us 6.731us 3 - aten::to 0.73% 5.980us 67.60% 550.703us 91.784us 0.000us 0.00% 18.432us 3.072us 6 - aten::_to_copy 2.92% 23.789us 66.86% 544.723us 90.787us 0.000us 0.00% 18.432us 3.072us 6 - aten::copy_ 5.93% 48.281us 60.23% 490.683us 81.781us 15.840us 43.96% 18.432us 3.072us 6 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.448us 23.45% 8.448us 2.816us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.392us 20.52% 7.392us 2.464us 3 - Activity Buffer Request 30.82% 251.076us 30.82% 251.076us 251.076us 2.592us 7.19% 2.592us 2.592us 1 - aten::empty_strided 3.71% 30.251us 3.71% 30.251us 5.042us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 26.06% 212.336us 26.06% 212.336us 23.593us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 2.07% 16.901us 2.67% 21.731us 2.415us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 1.01% 8.191us 1.01% 8.191us 0.546us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 1.17% 9.540us 1.17% 9.540us 3.180us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 1.16% 9.480us 1.16% 9.480us 3.160us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.88% 7.140us 1.02% 8.340us 2.780us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 317.817us 880.50% 317.817us 317.817us 1 + torch_eager 14.26% 114.771us 99.34% 799.407us 799.407us 0.000us 0.00% 38.718us 38.718us 1 + aten::conv1d 0.73% 5.881us 14.18% 114.103us 38.034us 0.000us 0.00% 20.223us 6.741us 3 + aten::convolution 1.23% 9.899us 13.45% 108.222us 36.074us 0.000us 0.00% 20.223us 6.741us 3 + aten::_convolution 2.58% 20.781us 12.22% 98.323us 32.774us 0.000us 0.00% 20.223us 6.741us 3 + aten::_conv_depthwise2d 2.56% 20.581us 7.80% 62.731us 20.910us 20.223us 56.03% 20.223us 6.741us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 20.223us 56.03% 20.223us 6.741us 3 + aten::to 0.70% 5.603us 67.84% 545.903us 90.984us 0.000us 0.00% 18.495us 3.083us 6 + aten::_to_copy 2.70% 21.701us 67.14% 540.300us 90.050us 0.000us 0.00% 18.495us 3.083us 6 + aten::copy_ 5.63% 45.281us 60.69% 488.359us 81.393us 15.872us 43.97% 18.495us 3.083us 6 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.511us 23.58% 8.511us 2.837us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.361us 20.39% 7.361us 2.454us 3 + Activity Buffer Request 33.10% 266.365us 33.10% 266.365us 266.365us 2.623us 7.27% 2.623us 2.623us 1 + aten::empty_strided 3.76% 30.240us 3.76% 30.240us 5.040us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 24.83% 199.834us 24.83% 199.834us 22.204us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 1.98% 15.970us 2.59% 20.860us 2.318us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 1.05% 8.430us 1.05% 8.430us 0.562us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 1.23% 9.879us 1.23% 9.879us 3.293us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 1.14% 9.150us 1.14% 9.150us 3.050us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.64% 5.171us 0.80% 6.441us 2.147us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 814.681us -Self CUDA time total: 36.032us +Self CPU time total: 804.678us +Self CUDA time total: 36.095us @@ -4543,29 +4543,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S2048_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 322.527us 848.40% 322.527us 322.527us 1 - torch_eager 5.15% 118.831us 99.77% 2.301ms 2.301ms 0.000us 0.00% 40.608us 40.608us 1 - aten::conv1d 0.25% 5.790us 4.95% 114.162us 38.054us 0.000us 0.00% 22.272us 7.424us 3 - aten::convolution 0.40% 9.280us 4.70% 108.372us 36.124us 0.000us 0.00% 22.272us 7.424us 3 - aten::_convolution 0.97% 22.470us 4.30% 99.092us 33.031us 0.000us 0.00% 22.272us 7.424us 3 - aten::_conv_depthwise2d 0.93% 21.479us 2.62% 60.532us 20.177us 22.272us 58.59% 22.272us 7.424us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 22.272us 58.59% 22.272us 7.424us 3 - aten::to 0.26% 6.081us 88.49% 2.041ms 340.127us 0.000us 0.00% 18.336us 3.056us 6 - aten::_to_copy 1.04% 23.880us 88.22% 2.035ms 339.113us 0.000us 0.00% 18.336us 3.056us 6 - aten::copy_ 2.05% 47.372us 85.91% 1.981ms 330.228us 15.744us 41.41% 18.336us 3.056us 6 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.416us 22.14% 8.416us 2.805us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.328us 19.28% 7.328us 2.443us 3 - Activity Buffer Request 76.31% 1.760ms 76.31% 1.760ms 1.760ms 2.592us 6.82% 2.592us 2.592us 1 - aten::empty_strided 1.28% 29.431us 1.28% 29.431us 4.905us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 8.43% 194.465us 8.43% 194.465us 21.607us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 0.76% 17.531us 0.99% 22.821us 2.536us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.40% 9.119us 0.40% 9.119us 0.608us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.40% 9.181us 0.40% 9.181us 3.060us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.41% 9.511us 0.41% 9.511us 3.170us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.28% 6.360us 0.35% 8.040us 2.680us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 317.472us 827.44% 317.472us 317.472us 1 + torch_eager 5.39% 120.521us 99.74% 2.231ms 2.231ms 0.000us 0.00% 40.993us 40.993us 1 + aten::conv1d 0.34% 7.591us 5.06% 113.112us 37.704us 0.000us 0.00% 22.432us 7.477us 3 + aten::convolution 0.40% 8.869us 4.72% 105.521us 35.174us 0.000us 0.00% 22.432us 7.477us 3 + aten::_convolution 0.93% 20.812us 4.32% 96.652us 32.217us 0.000us 0.00% 22.432us 7.477us 3 + aten::_conv_depthwise2d 0.93% 20.800us 2.73% 61.101us 20.367us 22.432us 58.47% 22.432us 7.477us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 22.432us 58.47% 22.432us 7.477us 3 + aten::to 0.25% 5.601us 88.17% 1.973ms 328.759us 0.000us 0.00% 18.561us 3.093us 6 + aten::_to_copy 0.97% 21.650us 87.92% 1.967ms 327.825us 0.000us 0.00% 18.561us 3.093us 6 + aten::copy_ 2.07% 46.341us 85.60% 1.915ms 319.179us 15.936us 41.53% 18.561us 3.093us 6 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.545us 22.27% 8.545us 2.848us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.391us 19.26% 7.391us 2.464us 3 + Activity Buffer Request 75.81% 1.696ms 75.81% 1.696ms 1.696ms 2.625us 6.84% 2.625us 2.625us 1 + aten::empty_strided 1.35% 30.230us 1.35% 30.230us 5.038us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 8.67% 194.015us 8.67% 194.015us 21.557us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.71% 15.960us 0.93% 20.871us 2.319us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.38% 8.501us 0.38% 8.501us 0.567us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.44% 9.900us 0.44% 9.900us 3.300us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.41% 9.100us 0.41% 9.100us 3.033us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.23% 5.230us 0.29% 6.510us 2.170us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.306ms -Self CUDA time total: 38.016us +Self CPU time total: 2.237ms +Self CUDA time total: 38.368us @@ -4575,29 +4575,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S128_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 334.615us 522.09% 334.615us 334.615us 1 - torch_eager 14.27% 115.903us 99.37% 807.269us 807.269us 0.000us 0.00% 68.188us 68.188us 1 - aten::conv1d 0.69% 5.580us 15.60% 126.753us 42.251us 0.000us 0.00% 41.662us 13.887us 3 - aten::convolution 1.31% 10.620us 14.92% 121.173us 40.391us 0.000us 0.00% 41.662us 13.887us 3 - aten::_convolution 3.95% 32.112us 13.61% 110.553us 36.851us 0.000us 0.00% 41.662us 13.887us 3 - aten::_conv_depthwise2d 2.71% 21.990us 7.82% 63.492us 21.164us 41.662us 65.00% 41.662us 13.887us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 41.662us 65.00% 41.662us 13.887us 3 - aten::to 0.71% 5.728us 66.32% 538.813us 89.802us 0.000us 0.00% 26.526us 4.421us 6 - aten::_to_copy 2.82% 22.881us 65.62% 533.085us 88.848us 0.000us 0.00% 26.526us 4.421us 6 - aten::copy_ 5.89% 47.814us 59.03% 479.514us 79.919us 22.430us 35.00% 26.526us 4.421us 6 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 18.72% 12.000us 4.000us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.430us 16.27% 10.430us 3.477us 3 - Activity Buffer Request 31.09% 252.576us 31.09% 252.576us 252.576us 4.096us 6.39% 4.096us 4.096us 1 - aten::empty_strided 3.78% 30.690us 3.78% 30.690us 5.115us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 24.86% 201.964us 24.86% 201.964us 22.440us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 2.01% 16.320us 2.66% 21.600us 2.400us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 1.07% 8.700us 1.07% 8.700us 0.580us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 1.17% 9.541us 1.17% 9.541us 3.180us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 1.12% 9.121us 1.12% 9.121us 3.040us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.69% 5.569us 0.85% 6.929us 2.310us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 317.307us 494.31% 317.307us 317.307us 1 + torch_eager 14.64% 116.902us 99.38% 793.617us 793.617us 0.000us 0.00% 68.288us 68.288us 1 + aten::conv1d 0.71% 5.701us 14.04% 112.123us 37.374us 0.000us 0.00% 41.760us 13.920us 3 + aten::convolution 1.27% 10.150us 13.33% 106.422us 35.474us 0.000us 0.00% 41.760us 13.920us 3 + aten::_convolution 2.67% 21.299us 12.06% 96.272us 32.091us 0.000us 0.00% 41.760us 13.920us 3 + aten::_conv_depthwise2d 2.60% 20.762us 7.62% 60.882us 20.294us 41.760us 65.05% 41.760us 13.920us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 41.760us 65.05% 41.760us 13.920us 3 + aten::to 0.69% 5.531us 67.64% 540.181us 90.030us 0.000us 0.00% 26.528us 4.421us 6 + aten::_to_copy 2.67% 21.339us 66.95% 534.650us 89.108us 0.000us 0.00% 26.528us 4.421us 6 + aten::copy_ 5.72% 45.689us 60.26% 481.250us 80.208us 22.432us 34.95% 26.528us 4.421us 6 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 18.69% 12.000us 4.000us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.432us 16.25% 10.432us 3.477us 3 + Activity Buffer Request 32.36% 258.416us 32.36% 258.416us 258.416us 4.096us 6.38% 4.096us 4.096us 1 + aten::empty_strided 4.01% 32.061us 4.01% 32.061us 5.344us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 24.87% 198.596us 24.87% 198.596us 22.066us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 1.93% 15.422us 2.51% 20.041us 2.227us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 1.02% 8.120us 1.02% 8.120us 0.541us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 1.20% 9.610us 1.20% 9.610us 3.203us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 1.13% 9.059us 1.13% 9.059us 3.020us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.65% 5.190us 0.81% 6.450us 2.150us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 812.390us -Self CUDA time total: 64.092us +Self CPU time total: 798.567us +Self CUDA time total: 64.192us @@ -4607,29 +4607,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S128_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 333.628us 479.14% 333.628us 333.628us 1 - torch_eager 20.40% 175.467us 99.34% 854.292us 854.292us 0.000us 0.00% 73.758us 73.758us 1 - aten::conv1d 0.66% 5.659us 14.42% 124.043us 41.348us 0.000us 0.00% 47.231us 15.744us 3 - aten::convolution 1.14% 9.780us 13.77% 118.384us 39.461us 0.000us 0.00% 47.231us 15.744us 3 - aten::_convolution 2.67% 22.962us 12.63% 108.604us 36.201us 0.000us 0.00% 47.231us 15.744us 3 - aten::_conv_depthwise2d 2.49% 21.432us 8.04% 69.183us 23.061us 47.231us 67.83% 47.231us 15.744us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 47.231us 67.83% 47.231us 15.744us 3 - aten::to 0.69% 5.899us 61.45% 528.412us 88.069us 0.000us 0.00% 26.527us 4.421us 6 - aten::_to_copy 2.66% 22.900us 60.76% 522.513us 87.086us 0.000us 0.00% 26.527us 4.421us 6 - aten::copy_ 5.49% 47.171us 54.63% 469.832us 78.305us 22.399us 32.17% 26.527us 4.421us 6 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 11.968us 17.19% 11.968us 3.989us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.431us 14.98% 10.431us 3.477us 3 - Activity Buffer Request 28.72% 246.977us 28.72% 246.977us 246.977us 4.128us 5.93% 4.128us 4.128us 1 - aten::empty_strided 3.46% 29.781us 3.46% 29.781us 4.964us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 22.90% 196.964us 22.90% 196.964us 21.885us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 1.98% 17.050us 2.57% 22.090us 2.454us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 1.01% 8.700us 1.01% 8.700us 0.580us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 1.95% 16.810us 1.95% 16.810us 5.603us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 1.12% 9.661us 1.12% 9.661us 3.220us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.74% 6.399us 0.92% 7.879us 2.626us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 319.646us 457.16% 319.646us 319.646us 1 + torch_eager 19.98% 173.384us 99.30% 861.709us 861.709us 0.000us 0.00% 74.048us 74.048us 1 + aten::conv1d 0.75% 6.498us 13.19% 114.441us 38.147us 0.000us 0.00% 47.392us 15.797us 3 + aten::convolution 1.03% 8.971us 12.44% 107.943us 35.981us 0.000us 0.00% 47.392us 15.797us 3 + aten::_convolution 2.49% 21.580us 11.41% 98.972us 32.991us 0.000us 0.00% 47.392us 15.797us 3 + aten::_conv_depthwise2d 2.51% 21.761us 7.21% 62.552us 20.851us 47.392us 67.78% 47.392us 15.797us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 47.392us 67.78% 47.392us 15.797us 3 + aten::to 0.64% 5.540us 63.21% 548.512us 91.419us 0.000us 0.00% 26.656us 4.443us 6 + aten::_to_copy 2.49% 21.601us 62.57% 542.972us 90.495us 0.000us 0.00% 26.656us 4.443us 6 + aten::copy_ 5.20% 45.112us 56.72% 492.221us 82.037us 22.528us 32.22% 26.656us 4.443us 6 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 17.16% 12.000us 4.000us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.528us 15.06% 10.528us 3.509us 3 + Activity Buffer Request 31.22% 270.946us 31.22% 270.946us 270.946us 4.128us 5.90% 4.128us 4.128us 1 + aten::empty_strided 3.36% 29.150us 3.36% 29.150us 4.858us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 22.77% 197.623us 22.77% 197.623us 21.958us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 1.83% 15.841us 2.40% 20.801us 2.311us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.99% 8.559us 0.99% 8.559us 0.571us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 1.14% 9.871us 1.14% 9.871us 3.290us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 1.09% 9.460us 1.09% 9.460us 3.153us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.62% 5.360us 0.78% 6.760us 2.253us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 859.952us -Self CUDA time total: 69.630us +Self CPU time total: 867.779us +Self CUDA time total: 69.920us @@ -4639,29 +4639,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S512_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 346.806us 186.86% 346.806us 346.806us 1 - torch_eager 14.53% 117.325us 99.36% 802.090us 802.090us 0.000us 0.00% 195.516us 195.516us 1 - aten::conv1d 0.70% 5.679us 15.36% 123.982us 41.327us 0.000us 0.00% 133.212us 44.404us 3 - aten::convolution 1.23% 9.941us 14.65% 118.303us 39.434us 0.000us 0.00% 133.212us 44.404us 3 - aten::_convolution 2.76% 22.310us 13.42% 108.362us 36.121us 0.000us 0.00% 133.212us 44.404us 3 - aten::_conv_depthwise2d 3.54% 28.551us 8.44% 68.171us 22.724us 133.212us 71.78% 133.212us 44.404us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 133.212us 71.78% 133.212us 44.404us 3 - aten::to 0.73% 5.930us 66.21% 534.532us 89.089us 0.000us 0.00% 62.304us 10.384us 6 - aten::_to_copy 2.86% 23.081us 65.48% 528.602us 88.100us 0.000us 0.00% 62.304us 10.384us 6 - aten::copy_ 5.96% 48.102us 58.83% 474.941us 79.157us 52.384us 28.22% 62.304us 10.384us 6 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 29.504us 15.90% 29.504us 9.835us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.880us 12.33% 22.880us 7.627us 3 - Activity Buffer Request 31.11% 251.176us 31.11% 251.176us 251.176us 9.920us 5.34% 9.920us 9.920us 1 - aten::empty_strided 3.79% 30.580us 3.79% 30.580us 5.097us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 24.34% 196.463us 24.34% 196.463us 21.829us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 2.06% 16.611us 2.71% 21.892us 2.432us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 1.12% 9.041us 1.12% 9.041us 0.603us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 1.13% 9.090us 1.13% 9.090us 3.030us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 1.21% 9.730us 1.21% 9.730us 3.243us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.95% 7.689us 1.15% 9.269us 3.090us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 330.102us 177.34% 330.102us 330.102us 1 + torch_eager 14.69% 116.034us 99.32% 784.747us 784.747us 0.000us 0.00% 196.253us 196.253us 1 + aten::conv1d 0.71% 5.619us 14.22% 112.371us 37.457us 0.000us 0.00% 133.597us 44.532us 3 + aten::convolution 1.12% 8.850us 13.51% 106.752us 35.584us 0.000us 0.00% 133.597us 44.532us 3 + aten::_convolution 2.58% 20.371us 12.39% 97.902us 32.634us 0.000us 0.00% 133.597us 44.532us 3 + aten::_conv_depthwise2d 2.74% 21.650us 7.96% 62.881us 20.960us 133.597us 71.77% 133.597us 44.532us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 133.597us 71.77% 133.597us 44.532us 3 + aten::to 0.71% 5.639us 67.25% 531.321us 88.554us 0.000us 0.00% 62.656us 10.443us 6 + aten::_to_copy 2.76% 21.830us 66.53% 525.682us 87.614us 0.000us 0.00% 62.656us 10.443us 6 + aten::copy_ 5.82% 46.000us 59.91% 473.350us 78.892us 52.544us 28.23% 62.656us 10.443us 6 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 29.568us 15.88% 29.568us 9.856us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.976us 12.34% 22.976us 7.659us 3 + Activity Buffer Request 31.98% 252.705us 31.98% 252.705us 252.705us 10.112us 5.43% 10.112us 10.112us 1 + aten::empty_strided 3.86% 30.502us 3.86% 30.502us 5.084us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 24.88% 196.555us 24.88% 196.555us 21.839us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 2.01% 15.900us 2.64% 20.820us 2.313us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 1.09% 8.601us 1.09% 8.601us 0.573us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 1.21% 9.550us 1.21% 9.550us 3.183us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 1.24% 9.771us 1.24% 9.771us 3.257us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.65% 5.130us 0.82% 6.480us 2.160us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 807.270us -Self CUDA time total: 185.596us +Self CPU time total: 790.087us +Self CUDA time total: 186.141us @@ -4671,29 +4671,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S512_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 339.579us 162.49% 339.579us 339.579us 1 - torch_eager 14.47% 115.372us 99.34% 792.230us 792.230us 0.000us 0.00% 222.202us 222.202us 1 - aten::conv1d 0.71% 5.650us 14.39% 114.783us 38.261us 0.000us 0.00% 153.629us 51.210us 3 - aten::convolution 1.12% 8.960us 13.68% 109.133us 36.378us 0.000us 0.00% 153.629us 51.210us 3 - aten::_convolution 2.81% 22.409us 12.56% 100.173us 33.391us 0.000us 0.00% 153.629us 51.210us 3 - aten::_conv_depthwise2d 2.65% 21.140us 7.72% 61.571us 20.524us 153.629us 73.51% 153.629us 51.210us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 153.629us 73.51% 153.629us 51.210us 3 - aten::to 0.74% 5.880us 67.17% 535.703us 89.284us 0.000us 0.00% 68.573us 11.429us 6 - aten::_to_copy 2.96% 23.602us 66.44% 529.823us 88.304us 0.000us 0.00% 68.573us 11.429us 6 - aten::copy_ 5.90% 47.080us 59.79% 476.780us 79.463us 55.357us 26.49% 68.573us 11.429us 6 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 32.351us 15.48% 32.351us 10.784us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 23.006us 11.01% 23.006us 7.669us 3 - Activity Buffer Request 31.17% 248.536us 31.17% 248.536us 248.536us 13.216us 6.32% 13.216us 13.216us 1 - aten::empty_strided 3.69% 29.441us 3.69% 29.441us 4.907us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 25.34% 202.095us 25.34% 202.095us 22.455us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 2.16% 17.223us 2.82% 22.464us 2.496us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 1.10% 8.783us 1.10% 8.783us 0.586us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 1.15% 9.160us 1.15% 9.160us 3.053us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 1.30% 10.340us 1.30% 10.340us 3.447us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.78% 6.220us 0.94% 7.501us 2.500us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 339.514us 161.81% 339.514us 339.514us 1 + torch_eager 14.66% 116.206us 99.33% 787.167us 787.167us 0.000us 0.00% 223.098us 223.098us 1 + aten::conv1d 0.68% 5.420us 14.44% 114.442us 38.147us 0.000us 0.00% 153.917us 51.306us 3 + aten::convolution 1.14% 9.070us 13.76% 109.022us 36.341us 0.000us 0.00% 153.917us 51.306us 3 + aten::_convolution 2.75% 21.821us 12.61% 99.952us 33.317us 0.000us 0.00% 153.917us 51.306us 3 + aten::_conv_depthwise2d 2.79% 22.080us 7.98% 63.231us 21.077us 153.917us 73.36% 153.917us 51.306us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 153.917us 73.36% 153.917us 51.306us 3 + aten::to 0.73% 5.771us 67.07% 531.500us 88.583us 0.000us 0.00% 69.181us 11.530us 6 + aten::_to_copy 2.74% 21.730us 66.34% 525.729us 87.622us 0.000us 0.00% 69.181us 11.530us 6 + aten::copy_ 5.82% 46.089us 59.65% 472.729us 78.788us 55.902us 26.64% 69.181us 11.530us 6 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 32.766us 15.62% 32.766us 10.922us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 23.136us 11.03% 23.136us 7.712us 3 + Activity Buffer Request 31.36% 248.555us 31.36% 248.555us 248.555us 13.279us 6.33% 13.279us 13.279us 1 + aten::empty_strided 3.95% 31.270us 3.95% 31.270us 5.212us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 25.22% 199.867us 25.22% 199.867us 22.207us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 1.96% 15.509us 2.61% 20.650us 2.294us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 1.11% 8.780us 1.11% 8.780us 0.585us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 1.26% 9.960us 1.26% 9.960us 3.320us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 1.19% 9.409us 1.19% 9.409us 3.136us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.67% 5.290us 0.83% 6.600us 2.200us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 797.480us -Self CUDA time total: 208.986us +Self CPU time total: 792.487us +Self CUDA time total: 209.819us @@ -4703,29 +4703,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S2048_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 6.78% 123.653us 52.87% 964.884us 964.884us 0.000us 0.00% 1.509ms 1.509ms 1 - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.412ms 100.39% 1.412ms 1.412ms 1 - aten::to 0.34% 6.170us 38.05% 694.387us 115.731us 0.000us 0.00% 816.698us 136.116us 6 - aten::_to_copy 1.57% 28.621us 37.71% 688.217us 114.703us 0.000us 0.00% 816.698us 136.116us 6 - aten::copy_ 2.79% 50.981us 25.74% 469.702us 78.284us 713.755us 50.75% 816.698us 136.116us 6 - aten::conv1d 0.31% 5.748us 6.52% 119.002us 39.667us 0.000us 0.00% 692.571us 230.857us 3 - aten::convolution 0.58% 10.581us 6.21% 113.254us 37.751us 0.000us 0.00% 692.571us 230.857us 3 - aten::_convolution 1.23% 22.532us 5.63% 102.673us 34.224us 0.000us 0.00% 692.571us 230.857us 3 - aten::_conv_depthwise2d 1.21% 22.101us 3.46% 63.221us 21.074us 692.571us 49.25% 692.571us 230.857us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 692.571us 49.25% 692.571us 230.857us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 406.878us 28.93% 406.878us 135.626us 3 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 306.877us 21.82% 306.877us 102.292us 3 - Activity Buffer Request 13.13% 239.566us 13.13% 239.566us 239.566us 102.943us 7.32% 102.943us 102.943us 1 - aten::empty_strided 2.01% 36.730us 10.40% 189.894us 31.649us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 11.02% 201.035us 11.02% 201.035us 22.337us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 0.99% 18.011us 1.28% 23.402us 2.600us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.50% 9.042us 0.50% 9.042us 0.603us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.52% 9.410us 0.52% 9.410us 3.137us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.54% 9.830us 0.54% 9.830us 3.277us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.39% 7.200us 0.47% 8.510us 2.837us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 6.64% 122.391us 53.17% 980.021us 980.021us 0.000us 0.00% 1.512ms 1.512ms 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.413ms 100.43% 1.413ms 1.413ms 1 + aten::to 0.33% 6.111us 38.61% 711.666us 118.611us 0.000us 0.00% 818.734us 136.456us 6 + aten::_to_copy 1.46% 26.859us 38.28% 705.555us 117.592us 0.000us 0.00% 818.734us 136.456us 6 + aten::copy_ 2.64% 48.673us 26.26% 483.993us 80.665us 713.617us 50.73% 818.734us 136.456us 6 + aten::conv1d 0.38% 6.930us 6.48% 119.403us 39.801us 0.000us 0.00% 692.947us 230.982us 3 + aten::convolution 0.60% 11.081us 6.10% 112.473us 37.491us 0.000us 0.00% 692.947us 230.982us 3 + aten::_convolution 1.18% 21.812us 5.50% 101.392us 33.797us 0.000us 0.00% 692.947us 230.982us 3 + aten::_conv_depthwise2d 1.17% 21.580us 3.49% 64.271us 21.424us 692.947us 49.27% 692.947us 230.982us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 692.947us 49.27% 692.947us 230.982us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 406.808us 28.92% 406.808us 135.603us 3 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 306.809us 21.81% 306.809us 102.270us 3 + Activity Buffer Request 13.93% 256.826us 13.93% 256.826us 256.826us 105.117us 7.47% 105.117us 105.117us 1 + aten::empty_strided 2.09% 38.520us 10.56% 194.703us 32.450us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 10.93% 201.405us 10.93% 201.405us 22.378us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.92% 16.870us 1.21% 22.379us 2.487us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.50% 9.219us 0.50% 9.219us 0.615us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.55% 10.080us 0.55% 10.080us 3.360us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.53% 9.700us 0.53% 9.700us 3.233us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.29% 5.350us 0.36% 6.580us 2.193us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.825ms -Self CUDA time total: 1.406ms +Self CPU time total: 1.843ms +Self CUDA time total: 1.407ms @@ -4735,29 +4735,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S2048_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 3.67% 125.416us 69.04% 2.359ms 2.359ms 0.000us 0.00% 1.503ms 1.503ms 1 - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.434ms 100.40% 1.434ms 1.434ms 1 - aten::to 0.17% 5.940us 61.15% 2.089ms 348.142us 0.000us 0.00% 760.702us 126.784us 6 - aten::_to_copy 0.72% 24.663us 60.97% 2.083ms 347.152us 0.000us 0.00% 760.702us 126.784us 6 - aten::copy_ 1.35% 46.170us 59.36% 2.028ms 337.971us 686.110us 48.03% 760.702us 126.784us 6 - aten::conv1d 0.17% 5.960us 3.43% 117.002us 39.001us 0.000us 0.00% 742.490us 247.497us 3 - aten::convolution 0.28% 9.489us 3.25% 111.042us 37.014us 0.000us 0.00% 742.490us 247.497us 3 - aten::_convolution 0.69% 23.709us 2.97% 101.553us 33.851us 0.000us 0.00% 742.490us 247.497us 3 - aten::_conv_depthwise2d 0.61% 20.921us 1.79% 61.223us 20.408us 742.490us 51.97% 742.490us 247.497us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 742.490us 51.97% 742.490us 247.497us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 400.959us 28.07% 400.959us 133.653us 3 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 285.151us 19.96% 285.151us 95.050us 3 - Activity Buffer Request 52.88% 1.806ms 52.88% 1.806ms 1.806ms 74.592us 5.22% 74.592us 74.592us 1 - aten::empty_strided 0.89% 30.420us 0.89% 30.420us 5.070us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 5.75% 196.514us 5.75% 196.514us 21.835us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 0.52% 17.690us 0.68% 23.179us 2.575us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.27% 9.290us 0.27% 9.290us 0.619us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.28% 9.521us 0.28% 9.521us 3.174us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.28% 9.690us 0.28% 9.690us 3.230us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.19% 6.410us 0.23% 7.991us 2.664us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 3.65% 121.382us 68.13% 2.266ms 2.266ms 0.000us 0.00% 1.497ms 1.497ms 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.428ms 100.41% 1.428ms 1.428ms 1 + aten::to 0.17% 5.810us 60.23% 2.004ms 333.937us 0.000us 0.00% 757.239us 126.207us 6 + aten::_to_copy 0.66% 21.893us 60.05% 1.998ms 332.969us 0.000us 0.00% 757.239us 126.207us 6 + aten::copy_ 1.36% 45.129us 58.52% 1.947ms 324.453us 682.520us 47.99% 757.239us 126.207us 6 + aten::conv1d 0.16% 5.420us 3.47% 115.432us 38.477us 0.000us 0.00% 739.832us 246.611us 3 + aten::convolution 0.28% 9.180us 3.31% 110.012us 36.671us 0.000us 0.00% 739.832us 246.611us 3 + aten::_convolution 0.63% 21.009us 3.03% 100.832us 33.611us 0.000us 0.00% 739.832us 246.611us 3 + aten::_conv_depthwise2d 0.66% 21.951us 1.95% 65.032us 21.677us 739.832us 52.01% 739.832us 246.611us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 739.832us 52.01% 739.832us 246.611us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 397.628us 27.96% 397.628us 132.543us 3 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 284.892us 20.03% 284.892us 94.964us 3 + Activity Buffer Request 51.86% 1.725ms 51.86% 1.725ms 1.725ms 74.719us 5.25% 74.719us 74.719us 1 + aten::empty_strided 0.88% 29.200us 0.88% 29.200us 4.867us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 6.01% 199.804us 6.01% 199.804us 22.200us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.50% 16.619us 0.66% 21.940us 2.438us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.27% 8.971us 0.27% 8.971us 0.598us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.30% 10.051us 0.30% 10.051us 3.350us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.29% 9.641us 0.29% 9.641us 3.214us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.16% 5.171us 0.19% 6.481us 2.160us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 3.416ms -Self CUDA time total: 1.429ms +Self CPU time total: 3.327ms +Self CUDA time total: 1.422ms impl wl p50(ms) ok @@ -4775,7 +4775,7 @@ torch_eager cuda_B2_D64_S512_W2 0.08 True torch_eager cuda_B2_D64_S512_W4 0.08 True torch_eager cuda_B4_D2048_S128_W2 0.08 True torch_eager cuda_B4_D2048_S128_W4 0.08 True -torch_eager cuda_B4_D2048_S2048_W2 0.49 True +torch_eager cuda_B4_D2048_S2048_W2 0.48 True torch_eager cuda_B4_D2048_S2048_W4 0.50 True torch_eager cuda_B4_D2048_S512_W2 0.09 True torch_eager cuda_B4_D2048_S512_W4 0.10 True @@ -4789,7 +4789,53 @@ torch_eager cuda_B4_D64_S512_W4 0.08 True▶ UV Install Logs