diff --git "a/causal_conv1d/impls/torch_causal_conv1d.html" "b/causal_conv1d/impls/torch_causal_conv1d.html" --- "a/causal_conv1d/impls/torch_causal_conv1d.html" +++ "b/causal_conv1d/impls/torch_causal_conv1d.html" @@ -809,6 +809,14 @@ .artifact-preview svg { background: transparent; } + /* Invert SVG images in dark mode */ + :root[data-theme="dark"] .artifact-preview img[src$=".svg"] { + filter: invert(0.9) hue-rotate(180deg); + } + /* Keep SVG images readable in monocolor mode */ + :root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] { + filter: none; + } /* CSV table styling */ .artifact-csv { margin-top: 1rem; @@ -3871,7 +3879,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: ▼ output ▶ uv-logs | -Cell: nv | 0.24s +Cell: nv | 0.21s | Raw @@ -3887,7 +3895,7 @@ Cell: nv | 0.24s
Wed Oct 29 14:27:09 2025 +-Wed Oct 29 15:50:16 2025 +-----------------------------------------------------------------------------------------+ | NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 | |-----------------------------------------+------------------------+----------------------+ @@ -3896,7 +3904,7 @@ Cell: nv | 0.24s | | | MIG M. | |=========================================+========================+======================| | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 | -| N/A 33C P0 109W / 350W | 0MiB / 46068MiB | 100% Default | +| N/A 29C P0 78W / 350W | 0MiB / 46068MiB | 18% Default | | | | N/A | +-----------------------------------------+------------------------+----------------------+ @@ -3918,9 +3926,9 @@ Cell: nv | 0.24s ▼ code ▼ output - ▶ uv-logs + ▶ uv-logs | -Cell: benchmark | 7.23s +Cell: benchmark | 3.67s | Raw @@ -3982,29 +3990,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S128_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 465.824us 2410.10% 465.824us 465.824us 1 - torch_eager 10.38% 221.098us 99.69% 2.123ms 2.123ms 0.000us 0.00% 21.632us 21.632us 1 - aten::to 0.54% 11.460us 78.80% 1.678ms 279.633us 0.000us 0.00% 14.304us 2.384us 6 - aten::_to_copy 2.14% 45.672us 78.26% 1.666ms 277.723us 0.000us 0.00% 14.304us 2.384us 6 - aten::copy_ 2.97% 63.201us 73.51% 1.565ms 260.883us 12.000us 62.09% 14.304us 2.384us 6 - aten::conv1d 0.45% 9.560us 8.33% 177.314us 59.105us 0.000us 0.00% 7.328us 2.443us 3 - aten::convolution 0.76% 16.270us 7.88% 167.754us 55.918us 0.000us 0.00% 7.328us 2.443us 3 - aten::_convolution 1.63% 34.781us 7.11% 151.484us 50.495us 0.000us 0.00% 7.328us 2.443us 3 - aten::_conv_depthwise2d 2.18% 46.460us 4.51% 96.001us 32.000us 7.328us 37.91% 7.328us 2.443us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.328us 37.91% 7.328us 2.443us 3 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.272us 32.45% 6.272us 2.091us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.728us 29.64% 5.728us 1.909us 3 - Activity Buffer Request 67.39% 1.435ms 67.39% 1.435ms 1.435ms 2.304us 11.92% 2.304us 2.304us 1 - aten::empty_strided 2.60% 55.371us 2.60% 55.371us 9.228us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 4.37% 93.031us 4.37% 93.031us 10.337us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 1.44% 30.589us 1.81% 38.620us 4.291us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.63% 13.371us 0.63% 13.371us 0.891us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.55% 11.811us 0.55% 11.811us 3.937us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.56% 11.940us 0.56% 11.940us 3.980us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.37% 7.972us 0.46% 9.712us 3.237us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 430.557us 2231.44% 430.557us 430.557us 1 + torch_eager 10.26% 219.304us 99.68% 2.131ms 2.131ms 0.000us 0.00% 21.630us 21.630us 1 + aten::to 0.52% 11.162us 80.31% 1.717ms 286.188us 0.000us 0.00% 14.269us 2.378us 6 + aten::_to_copy 1.64% 35.099us 79.78% 1.706ms 284.328us 0.000us 0.00% 14.269us 2.378us 6 + aten::copy_ 2.90% 61.950us 75.45% 1.613ms 268.888us 11.934us 61.85% 14.269us 2.378us 6 + aten::conv1d 0.34% 7.309us 7.15% 152.793us 50.931us 0.000us 0.00% 7.361us 2.454us 3 + aten::convolution 0.74% 15.802us 6.80% 145.484us 48.495us 0.000us 0.00% 7.361us 2.454us 3 + aten::_convolution 1.46% 31.300us 6.06% 129.682us 43.227us 0.000us 0.00% 7.361us 2.454us 3 + aten::_conv_depthwise2d 1.58% 33.771us 3.83% 81.842us 27.281us 7.361us 38.15% 7.361us 2.454us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.361us 38.15% 7.361us 2.454us 3 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.271us 32.50% 6.271us 2.090us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.663us 29.35% 5.663us 1.888us 3 + Activity Buffer Request 69.43% 1.484ms 69.43% 1.484ms 1.484ms 2.335us 12.10% 2.335us 2.335us 1 + aten::empty_strided 2.69% 57.542us 2.69% 57.542us 9.590us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 4.29% 91.753us 4.29% 91.753us 10.195us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 1.26% 26.879us 1.60% 34.300us 3.811us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.54% 11.581us 0.54% 11.581us 0.772us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.53% 11.370us 0.53% 11.370us 3.790us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.55% 11.861us 0.55% 11.861us 3.954us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.32% 6.790us 0.37% 8.000us 2.667us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.129ms -Self CUDA time total: 19.328us +Self CPU time total: 2.138ms +Self CUDA time total: 19.295us @@ -4014,29 +4022,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S128_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 332.863us 1691.38% 332.863us 332.863us 1 - torch_eager 6.60% 126.115us 99.71% 1.906ms 1.906ms 0.000us 0.00% 21.792us 21.792us 1 - aten::to 0.31% 5.930us 85.54% 1.635ms 272.467us 0.000us 0.00% 13.760us 2.293us 6 - aten::_to_copy 1.30% 24.791us 85.23% 1.629ms 271.478us 0.000us 0.00% 13.760us 2.293us 6 - aten::copy_ 2.71% 51.809us 82.30% 1.573ms 262.158us 11.648us 59.19% 13.760us 2.293us 6 - aten::conv1d 0.31% 5.929us 6.17% 117.852us 39.284us 0.000us 0.00% 8.032us 2.677us 3 - aten::convolution 0.53% 10.111us 5.86% 111.923us 37.308us 0.000us 0.00% 8.032us 2.677us 3 - aten::_convolution 1.20% 22.951us 5.33% 101.812us 33.937us 0.000us 0.00% 8.032us 2.677us 3 - aten::_conv_depthwise2d 1.20% 22.860us 3.35% 64.021us 21.340us 8.032us 40.81% 8.032us 2.677us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 8.032us 40.81% 8.032us 2.677us 3 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.080us 30.89% 6.080us 2.027us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.568us 28.29% 5.568us 1.856us 3 - Activity Buffer Request 77.00% 1.472ms 77.00% 1.472ms 1.472ms 2.112us 10.73% 2.112us 2.112us 1 - aten::empty_strided 1.63% 31.132us 1.63% 31.132us 5.189us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 3.70% 70.762us 3.70% 70.762us 7.862us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 0.87% 16.659us 1.16% 22.190us 2.466us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.46% 8.781us 0.46% 8.781us 0.585us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.55% 10.521us 0.55% 10.521us 3.507us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.49% 9.390us 0.49% 9.390us 3.130us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.29% 5.540us 0.35% 6.670us 2.223us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 335.581us 1716.35% 335.581us 335.581us 1 + torch_eager 7.79% 148.313us 99.72% 1.898ms 1.898ms 0.000us 0.00% 21.664us 21.664us 1 + aten::to 0.37% 7.132us 84.25% 1.604ms 267.308us 0.000us 0.00% 13.760us 2.293us 6 + aten::_to_copy 1.28% 24.280us 83.88% 1.597ms 266.119us 0.000us 0.00% 13.760us 2.293us 6 + aten::copy_ 2.64% 50.321us 81.08% 1.543ms 257.239us 11.648us 59.57% 13.760us 2.293us 6 + aten::conv1d 0.32% 6.130us 6.26% 119.243us 39.748us 0.000us 0.00% 7.904us 2.635us 3 + aten::convolution 0.58% 11.131us 5.94% 113.113us 37.704us 0.000us 0.00% 7.904us 2.635us 3 + aten::_convolution 1.18% 22.459us 5.36% 101.982us 33.994us 0.000us 0.00% 7.904us 2.635us 3 + aten::_conv_depthwise2d 1.08% 20.592us 3.32% 63.132us 21.044us 7.904us 40.43% 7.904us 2.635us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.904us 40.43% 7.904us 2.635us 3 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.080us 31.10% 6.080us 2.027us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.568us 28.48% 5.568us 1.856us 3 + Activity Buffer Request 75.89% 1.445ms 75.89% 1.445ms 1.445ms 2.112us 10.80% 2.112us 2.112us 1 + aten::empty_strided 1.52% 29.001us 1.52% 29.001us 4.834us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 3.69% 70.220us 3.69% 70.220us 7.802us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.92% 17.542us 1.22% 23.251us 2.583us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.48% 9.139us 0.48% 9.139us 0.609us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.58% 11.060us 0.58% 11.060us 3.687us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.51% 9.700us 0.51% 9.700us 3.233us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.31% 5.860us 0.37% 7.020us 2.340us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.911ms -Self CUDA time total: 19.680us +Self CPU time total: 1.904ms +Self CUDA time total: 19.552us @@ -4046,29 +4054,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S512_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 355.199us 1913.89% 355.199us 355.199us 1 - torch_eager 6.67% 125.171us 99.71% 1.872ms 1.872ms 0.000us 0.00% 20.511us 20.511us 1 - aten::to 0.32% 6.091us 84.23% 1.581ms 263.570us 0.000us 0.00% 13.600us 2.267us 6 - aten::_to_copy 1.32% 24.859us 83.90% 1.575ms 262.555us 0.000us 0.00% 13.600us 2.267us 6 - aten::copy_ 2.70% 50.760us 80.88% 1.518ms 253.083us 11.648us 62.76% 13.600us 2.267us 6 - aten::conv1d 0.30% 5.670us 7.37% 138.423us 46.141us 0.000us 0.00% 6.911us 2.304us 3 - aten::convolution 0.52% 9.720us 7.07% 132.753us 44.251us 0.000us 0.00% 6.911us 2.304us 3 - aten::_convolution 1.24% 23.210us 6.55% 123.033us 41.011us 0.000us 0.00% 6.911us 2.304us 3 - aten::_conv_depthwise2d 1.26% 23.712us 4.48% 84.033us 28.011us 6.911us 37.24% 6.911us 2.304us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 6.911us 37.24% 6.911us 2.304us 3 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 5.984us 32.24% 5.984us 1.995us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.664us 30.52% 5.664us 1.888us 3 - Activity Buffer Request 75.59% 1.419ms 75.59% 1.419ms 1.419ms 1.952us 10.52% 1.952us 1.952us 1 - aten::empty_strided 1.70% 31.973us 1.70% 31.973us 5.329us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 3.83% 72.002us 3.83% 72.002us 8.000us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 0.89% 16.661us 1.15% 21.682us 2.409us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.48% 8.941us 0.48% 8.941us 0.596us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 1.49% 28.041us 1.49% 28.041us 9.347us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.47% 8.840us 0.47% 8.840us 2.947us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.32% 5.960us 0.40% 7.470us 2.490us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 343.036us 1844.97% 343.036us 343.036us 1 + torch_eager 7.68% 146.161us 99.72% 1.897ms 1.897ms 0.000us 0.00% 20.481us 20.481us 1 + aten::to 0.37% 6.953us 83.90% 1.596ms 266.066us 0.000us 0.00% 13.536us 2.256us 6 + aten::_to_copy 1.25% 23.842us 83.53% 1.589ms 264.907us 0.000us 0.00% 13.536us 2.256us 6 + aten::copy_ 2.67% 50.789us 80.65% 1.535ms 255.762us 11.648us 62.65% 13.536us 2.256us 6 + aten::conv1d 0.33% 6.290us 6.66% 126.782us 42.261us 0.000us 0.00% 6.945us 2.315us 3 + aten::convolution 0.51% 9.650us 6.33% 120.492us 40.164us 0.000us 0.00% 6.945us 2.315us 3 + aten::_convolution 1.22% 23.120us 5.83% 110.842us 36.947us 0.000us 0.00% 6.945us 2.315us 3 + aten::_conv_depthwise2d 1.44% 27.490us 3.78% 71.832us 23.944us 6.945us 37.35% 6.945us 2.315us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 6.945us 37.35% 6.945us 2.315us 3 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 5.952us 32.01% 5.952us 1.984us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.696us 30.64% 5.696us 1.899us 3 + Activity Buffer Request 75.34% 1.433ms 75.34% 1.433ms 1.433ms 1.888us 10.15% 1.888us 1.888us 1 + aten::empty_strided 1.63% 31.030us 1.63% 31.030us 5.172us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 3.99% 75.911us 3.99% 75.911us 8.435us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.90% 17.191us 1.18% 22.431us 2.492us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.46% 8.831us 0.46% 8.831us 0.589us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.55% 10.511us 0.55% 10.511us 3.504us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.43% 8.261us 0.43% 8.261us 2.754us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.31% 5.930us 0.38% 7.310us 2.437us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.878ms -Self CUDA time total: 18.559us +Self CPU time total: 1.903ms +Self CUDA time total: 18.593us @@ -4078,29 +4086,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S512_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 335.841us 1714.87% 335.841us 335.841us 1 - torch_eager 6.09% 125.084us 99.75% 2.047ms 2.047ms 0.000us 0.00% 21.728us 21.728us 1 - aten::to 0.29% 6.012us 86.59% 1.777ms 296.210us 0.000us 0.00% 14.049us 2.341us 6 - aten::_to_copy 1.18% 24.318us 86.30% 1.771ms 295.209us 0.000us 0.00% 14.049us 2.341us 6 - aten::copy_ 2.44% 50.170us 83.64% 1.717ms 286.105us 11.905us 60.79% 14.049us 2.341us 6 - aten::conv1d 0.29% 5.981us 5.73% 117.633us 39.211us 0.000us 0.00% 7.679us 2.560us 3 - aten::convolution 0.48% 9.909us 5.44% 111.652us 37.217us 0.000us 0.00% 7.679us 2.560us 3 - aten::_convolution 1.11% 22.712us 4.96% 101.743us 33.914us 0.000us 0.00% 7.679us 2.560us 3 - aten::_conv_depthwise2d 1.08% 22.231us 3.11% 63.781us 21.260us 7.679us 39.21% 7.679us 2.560us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.679us 39.21% 7.679us 2.560us 3 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.176us 31.54% 6.176us 2.059us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.729us 29.25% 5.729us 1.910us 3 - Activity Buffer Request 70.17% 1.440ms 70.17% 1.440ms 1.440ms 2.144us 10.95% 2.144us 2.144us 1 - aten::empty_strided 1.48% 30.301us 1.48% 30.301us 5.050us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 12.02% 246.676us 12.02% 246.676us 27.408us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 0.85% 17.450us 1.12% 22.930us 2.548us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.44% 8.940us 0.44% 8.940us 0.596us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.47% 9.630us 0.47% 9.630us 3.210us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.56% 11.490us 0.56% 11.490us 3.830us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.28% 5.710us 0.34% 6.930us 2.310us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 329.535us 1679.93% 329.535us 329.535us 1 + torch_eager 6.84% 137.444us 99.71% 2.005ms 2.005ms 0.000us 0.00% 21.760us 21.760us 1 + aten::to 0.33% 6.580us 85.73% 1.724ms 287.253us 0.000us 0.00% 14.048us 2.341us 6 + aten::_to_copy 1.15% 23.069us 85.41% 1.717ms 286.156us 0.000us 0.00% 14.048us 2.341us 6 + aten::copy_ 2.51% 50.362us 82.79% 1.664ms 277.390us 11.904us 60.69% 14.048us 2.341us 6 + aten::conv1d 0.28% 5.589us 5.76% 115.862us 38.621us 0.000us 0.00% 7.712us 2.571us 3 + aten::convolution 0.52% 10.381us 5.49% 110.273us 36.758us 0.000us 0.00% 7.712us 2.571us 3 + aten::_convolution 1.13% 22.651us 4.97% 99.892us 33.297us 0.000us 0.00% 7.712us 2.571us 3 + aten::_conv_depthwise2d 1.01% 20.392us 3.08% 61.981us 20.660us 7.712us 39.31% 7.712us 2.571us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.712us 39.31% 7.712us 2.571us 3 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.176us 31.48% 6.176us 2.059us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.728us 29.20% 5.728us 1.909us 3 + Activity Buffer Request 70.54% 1.418ms 70.54% 1.418ms 1.418ms 2.144us 10.93% 2.144us 2.144us 1 + aten::empty_strided 1.47% 29.531us 1.47% 29.531us 4.922us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 10.85% 218.024us 10.85% 218.024us 24.225us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.87% 17.580us 1.13% 22.650us 2.517us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.42% 8.370us 0.42% 8.370us 0.558us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.48% 9.699us 0.48% 9.699us 3.233us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.49% 9.760us 0.49% 9.760us 3.253us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.28% 5.670us 0.34% 6.790us 2.263us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.053ms -Self CUDA time total: 19.584us +Self CPU time total: 2.010ms +Self CUDA time total: 19.616us @@ -4110,29 +4118,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S2048_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 329.565us 1339.31% 329.565us 329.565us 1 - torch_eager 6.13% 122.184us 99.75% 1.990ms 1.990ms 0.000us 0.00% 26.911us 26.911us 1 - aten::to 0.30% 5.979us 86.40% 1.724ms 287.259us 0.000us 0.00% 15.359us 2.560us 6 - aten::_to_copy 1.37% 27.300us 86.10% 1.718ms 286.262us 0.000us 0.00% 15.359us 2.560us 6 - aten::copy_ 2.45% 48.801us 83.22% 1.660ms 276.655us 13.055us 53.05% 15.359us 2.560us 6 - aten::conv1d 0.29% 5.841us 5.86% 116.932us 38.977us 0.000us 0.00% 11.552us 3.851us 3 - aten::convolution 0.50% 9.929us 5.57% 111.091us 37.030us 0.000us 0.00% 11.552us 3.851us 3 - aten::_convolution 1.16% 23.192us 5.07% 101.162us 33.721us 0.000us 0.00% 11.552us 3.851us 3 - aten::_conv_depthwise2d 1.12% 22.341us 3.11% 62.030us 20.677us 11.552us 46.95% 11.552us 3.851us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 11.552us 46.95% 11.552us 3.851us 3 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.688us 27.18% 6.688us 2.229us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.367us 25.87% 6.367us 2.122us 3 - Activity Buffer Request 71.71% 1.430ms 71.71% 1.430ms 1.430ms 2.304us 9.36% 2.304us 2.304us 1 - aten::empty_strided 1.52% 30.342us 1.52% 30.342us 5.057us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 10.06% 200.744us 10.06% 200.744us 22.305us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 0.86% 17.251us 1.14% 22.681us 2.520us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.45% 9.051us 0.45% 9.051us 0.603us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.48% 9.579us 0.48% 9.579us 3.193us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.50% 10.050us 0.50% 10.050us 3.350us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.30% 6.019us 0.36% 7.270us 2.423us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 344.351us 1401.22% 344.351us 344.351us 1 + torch_eager 7.42% 151.244us 99.76% 2.034ms 2.034ms 0.000us 0.00% 26.847us 26.847us 1 + aten::to 0.33% 6.730us 85.23% 1.738ms 289.583us 0.000us 0.00% 15.264us 2.544us 6 + aten::_to_copy 1.15% 23.491us 84.90% 1.731ms 288.462us 0.000us 0.00% 15.264us 2.544us 6 + aten::copy_ 2.84% 57.871us 82.24% 1.677ms 279.428us 12.992us 52.87% 15.264us 2.544us 6 + aten::conv1d 0.31% 6.410us 5.76% 117.443us 39.148us 0.000us 0.00% 11.583us 3.861us 3 + aten::convolution 0.49% 10.031us 5.45% 111.033us 37.011us 0.000us 0.00% 11.583us 3.861us 3 + aten::_convolution 1.08% 22.081us 4.95% 101.002us 33.667us 0.000us 0.00% 11.583us 3.861us 3 + aten::_conv_depthwise2d 1.04% 21.239us 3.10% 63.201us 21.067us 11.583us 47.13% 11.583us 3.861us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 11.583us 47.13% 11.583us 3.861us 3 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.656us 27.08% 6.656us 2.219us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.336us 25.78% 6.336us 2.112us 3 + Activity Buffer Request 70.08% 1.429ms 70.08% 1.429ms 1.429ms 2.272us 9.25% 2.272us 2.272us 1 + aten::empty_strided 1.51% 30.710us 1.51% 30.710us 5.118us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 10.42% 212.467us 10.42% 212.467us 23.607us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.89% 18.130us 1.15% 23.350us 2.594us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.43% 8.790us 0.43% 8.790us 0.586us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.48% 9.800us 0.48% 9.800us 3.267us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.47% 9.640us 0.47% 9.640us 3.213us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.28% 5.760us 0.35% 7.050us 2.350us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.995ms -Self CUDA time total: 24.607us +Self CPU time total: 2.039ms +Self CUDA time total: 24.575us @@ -4142,29 +4150,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S2048_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 358.812us 1379.20% 358.812us 358.812us 1 - torch_eager 6.94% 139.423us 99.75% 2.005ms 2.005ms 0.000us 0.00% 28.256us 28.256us 1 - aten::to 0.33% 6.550us 85.45% 1.717ms 286.205us 0.000us 0.00% 15.199us 2.533us 6 - aten::_to_copy 1.20% 24.182us 85.13% 1.711ms 285.114us 0.000us 0.00% 15.199us 2.533us 6 - aten::copy_ 2.59% 52.130us 82.30% 1.654ms 275.648us 12.959us 49.81% 15.199us 2.533us 6 - aten::conv1d 0.30% 6.120us 5.97% 119.993us 39.998us 0.000us 0.00% 13.057us 4.352us 3 - aten::convolution 0.48% 9.660us 5.67% 113.873us 37.958us 0.000us 0.00% 13.057us 4.352us 3 - aten::_convolution 1.13% 22.802us 5.19% 104.213us 34.738us 0.000us 0.00% 13.057us 4.352us 3 - aten::_conv_depthwise2d 1.09% 21.932us 3.25% 65.242us 21.747us 13.057us 50.19% 13.057us 4.352us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 13.057us 50.19% 13.057us 4.352us 3 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.623us 25.46% 6.623us 2.208us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.336us 24.35% 6.336us 2.112us 3 - Activity Buffer Request 70.68% 1.420ms 70.68% 1.420ms 1.420ms 2.240us 8.61% 2.240us 2.240us 1 - aten::empty_strided 1.62% 32.611us 1.62% 32.611us 5.435us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 10.17% 204.364us 10.17% 204.364us 22.707us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 0.88% 17.647us 1.15% 23.189us 2.577us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.47% 9.382us 0.47% 9.382us 0.625us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.58% 11.651us 0.58% 11.651us 3.884us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.44% 8.769us 0.44% 8.769us 2.923us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.32% 6.420us 0.39% 7.890us 2.630us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 332.990us 1278.37% 332.990us 332.990us 1 + torch_eager 7.09% 142.971us 99.76% 2.011ms 2.011ms 0.000us 0.00% 28.288us 28.288us 1 + aten::to 0.35% 7.062us 85.44% 1.723ms 287.120us 0.000us 0.00% 15.232us 2.539us 6 + aten::_to_copy 1.18% 23.771us 85.09% 1.716ms 285.943us 0.000us 0.00% 15.232us 2.539us 6 + aten::copy_ 2.51% 50.519us 82.47% 1.663ms 277.136us 12.992us 49.88% 15.232us 2.539us 6 + aten::conv1d 0.32% 6.541us 5.84% 117.833us 39.278us 0.000us 0.00% 13.056us 4.352us 3 + aten::convolution 0.52% 10.410us 5.52% 111.292us 37.097us 0.000us 0.00% 13.056us 4.352us 3 + aten::_convolution 1.19% 24.049us 5.00% 100.882us 33.627us 0.000us 0.00% 13.056us 4.352us 3 + aten::_conv_depthwise2d 1.01% 20.460us 2.98% 60.052us 20.017us 13.056us 50.12% 13.056us 4.352us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 13.056us 50.12% 13.056us 4.352us 3 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.624us 25.43% 6.624us 2.208us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.368us 24.45% 6.368us 2.123us 3 + Activity Buffer Request 70.71% 1.426ms 70.71% 1.426ms 1.426ms 2.240us 8.60% 2.240us 2.240us 1 + aten::empty_strided 1.44% 29.071us 1.44% 29.071us 4.845us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 10.31% 207.805us 10.31% 207.805us 23.089us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.90% 18.081us 1.15% 23.201us 2.578us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.43% 8.650us 0.43% 8.650us 0.577us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.49% 9.891us 0.49% 9.891us 3.297us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.42% 8.561us 0.42% 8.561us 2.854us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.31% 6.350us 0.38% 7.610us 2.537us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.010ms -Self CUDA time total: 26.016us +Self CPU time total: 2.016ms +Self CUDA time total: 26.048us @@ -4174,29 +4182,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S128_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 328.896us 853.65% 328.896us 328.896us 1 - torch_eager 6.29% 121.493us 99.73% 1.928ms 1.928ms 0.000us 0.00% 41.088us 41.088us 1 - aten::conv1d 0.31% 5.961us 6.00% 115.903us 38.634us 0.000us 0.00% 22.688us 7.563us 3 - aten::convolution 0.50% 9.600us 5.69% 109.942us 36.647us 0.000us 0.00% 22.688us 7.563us 3 - aten::_convolution 1.16% 22.510us 5.19% 100.342us 33.447us 0.000us 0.00% 22.688us 7.563us 3 - aten::_conv_depthwise2d 1.17% 22.551us 3.25% 62.881us 20.960us 22.688us 58.89% 22.688us 7.563us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 22.688us 58.89% 22.688us 7.563us 3 - aten::to 0.33% 6.421us 86.08% 1.664ms 277.308us 0.000us 0.00% 18.400us 3.067us 6 - aten::_to_copy 1.25% 24.161us 85.75% 1.657ms 276.238us 0.000us 0.00% 18.400us 3.067us 6 - aten::copy_ 2.57% 49.759us 82.93% 1.603ms 267.166us 15.840us 41.11% 18.400us 3.067us 6 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.448us 21.93% 8.448us 2.816us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.392us 19.19% 7.392us 2.464us 3 - Activity Buffer Request 71.07% 1.374ms 71.07% 1.374ms 1.374ms 2.560us 6.64% 2.560us 2.560us 1 - aten::empty_strided 1.57% 30.271us 1.57% 30.271us 5.045us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 10.43% 201.525us 10.43% 201.525us 22.392us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 0.86% 16.701us 1.14% 22.001us 2.445us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.45% 8.751us 0.45% 8.751us 0.583us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.48% 9.290us 0.48% 9.290us 3.097us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.47% 9.060us 0.47% 9.060us 3.020us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.28% 5.459us 0.35% 6.690us 2.230us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 332.220us 868.07% 332.220us 332.220us 1 + torch_eager 7.10% 144.065us 99.76% 2.024ms 2.024ms 0.000us 0.00% 40.831us 40.831us 1 + aten::conv1d 0.30% 6.030us 5.72% 116.102us 38.701us 0.000us 0.00% 22.464us 7.488us 3 + aten::convolution 0.49% 9.861us 5.42% 110.072us 36.691us 0.000us 0.00% 22.464us 7.488us 3 + aten::_convolution 1.11% 22.459us 4.94% 100.211us 33.404us 0.000us 0.00% 22.464us 7.488us 3 + aten::_conv_depthwise2d 1.00% 20.252us 3.07% 62.362us 20.787us 22.464us 58.70% 22.464us 7.488us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 22.464us 58.70% 22.464us 7.488us 3 + aten::to 0.31% 6.271us 85.57% 1.737ms 289.428us 0.000us 0.00% 18.367us 3.061us 6 + aten::_to_copy 1.14% 23.180us 85.26% 1.730ms 288.383us 0.000us 0.00% 18.367us 3.061us 6 + aten::copy_ 2.42% 49.061us 82.56% 1.675ms 279.226us 15.807us 41.30% 18.367us 3.061us 6 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.448us 22.07% 8.448us 2.816us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.359us 19.23% 7.359us 2.453us 3 + Activity Buffer Request 70.88% 1.438ms 70.88% 1.438ms 1.438ms 2.560us 6.69% 2.560us 2.560us 1 + aten::empty_strided 1.57% 31.760us 1.57% 31.760us 5.293us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 10.30% 209.084us 10.30% 209.084us 23.232us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.88% 17.889us 1.13% 22.980us 2.553us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.43% 8.691us 0.43% 8.691us 0.579us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.58% 11.680us 0.58% 11.680us 3.893us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.45% 9.220us 0.45% 9.220us 3.073us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.29% 5.850us 0.36% 7.240us 2.413us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.933ms -Self CUDA time total: 38.528us +Self CPU time total: 2.029ms +Self CUDA time total: 38.271us @@ -4206,29 +4214,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S128_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 334.458us 810.83% 334.458us 334.458us 1 - torch_eager 6.32% 125.394us 99.75% 1.978ms 1.978ms 0.000us 0.00% 43.841us 43.841us 1 - aten::conv1d 0.30% 5.899us 5.88% 116.562us 38.854us 0.000us 0.00% 25.600us 8.533us 3 - aten::convolution 0.49% 9.810us 5.58% 110.663us 36.888us 0.000us 0.00% 25.600us 8.533us 3 - aten::_convolution 1.13% 22.411us 5.09% 100.853us 33.618us 0.000us 0.00% 25.600us 8.533us 3 - aten::_conv_depthwise2d 1.14% 22.520us 3.20% 63.392us 21.131us 25.600us 62.06% 25.600us 8.533us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 25.600us 62.06% 25.600us 8.533us 3 - aten::to 0.30% 5.959us 86.14% 1.708ms 284.675us 0.000us 0.00% 18.241us 3.040us 6 - aten::_to_copy 1.33% 26.372us 85.84% 1.702ms 283.682us 0.000us 0.00% 18.241us 3.040us 6 - aten::copy_ 2.49% 49.420us 83.02% 1.646ms 274.363us 15.649us 37.94% 18.241us 3.040us 6 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.321us 20.17% 8.321us 2.774us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.328us 17.77% 7.328us 2.443us 3 - Activity Buffer Request 71.51% 1.418ms 71.51% 1.418ms 1.418ms 2.592us 6.28% 2.592us 2.592us 1 - aten::empty_strided 1.49% 29.540us 1.49% 29.540us 4.923us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 10.06% 199.427us 10.06% 199.427us 22.159us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 0.92% 18.199us 1.18% 23.330us 2.592us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.44% 8.651us 0.44% 8.651us 0.577us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.54% 10.640us 0.54% 10.640us 3.547us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.48% 9.610us 0.48% 9.610us 3.203us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.28% 5.590us 0.34% 6.770us 2.257us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 347.901us 847.38% 347.901us 347.901us 1 + torch_eager 7.21% 147.111us 99.76% 2.035ms 2.035ms 0.000us 0.00% 43.616us 43.616us 1 + aten::conv1d 0.33% 6.680us 5.94% 121.133us 40.378us 0.000us 0.00% 25.376us 8.459us 3 + aten::convolution 0.49% 10.011us 5.61% 114.453us 38.151us 0.000us 0.00% 25.376us 8.459us 3 + aten::_convolution 1.21% 24.739us 5.12% 104.442us 34.814us 0.000us 0.00% 25.376us 8.459us 3 + aten::_conv_depthwise2d 1.05% 21.431us 3.09% 62.981us 20.994us 25.376us 61.81% 25.376us 8.459us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 25.376us 61.81% 25.376us 8.459us 3 + aten::to 0.35% 7.210us 85.28% 1.740ms 289.922us 0.000us 0.00% 18.240us 3.040us 6 + aten::_to_copy 1.21% 24.639us 84.93% 1.732ms 288.720us 0.000us 0.00% 18.240us 3.040us 6 + aten::copy_ 2.56% 52.303us 82.23% 1.677ms 279.542us 15.680us 38.19% 18.240us 3.040us 6 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.288us 20.19% 8.288us 2.763us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.392us 18.00% 7.392us 2.464us 3 + Activity Buffer Request 70.32% 1.434ms 70.32% 1.434ms 1.434ms 2.560us 6.24% 2.560us 2.560us 1 + aten::empty_strided 1.49% 30.432us 1.49% 30.432us 5.072us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 10.49% 213.884us 10.49% 213.884us 23.765us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.88% 17.871us 1.14% 23.242us 2.582us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.44% 8.942us 0.44% 8.942us 0.596us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.48% 9.840us 0.48% 9.840us 3.280us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.42% 8.540us 0.42% 8.540us 2.847us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.31% 6.230us 0.37% 7.540us 2.513us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.983ms -Self CUDA time total: 41.249us +Self CPU time total: 2.040ms +Self CUDA time total: 41.056us @@ -4238,29 +4246,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S512_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 338.849us 326.92% 338.849us 338.849us 1 - torch_eager 5.95% 117.585us 99.74% 1.970ms 1.970ms 0.000us 0.00% 109.697us 109.697us 1 - aten::conv1d 0.30% 5.970us 6.05% 119.502us 39.834us 0.000us 0.00% 71.232us 23.744us 3 - aten::convolution 0.49% 9.700us 5.75% 113.532us 37.844us 0.000us 0.00% 71.232us 23.744us 3 - aten::_convolution 1.15% 22.781us 5.26% 103.832us 34.611us 0.000us 0.00% 71.232us 23.744us 3 - aten::_conv_depthwise2d 1.18% 23.259us 3.31% 65.420us 21.807us 71.232us 68.72% 71.232us 23.744us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 71.232us 68.72% 71.232us 23.744us 3 - aten::to 0.31% 6.199us 86.38% 1.706ms 284.313us 0.000us 0.00% 38.465us 6.411us 6 - aten::_to_copy 1.31% 25.891us 86.06% 1.700ms 283.280us 0.000us 0.00% 38.465us 6.411us 6 - aten::copy_ 2.57% 50.812us 83.17% 1.643ms 273.758us 32.417us 31.28% 38.465us 6.411us 6 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 17.760us 17.13% 17.760us 5.920us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 14.657us 14.14% 14.657us 4.886us 3 - Activity Buffer Request 71.61% 1.414ms 71.61% 1.414ms 1.414ms 6.048us 5.84% 6.048us 6.048us 1 - aten::empty_strided 1.58% 31.240us 1.58% 31.240us 5.207us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 10.13% 200.155us 10.13% 200.155us 22.239us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 0.87% 17.181us 1.15% 22.621us 2.513us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.45% 8.941us 0.45% 8.941us 0.596us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.51% 10.050us 0.51% 10.050us 3.350us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.47% 9.370us 0.47% 9.370us 3.123us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.28% 5.551us 0.35% 6.851us 2.284us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 334.331us 325.39% 334.331us 334.331us 1 + torch_eager 7.01% 141.713us 99.76% 2.018ms 2.018ms 0.000us 0.00% 108.764us 108.764us 1 + aten::conv1d 0.30% 6.090us 5.77% 116.623us 38.874us 0.000us 0.00% 70.528us 23.509us 3 + aten::convolution 0.56% 11.281us 5.46% 110.533us 36.844us 0.000us 0.00% 70.528us 23.509us 3 + aten::_convolution 1.11% 22.501us 4.91% 99.252us 33.084us 0.000us 0.00% 70.528us 23.509us 3 + aten::_conv_depthwise2d 1.02% 20.538us 3.03% 61.301us 20.434us 70.528us 68.64% 70.528us 23.509us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 70.528us 68.64% 70.528us 23.509us 3 + aten::to 0.31% 6.229us 85.56% 1.731ms 288.457us 0.000us 0.00% 38.236us 6.373us 6 + aten::_to_copy 1.17% 23.650us 85.25% 1.725ms 287.419us 0.000us 0.00% 38.236us 6.373us 6 + aten::copy_ 2.48% 50.230us 82.63% 1.672ms 278.605us 32.221us 31.36% 38.236us 6.373us 6 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 17.598us 17.13% 17.598us 5.866us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 14.623us 14.23% 14.623us 4.874us 3 + Activity Buffer Request 70.91% 1.435ms 70.91% 1.435ms 1.435ms 6.015us 5.85% 6.015us 6.015us 1 + aten::empty_strided 1.45% 29.232us 1.45% 29.232us 4.872us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 10.36% 209.517us 10.36% 209.517us 23.280us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.88% 17.770us 1.13% 22.940us 2.549us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.42% 8.560us 0.42% 8.560us 0.571us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.48% 9.771us 0.48% 9.771us 3.257us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.41% 8.351us 0.41% 8.351us 2.784us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.28% 5.590us 0.33% 6.770us 2.257us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.975ms -Self CUDA time total: 103.649us +Self CPU time total: 2.023ms +Self CUDA time total: 102.749us @@ -4270,29 +4278,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S512_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 357.597us 314.53% 357.597us 357.597us 1 - torch_eager 6.01% 120.196us 99.73% 1.995ms 1.995ms 0.000us 0.00% 119.645us 119.645us 1 - aten::conv1d 0.28% 5.578us 6.85% 137.112us 45.704us 0.000us 0.00% 81.344us 27.115us 3 - aten::convolution 0.47% 9.452us 6.58% 131.534us 43.845us 0.000us 0.00% 81.344us 27.115us 3 - aten::_convolution 1.16% 23.298us 6.10% 122.082us 40.694us 0.000us 0.00% 81.344us 27.115us 3 - aten::_conv_depthwise2d 1.16% 23.221us 4.15% 82.932us 27.644us 81.344us 71.55% 81.344us 27.115us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 81.344us 71.55% 81.344us 27.115us 3 - aten::to 0.33% 6.509us 85.46% 1.710ms 284.935us 0.000us 0.00% 38.301us 6.383us 6 - aten::_to_copy 1.29% 25.870us 85.14% 1.703ms 283.850us 0.000us 0.00% 38.301us 6.383us 6 - aten::copy_ 2.58% 51.531us 82.27% 1.646ms 274.308us 32.350us 28.45% 38.301us 6.383us 6 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 17.727us 15.59% 17.727us 5.909us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 14.623us 12.86% 14.623us 4.874us 3 - Activity Buffer Request 70.95% 1.419ms 70.95% 1.419ms 1.419ms 5.951us 5.23% 5.951us 5.951us 1 - aten::empty_strided 1.57% 31.380us 1.57% 31.380us 5.230us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 9.95% 199.044us 9.95% 199.044us 22.116us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 0.89% 17.740us 1.16% 23.191us 2.577us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.47% 9.433us 0.47% 9.433us 0.629us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.53% 10.531us 0.53% 10.531us 3.510us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 1.26% 25.130us 1.26% 25.130us 8.377us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.30% 6.010us 0.38% 7.612us 2.537us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 330.969us 293.42% 330.969us 330.969us 1 + torch_eager 14.99% 119.634us 99.39% 793.059us 793.059us 0.000us 0.00% 118.814us 118.814us 1 + aten::conv1d 0.68% 5.459us 14.66% 116.982us 38.994us 0.000us 0.00% 80.510us 26.837us 3 + aten::convolution 1.26% 10.041us 13.98% 111.523us 37.174us 0.000us 0.00% 80.510us 26.837us 3 + aten::_convolution 2.84% 22.661us 12.72% 101.482us 33.827us 0.000us 0.00% 80.510us 26.837us 3 + aten::_conv_depthwise2d 2.60% 20.719us 7.95% 63.401us 21.134us 80.510us 71.38% 80.510us 26.837us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 80.510us 71.38% 80.510us 26.837us 3 + aten::to 0.74% 5.920us 66.51% 530.742us 88.457us 0.000us 0.00% 38.304us 6.384us 6 + aten::_to_copy 2.94% 23.422us 65.77% 524.822us 87.470us 0.000us 0.00% 38.304us 6.384us 6 + aten::copy_ 6.43% 51.340us 58.99% 470.681us 78.447us 32.288us 28.62% 38.304us 6.384us 6 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 17.696us 15.69% 17.696us 5.899us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 14.592us 12.94% 14.592us 4.864us 3 + Activity Buffer Request 29.02% 231.576us 29.02% 231.576us 231.576us 6.016us 5.33% 6.016us 6.016us 1 + aten::empty_strided 3.85% 30.719us 3.85% 30.719us 5.120us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 26.56% 211.935us 26.56% 211.935us 23.548us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 2.12% 16.940us 2.72% 21.720us 2.413us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 1.02% 8.121us 1.02% 8.121us 0.541us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 1.20% 9.582us 1.20% 9.582us 3.194us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 1.12% 8.930us 1.12% 8.930us 2.977us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.72% 5.780us 0.87% 6.970us 2.323us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.000ms -Self CUDA time total: 113.694us +Self CPU time total: 797.960us +Self CUDA time total: 112.798us @@ -4302,29 +4310,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S2048_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 5.97% 120.782us 97.66% 1.975ms 1.975ms 0.000us 0.00% 434.301us 434.301us 1 - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 421.021us 106.85% 421.021us 421.021us 1 - aten::conv1d 0.30% 6.069us 5.79% 117.202us 39.067us 0.000us 0.00% 251.007us 83.669us 3 - aten::convolution 0.47% 9.471us 5.49% 111.133us 37.044us 0.000us 0.00% 251.007us 83.669us 3 - aten::_convolution 1.10% 22.180us 5.03% 101.662us 33.887us 0.000us 0.00% 251.007us 83.669us 3 - aten::_conv_depthwise2d 1.13% 22.779us 3.17% 64.182us 21.394us 251.007us 63.71% 251.007us 83.669us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 251.007us 63.71% 251.007us 83.669us 3 - aten::to 0.31% 6.200us 84.52% 1.710ms 284.917us 0.000us 0.00% 183.294us 30.549us 6 - aten::_to_copy 1.19% 24.072us 84.22% 1.703ms 283.884us 0.000us 0.00% 183.294us 30.549us 6 - aten::copy_ 2.45% 49.593us 81.56% 1.650ms 274.942us 143.007us 36.29% 183.294us 30.549us 6 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 102.495us 26.01% 102.495us 34.165us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.512us 10.28% 40.512us 13.504us 3 - Activity Buffer Request 70.36% 1.423ms 70.36% 1.423ms 1.423ms 40.287us 10.22% 40.287us 40.287us 1 - aten::empty_strided 1.46% 29.579us 1.46% 29.579us 4.930us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 9.86% 199.474us 9.86% 199.474us 22.164us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 0.84% 17.021us 1.11% 22.432us 2.492us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.45% 9.090us 0.45% 9.090us 0.606us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.48% 9.720us 0.48% 9.720us 3.240us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.45% 9.202us 0.45% 9.202us 3.067us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.28% 5.680us 0.35% 7.060us 2.353us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 13.79% 117.896us 93.85% 802.069us 802.069us 0.000us 0.00% 432.858us 432.858us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 419.356us 106.55% 419.356us 419.356us 1 + aten::conv1d 0.66% 5.648us 13.24% 113.161us 37.720us 0.000us 0.00% 251.262us 83.754us 3 + aten::convolution 1.11% 9.481us 12.58% 107.513us 35.838us 0.000us 0.00% 251.262us 83.754us 3 + aten::_convolution 2.53% 21.627us 11.47% 98.032us 32.677us 0.000us 0.00% 251.262us 83.754us 3 + aten::_conv_depthwise2d 2.35% 20.121us 7.14% 61.002us 20.334us 251.262us 63.84% 251.262us 83.754us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 251.262us 63.84% 251.262us 83.754us 3 + aten::to 0.66% 5.670us 63.66% 544.101us 90.683us 0.000us 0.00% 181.596us 30.266us 6 + aten::_to_copy 2.68% 22.880us 63.00% 538.431us 89.739us 0.000us 0.00% 181.596us 30.266us 6 + aten::copy_ 6.04% 51.591us 56.88% 486.161us 81.027us 142.333us 36.16% 181.596us 30.266us 6 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 102.271us 25.98% 102.271us 34.090us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.062us 10.18% 40.062us 13.354us 3 + Activity Buffer Request 28.73% 245.556us 28.73% 245.556us 245.556us 39.263us 9.98% 39.263us 39.263us 1 + aten::empty_strided 3.44% 29.390us 3.44% 29.390us 4.898us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 24.77% 211.714us 24.77% 211.714us 23.524us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 1.99% 17.042us 2.56% 21.904us 2.434us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 1.01% 8.621us 1.01% 8.621us 0.575us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 1.11% 9.471us 1.11% 9.471us 3.157us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 1.02% 8.710us 1.02% 8.710us 2.903us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.71% 6.031us 0.84% 7.190us 2.397us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.023ms -Self CUDA time total: 394.014us +Self CPU time total: 854.650us +Self CUDA time total: 393.595us @@ -4334,29 +4342,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S2048_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 5.89% 122.072us 95.29% 1.975ms 1.975ms 0.000us 0.00% 486.458us 486.458us 1 - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 474.010us 106.16% 474.010us 474.010us 1 - aten::conv1d 0.28% 5.830us 5.59% 115.853us 38.618us 0.000us 0.00% 299.291us 99.764us 3 - aten::convolution 0.46% 9.610us 5.31% 110.023us 36.674us 0.000us 0.00% 299.291us 99.764us 3 - aten::_convolution 1.08% 22.439us 4.85% 100.413us 33.471us 0.000us 0.00% 299.291us 99.764us 3 - aten::_conv_depthwise2d 1.04% 21.490us 3.04% 62.983us 20.994us 299.291us 67.03% 299.291us 99.764us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 299.291us 67.03% 299.291us 99.764us 3 - aten::to 0.31% 6.341us 82.51% 1.710ms 284.962us 0.000us 0.00% 187.167us 31.195us 6 - aten::_to_copy 1.23% 25.592us 82.20% 1.703ms 283.906us 0.000us 0.00% 187.167us 31.195us 6 - aten::copy_ 2.39% 49.481us 79.48% 1.647ms 274.512us 147.199us 32.97% 187.167us 31.195us 6 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 106.911us 23.94% 106.911us 35.637us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.288us 9.02% 40.288us 13.429us 3 - Activity Buffer Request 68.62% 1.422ms 68.62% 1.422ms 1.422ms 39.968us 8.95% 39.968us 39.968us 1 - aten::empty_strided 1.48% 30.770us 1.48% 30.770us 5.128us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 9.53% 197.485us 9.53% 197.485us 21.943us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 0.81% 16.791us 1.08% 22.301us 2.478us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.44% 9.141us 0.44% 9.141us 0.609us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.47% 9.701us 0.47% 9.701us 3.234us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.48% 9.941us 0.48% 9.941us 3.314us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.27% 5.510us 0.33% 6.790us 2.263us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 12.73% 119.312us 88.90% 833.220us 833.220us 0.000us 0.00% 487.606us 487.606us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 476.503us 106.43% 476.503us 476.503us 1 + aten::conv1d 0.59% 5.550us 12.38% 116.073us 38.691us 0.000us 0.00% 298.682us 99.561us 3 + aten::convolution 1.01% 9.430us 11.79% 110.523us 36.841us 0.000us 0.00% 298.682us 99.561us 3 + aten::_convolution 2.32% 21.781us 10.79% 101.093us 33.698us 0.000us 0.00% 298.682us 99.561us 3 + aten::_conv_depthwise2d 2.19% 20.491us 6.88% 64.493us 21.498us 298.682us 66.71% 298.682us 99.561us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 298.682us 66.71% 298.682us 99.561us 3 + aten::to 0.60% 5.580us 60.86% 570.404us 95.067us 0.000us 0.00% 188.924us 31.487us 6 + aten::_to_copy 2.42% 22.662us 60.26% 564.824us 94.137us 0.000us 0.00% 188.924us 31.487us 6 + aten::copy_ 5.33% 49.982us 54.62% 511.981us 85.330us 149.053us 33.29% 188.924us 31.487us 6 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 108.926us 24.33% 108.926us 36.309us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.127us 8.96% 40.127us 13.376us 3 + Activity Buffer Request 29.44% 275.977us 29.44% 275.977us 275.977us 39.871us 8.91% 39.871us 39.871us 1 + aten::empty_strided 3.22% 30.181us 3.22% 30.181us 5.030us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 22.44% 210.343us 22.44% 210.343us 23.371us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 1.80% 16.910us 2.35% 22.009us 2.445us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.91% 8.519us 0.91% 8.519us 0.568us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 1.16% 10.891us 1.16% 10.891us 3.630us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.94% 8.790us 0.94% 8.790us 2.930us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.59% 5.500us 0.71% 6.690us 2.230us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.072ms -Self CUDA time total: 446.490us +Self CPU time total: 937.282us +Self CUDA time total: 447.735us @@ -4366,29 +4374,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S128_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 358.523us 1924.96% 358.523us 358.523us 1 - torch_eager 17.94% 139.773us 99.33% 774.049us 774.049us 0.000us 0.00% 20.513us 20.513us 1 - aten::to 0.94% 7.351us 62.88% 489.983us 81.664us 0.000us 0.00% 13.376us 2.229us 6 - aten::_to_copy 3.20% 24.930us 61.93% 482.632us 80.439us 0.000us 0.00% 13.376us 2.229us 6 - aten::copy_ 6.90% 53.742us 54.52% 424.881us 70.813us 11.488us 61.68% 13.376us 2.229us 6 - aten::conv1d 0.75% 5.841us 15.01% 116.973us 38.991us 0.000us 0.00% 7.137us 2.379us 3 - aten::convolution 1.33% 10.360us 14.26% 111.132us 37.044us 0.000us 0.00% 7.137us 2.379us 3 - aten::_convolution 3.01% 23.430us 12.93% 100.772us 33.591us 0.000us 0.00% 7.137us 2.379us 3 - aten::_conv_depthwise2d 2.81% 21.882us 7.98% 62.192us 20.731us 7.137us 38.32% 7.137us 2.379us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.137us 38.32% 7.137us 2.379us 3 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 5.888us 31.61% 5.888us 1.963us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.600us 30.07% 5.600us 1.867us 3 - Activity Buffer Request 24.98% 194.695us 24.98% 194.695us 194.695us 1.888us 10.14% 1.888us 1.888us 1 - aten::empty_strided 4.21% 32.821us 4.21% 32.821us 5.470us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 25.28% 197.004us 25.28% 197.004us 21.889us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 2.16% 16.850us 2.84% 22.160us 2.462us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 1.13% 8.821us 1.13% 8.821us 0.588us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 1.22% 9.521us 1.22% 9.521us 3.174us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 1.31% 10.229us 1.31% 10.229us 3.410us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.74% 5.740us 0.90% 7.020us 2.340us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 323.326us 1733.09% 323.326us 323.326us 1 + torch_eager 14.06% 116.944us 99.42% 826.859us 826.859us 0.000us 0.00% 20.544us 20.544us 1 + aten::to 0.72% 5.971us 68.57% 570.283us 95.047us 0.000us 0.00% 13.344us 2.224us 6 + aten::_to_copy 2.68% 22.330us 67.85% 564.312us 94.052us 0.000us 0.00% 13.344us 2.224us 6 + aten::copy_ 6.25% 51.969us 61.73% 513.371us 85.562us 11.456us 61.41% 13.344us 2.224us 6 + aten::conv1d 0.66% 5.530us 13.54% 112.622us 37.541us 0.000us 0.00% 7.200us 2.400us 3 + aten::convolution 1.25% 10.420us 12.88% 107.092us 35.697us 0.000us 0.00% 7.200us 2.400us 3 + aten::_convolution 2.52% 20.950us 11.62% 96.672us 32.224us 0.000us 0.00% 7.200us 2.400us 3 + aten::_conv_depthwise2d 2.43% 20.241us 7.30% 60.692us 20.231us 7.200us 38.59% 7.200us 2.400us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.200us 38.59% 7.200us 2.400us 3 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 5.888us 31.56% 5.888us 1.963us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.568us 29.85% 5.568us 1.856us 3 + Activity Buffer Request 31.20% 259.516us 31.20% 259.516us 259.516us 1.888us 10.12% 1.888us 1.888us 1 + aten::empty_strided 3.44% 28.611us 3.44% 28.611us 4.768us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 26.78% 222.677us 26.78% 222.677us 24.742us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 2.11% 17.509us 2.72% 22.620us 2.513us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 1.03% 8.541us 1.03% 8.541us 0.569us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 1.40% 11.610us 1.40% 11.610us 3.870us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.97% 8.050us 0.97% 8.050us 2.683us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.68% 5.660us 0.82% 6.830us 2.277us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 779.258us -Self CUDA time total: 18.625us +Self CPU time total: 831.660us +Self CUDA time total: 18.656us @@ -4398,29 +4406,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S128_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 328.763us 1698.07% 328.763us 328.763us 1 - torch_eager 14.65% 115.015us 99.34% 779.670us 779.670us 0.000us 0.00% 21.248us 21.248us 1 - aten::to 0.80% 6.290us 66.21% 519.631us 86.605us 0.000us 0.00% 13.406us 2.234us 6 - aten::_to_copy 3.14% 24.649us 65.41% 513.341us 85.557us 0.000us 0.00% 13.406us 2.234us 6 - aten::copy_ 6.80% 53.351us 58.20% 456.761us 76.127us 11.519us 59.50% 13.406us 2.234us 6 - aten::conv1d 0.75% 5.880us 15.10% 118.484us 39.495us 0.000us 0.00% 7.842us 2.614us 3 - aten::convolution 1.21% 9.513us 14.35% 112.604us 37.535us 0.000us 0.00% 7.842us 2.614us 3 - aten::_convolution 2.83% 22.229us 13.14% 103.091us 34.364us 0.000us 0.00% 7.842us 2.614us 3 - aten::_conv_depthwise2d 3.15% 24.720us 8.43% 66.141us 22.047us 7.842us 40.50% 7.842us 2.614us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.842us 40.50% 7.842us 2.614us 3 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 5.887us 30.41% 5.887us 1.962us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.632us 29.09% 5.632us 1.877us 3 - Activity Buffer Request 29.55% 231.946us 29.55% 231.946us 231.946us 1.887us 9.75% 1.887us 1.887us 1 - aten::empty_strided 4.07% 31.931us 4.07% 31.931us 5.322us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 24.68% 193.684us 24.68% 193.684us 21.520us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 2.11% 16.541us 2.75% 21.581us 2.398us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 1.09% 8.568us 1.09% 8.568us 0.571us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 1.27% 9.951us 1.27% 9.951us 3.317us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 1.18% 9.250us 1.18% 9.250us 3.083us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.72% 5.642us 0.89% 6.980us 2.327us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 351.835us 1817.33% 351.835us 351.835us 1 + torch_eager 14.08% 121.071us 99.43% 854.999us 854.999us 0.000us 0.00% 21.248us 21.248us 1 + aten::to 0.71% 6.141us 68.62% 590.084us 98.347us 0.000us 0.00% 13.312us 2.219us 6 + aten::_to_copy 2.73% 23.503us 67.91% 583.943us 97.324us 0.000us 0.00% 13.312us 2.219us 6 + aten::copy_ 6.25% 53.711us 59.45% 511.250us 85.208us 11.424us 59.01% 13.312us 2.219us 6 + aten::conv1d 0.65% 5.630us 13.53% 116.322us 38.774us 0.000us 0.00% 7.936us 2.645us 3 + aten::convolution 1.12% 9.630us 12.87% 110.692us 36.897us 0.000us 0.00% 7.936us 2.645us 3 + aten::_convolution 2.70% 23.181us 11.75% 101.062us 33.687us 0.000us 0.00% 7.936us 2.645us 3 + aten::_conv_depthwise2d 2.42% 20.779us 7.31% 62.821us 20.940us 7.936us 40.99% 7.936us 2.645us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.936us 40.99% 7.936us 2.645us 3 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 5.856us 30.25% 5.856us 1.952us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.568us 28.76% 5.568us 1.856us 3 + Activity Buffer Request 31.74% 272.946us 31.74% 272.946us 272.946us 1.888us 9.75% 1.888us 1.888us 1 + aten::empty_strided 5.72% 49.190us 5.72% 49.190us 8.198us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 24.15% 207.684us 24.15% 207.684us 23.076us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 2.01% 17.302us 2.65% 22.752us 2.528us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 1.04% 8.971us 1.04% 8.971us 0.598us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 1.16% 9.970us 1.16% 9.970us 3.323us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 1.04% 8.981us 1.04% 8.981us 2.994us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.67% 5.729us 0.81% 6.930us 2.310us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 784.850us -Self CUDA time total: 19.361us +Self CPU time total: 859.928us +Self CUDA time total: 19.360us @@ -4430,29 +4438,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S512_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 330.454us 1698.73% 330.454us 330.454us 1 - torch_eager 14.50% 115.185us 99.38% 789.290us 789.290us 0.000us 0.00% 21.628us 21.628us 1 - aten::to 0.75% 5.979us 66.62% 529.132us 88.189us 0.000us 0.00% 14.332us 2.389us 6 - aten::_to_copy 3.11% 24.732us 65.87% 523.153us 87.192us 0.000us 0.00% 14.332us 2.389us 6 - aten::copy_ 6.75% 53.590us 58.69% 466.101us 77.684us 12.157us 62.49% 14.332us 2.389us 6 - aten::conv1d 0.72% 5.740us 14.75% 117.122us 39.041us 0.000us 0.00% 7.296us 2.432us 3 - aten::convolution 1.18% 9.359us 14.02% 111.382us 37.127us 0.000us 0.00% 7.296us 2.432us 3 - aten::_convolution 2.82% 22.362us 12.85% 102.023us 34.008us 0.000us 0.00% 7.296us 2.432us 3 - aten::_conv_depthwise2d 2.86% 22.741us 8.10% 64.351us 21.450us 7.296us 37.51% 7.296us 2.432us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.296us 37.51% 7.296us 2.432us 3 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.238us 32.07% 6.238us 2.079us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.919us 30.43% 5.919us 1.973us 3 - Activity Buffer Request 30.19% 239.746us 30.19% 239.746us 239.746us 2.175us 11.18% 2.175us 2.175us 1 - aten::empty_strided 4.07% 32.320us 4.07% 32.320us 5.387us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 24.58% 195.235us 24.58% 195.235us 21.693us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 2.10% 16.713us 2.76% 21.891us 2.432us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 1.12% 8.919us 1.12% 8.919us 0.595us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 1.20% 9.570us 1.20% 9.570us 3.190us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 1.20% 9.570us 1.20% 9.570us 3.190us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.72% 5.709us 0.89% 7.030us 2.343us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 336.603us 1730.16% 336.603us 336.603us 1 + torch_eager 7.08% 144.052us 99.72% 2.028ms 2.028ms 0.000us 0.00% 21.631us 21.631us 1 + aten::to 0.35% 7.181us 85.48% 1.738ms 289.712us 0.000us 0.00% 14.367us 2.394us 6 + aten::_to_copy 1.19% 24.130us 85.13% 1.731ms 288.515us 0.000us 0.00% 14.367us 2.394us 6 + aten::copy_ 2.47% 50.222us 82.49% 1.678ms 279.593us 12.191us 62.66% 14.367us 2.394us 6 + aten::conv1d 0.31% 6.211us 5.80% 117.993us 39.331us 0.000us 0.00% 7.264us 2.421us 3 + aten::convolution 0.49% 9.910us 5.50% 111.782us 37.261us 0.000us 0.00% 7.264us 2.421us 3 + aten::_convolution 1.11% 22.590us 5.01% 101.872us 33.957us 0.000us 0.00% 7.264us 2.421us 3 + aten::_conv_depthwise2d 1.01% 20.531us 3.08% 62.662us 20.887us 7.264us 37.34% 7.264us 2.421us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.264us 37.34% 7.264us 2.421us 3 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.272us 32.24% 6.272us 2.091us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.919us 30.42% 5.919us 1.973us 3 + Activity Buffer Request 70.94% 1.443ms 70.94% 1.443ms 1.443ms 2.176us 11.18% 2.176us 2.176us 1 + aten::empty_strided 1.45% 29.401us 1.45% 29.401us 4.900us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 10.12% 205.814us 10.12% 205.814us 22.868us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.93% 19.000us 1.20% 24.310us 2.701us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.43% 8.650us 0.43% 8.650us 0.577us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.52% 10.541us 0.52% 10.541us 3.514us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.51% 10.450us 0.51% 10.450us 3.483us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.30% 6.000us 0.36% 7.230us 2.410us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 794.200us -Self CUDA time total: 19.453us +Self CPU time total: 2.034ms +Self CUDA time total: 19.455us @@ -4462,29 +4470,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S512_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 325.021us 1622.51% 325.021us 325.021us 1 - torch_eager 14.95% 114.725us 99.33% 762.279us 762.279us 0.000us 0.00% 22.176us 22.176us 1 - aten::to 0.78% 5.949us 65.87% 505.530us 84.255us 0.000us 0.00% 14.272us 2.379us 6 - aten::_to_copy 3.19% 24.509us 65.10% 499.581us 83.264us 0.000us 0.00% 14.272us 2.379us 6 - aten::copy_ 6.59% 50.599us 57.97% 444.890us 74.148us 12.128us 60.54% 14.272us 2.379us 6 - aten::conv1d 0.79% 6.100us 15.11% 115.973us 38.658us 0.000us 0.00% 7.904us 2.635us 3 - aten::convolution 1.34% 10.290us 14.32% 109.873us 36.624us 0.000us 0.00% 7.904us 2.635us 3 - aten::_convolution 2.97% 22.812us 12.98% 99.583us 33.194us 0.000us 0.00% 7.904us 2.635us 3 - aten::_conv_depthwise2d 2.93% 22.501us 8.10% 62.182us 20.727us 7.904us 39.46% 7.904us 2.635us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.904us 39.46% 7.904us 2.635us 3 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.208us 30.99% 6.208us 2.069us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.920us 29.55% 5.920us 1.973us 3 - Activity Buffer Request 28.71% 220.306us 28.71% 220.306us 220.306us 2.144us 10.70% 2.144us 2.144us 1 - aten::empty_strided 3.93% 30.182us 3.93% 30.182us 5.030us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 25.32% 194.286us 25.32% 194.286us 21.587us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 2.11% 16.159us 2.76% 21.209us 2.357us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 1.09% 8.360us 1.09% 8.360us 0.557us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 1.23% 9.450us 1.23% 9.450us 3.150us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 1.29% 9.930us 1.29% 9.930us 3.310us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.71% 5.470us 0.87% 6.670us 2.223us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 330.813us 1648.79% 330.813us 330.813us 1 + torch_eager 15.21% 119.271us 99.33% 778.918us 778.918us 0.000us 0.00% 22.208us 22.208us 1 + aten::to 0.72% 5.611us 65.71% 515.312us 85.885us 0.000us 0.00% 14.305us 2.384us 6 + aten::_to_copy 2.87% 22.510us 65.00% 509.701us 84.950us 0.000us 0.00% 14.305us 2.384us 6 + aten::copy_ 6.38% 50.021us 58.03% 455.090us 75.848us 12.161us 60.61% 14.305us 2.384us 6 + aten::conv1d 0.69% 5.380us 15.11% 118.473us 39.491us 0.000us 0.00% 7.903us 2.634us 3 + aten::convolution 1.31% 10.292us 14.42% 113.093us 37.698us 0.000us 0.00% 7.903us 2.634us 3 + aten::_convolution 2.98% 23.360us 13.11% 102.801us 34.267us 0.000us 0.00% 7.903us 2.634us 3 + aten::_conv_depthwise2d 2.80% 21.952us 8.17% 64.041us 21.347us 7.903us 39.39% 7.903us 2.634us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.903us 39.39% 7.903us 2.634us 3 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.240us 31.10% 6.240us 2.080us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.921us 29.51% 5.921us 1.974us 3 + Activity Buffer Request 27.43% 215.065us 27.43% 215.065us 215.065us 2.144us 10.69% 2.144us 2.144us 1 + aten::empty_strided 4.09% 32.101us 4.09% 32.101us 5.350us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 27.10% 212.544us 27.10% 212.544us 23.616us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 2.14% 16.752us 2.74% 21.481us 2.387us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 1.03% 8.081us 1.03% 8.081us 0.539us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 1.34% 10.539us 1.34% 10.539us 3.513us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 1.15% 9.010us 1.15% 9.010us 3.003us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.74% 5.829us 0.90% 7.070us 2.357us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 767.429us -Self CUDA time total: 20.032us +Self CPU time total: 784.179us +Self CUDA time total: 20.064us @@ -4494,29 +4502,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S2048_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 356.764us 983.15% 356.764us 356.764us 1 - torch_eager 15.53% 123.844us 99.36% 792.350us 792.350us 0.000us 0.00% 38.944us 38.944us 1 - aten::conv1d 0.79% 6.320us 15.33% 122.233us 40.744us 0.000us 0.00% 20.320us 6.773us 3 - aten::convolution 1.24% 9.851us 14.54% 115.913us 38.638us 0.000us 0.00% 20.320us 6.773us 3 - aten::_convolution 2.89% 23.052us 13.30% 106.062us 35.354us 0.000us 0.00% 20.320us 6.773us 3 - aten::_conv_depthwise2d 2.97% 23.692us 8.39% 66.891us 22.297us 20.320us 56.00% 20.320us 6.773us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 20.320us 56.00% 20.320us 6.773us 3 - aten::to 0.80% 6.349us 64.76% 516.391us 86.065us 0.000us 0.00% 18.624us 3.104us 6 - aten::_to_copy 3.21% 25.572us 63.96% 510.042us 85.007us 0.000us 0.00% 18.624us 3.104us 6 - aten::copy_ 6.54% 52.120us 56.52% 450.739us 75.123us 15.968us 44.00% 18.624us 3.104us 6 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.607us 23.72% 8.607us 2.869us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.361us 20.28% 7.361us 2.454us 3 - Activity Buffer Request 27.46% 218.966us 27.46% 218.966us 218.966us 2.656us 7.32% 2.656us 2.656us 1 - aten::empty_strided 4.23% 33.731us 4.23% 33.731us 5.622us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 25.38% 202.413us 25.38% 202.413us 22.490us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 2.20% 17.520us 2.88% 22.939us 2.549us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 1.21% 9.679us 1.21% 9.679us 0.645us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 1.40% 11.140us 1.40% 11.140us 3.713us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 1.17% 9.299us 1.17% 9.299us 3.100us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.75% 6.010us 0.93% 7.450us 2.483us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 332.020us 920.74% 332.020us 332.020us 1 + torch_eager 14.90% 117.475us 99.36% 783.438us 783.438us 0.000us 0.00% 38.651us 38.651us 1 + aten::conv1d 0.68% 5.380us 14.48% 114.172us 38.057us 0.000us 0.00% 20.190us 6.730us 3 + aten::convolution 1.18% 9.340us 13.80% 108.792us 36.264us 0.000us 0.00% 20.190us 6.730us 3 + aten::_convolution 2.79% 21.980us 12.61% 99.452us 33.151us 0.000us 0.00% 20.190us 6.730us 3 + aten::_conv_depthwise2d 2.62% 20.631us 7.92% 62.452us 20.817us 20.190us 55.99% 20.190us 6.730us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 20.190us 55.99% 20.190us 6.730us 3 + aten::to 0.79% 6.191us 66.58% 524.962us 87.494us 0.000us 0.00% 18.461us 3.077us 6 + aten::_to_copy 2.94% 23.190us 65.79% 518.771us 86.462us 0.000us 0.00% 18.461us 3.077us 6 + aten::copy_ 6.46% 50.920us 58.97% 464.950us 77.492us 15.870us 44.01% 18.461us 3.077us 6 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.479us 23.51% 8.479us 2.826us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.391us 20.50% 7.391us 2.464us 3 + Activity Buffer Request 28.77% 226.875us 28.77% 226.875us 226.875us 2.591us 7.19% 2.591us 2.591us 1 + aten::empty_strided 3.88% 30.631us 3.88% 30.631us 5.105us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 26.51% 209.015us 26.51% 209.015us 23.224us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 2.16% 17.000us 2.79% 22.010us 2.446us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 1.07% 8.441us 1.07% 8.441us 0.563us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 1.46% 11.521us 1.46% 11.521us 3.840us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 1.07% 8.440us 1.07% 8.440us 2.813us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.73% 5.720us 0.87% 6.850us 2.283us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 797.430us -Self CUDA time total: 36.288us +Self CPU time total: 788.468us +Self CUDA time total: 36.060us @@ -4526,29 +4534,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S2048_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 332.353us 866.25% 332.353us 332.353us 1 - torch_eager 6.20% 124.083us 99.73% 1.997ms 1.997ms 0.000us 0.00% 40.959us 40.959us 1 - aten::conv1d 0.30% 6.071us 5.74% 115.013us 38.338us 0.000us 0.00% 22.592us 7.531us 3 - aten::convolution 0.48% 9.660us 5.44% 108.942us 36.314us 0.000us 0.00% 22.592us 7.531us 3 - aten::_convolution 1.09% 21.840us 4.96% 99.282us 33.094us 0.000us 0.00% 22.592us 7.531us 3 - aten::_conv_depthwise2d 1.15% 22.991us 3.11% 62.342us 20.781us 22.592us 58.88% 22.592us 7.531us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 22.592us 58.88% 22.592us 7.531us 3 - aten::to 0.32% 6.339us 86.44% 1.731ms 288.505us 0.000us 0.00% 18.367us 3.061us 6 - aten::_to_copy 1.25% 24.980us 86.12% 1.725ms 287.449us 0.000us 0.00% 18.367us 3.061us 6 - aten::copy_ 2.51% 50.252us 83.36% 1.669ms 278.222us 15.775us 41.12% 18.367us 3.061us 6 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.416us 21.94% 8.416us 2.805us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.359us 19.18% 7.359us 2.453us 3 - Activity Buffer Request 72.13% 1.445ms 72.13% 1.445ms 1.445ms 2.592us 6.76% 2.592us 2.592us 1 - aten::empty_strided 1.52% 30.382us 1.52% 30.382us 5.064us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 9.74% 194.985us 9.74% 194.985us 21.665us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 0.87% 17.330us 1.13% 22.630us 2.514us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.45% 8.941us 0.45% 8.941us 0.596us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.48% 9.610us 0.48% 9.610us 3.203us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.46% 9.250us 0.46% 9.250us 3.083us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.27% 5.490us 0.34% 6.780us 2.260us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 323.480us 850.14% 323.480us 323.480us 1 + torch_eager 14.55% 115.293us 99.34% 787.399us 787.399us 0.000us 0.00% 40.643us 40.643us 1 + aten::conv1d 0.68% 5.400us 14.40% 114.153us 38.051us 0.000us 0.00% 22.336us 7.445us 3 + aten::convolution 1.16% 9.210us 13.72% 108.753us 36.251us 0.000us 0.00% 22.336us 7.445us 3 + aten::_convolution 2.80% 22.227us 12.56% 99.543us 33.181us 0.000us 0.00% 22.336us 7.445us 3 + aten::_conv_depthwise2d 2.52% 20.003us 7.87% 62.343us 20.781us 22.336us 58.70% 22.336us 7.445us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 22.336us 58.70% 22.336us 7.445us 3 + aten::to 0.94% 7.450us 67.15% 532.253us 88.709us 0.000us 0.00% 18.307us 3.051us 6 + aten::_to_copy 2.90% 22.999us 66.21% 524.803us 87.467us 0.000us 0.00% 18.307us 3.051us 6 + aten::copy_ 6.35% 50.294us 59.67% 472.953us 78.825us 15.714us 41.30% 18.307us 3.051us 6 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.321us 21.87% 8.321us 2.774us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.393us 19.43% 7.393us 2.464us 3 + Activity Buffer Request 30.44% 241.286us 30.44% 241.286us 241.286us 2.593us 6.81% 2.593us 2.593us 1 + aten::empty_strided 3.64% 28.851us 3.64% 28.851us 4.808us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 25.80% 204.463us 25.80% 204.463us 22.718us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 2.08% 16.472us 2.71% 21.512us 2.390us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 1.07% 8.500us 1.07% 8.500us 0.567us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 1.34% 10.600us 1.34% 10.600us 3.533us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 1.09% 8.650us 1.09% 8.650us 2.883us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.71% 5.641us 0.87% 6.891us 2.297us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.003ms -Self CUDA time total: 38.367us +Self CPU time total: 792.609us +Self CUDA time total: 38.050us @@ -4558,29 +4566,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S128_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 328.952us 509.17% 328.952us 328.952us 1 - torch_eager 15.31% 114.903us 99.32% 745.599us 745.599us 0.000us 0.00% 68.701us 68.701us 1 - aten::conv1d 0.89% 6.660us 15.50% 116.373us 38.791us 0.000us 0.00% 42.238us 14.079us 3 - aten::convolution 1.33% 9.952us 14.61% 109.713us 36.571us 0.000us 0.00% 42.238us 14.079us 3 - aten::_convolution 2.95% 22.149us 13.29% 99.761us 33.254us 0.000us 0.00% 42.238us 14.079us 3 - aten::_conv_depthwise2d 2.94% 22.090us 8.38% 62.891us 20.964us 42.238us 65.38% 42.238us 14.079us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 42.238us 65.38% 42.238us 14.079us 3 - aten::to 0.80% 6.039us 65.05% 488.341us 81.390us 0.000us 0.00% 26.463us 4.410us 6 - aten::_to_copy 3.23% 24.281us 64.25% 482.302us 80.384us 0.000us 0.00% 26.463us 4.410us 6 - aten::copy_ 6.57% 49.302us 56.69% 425.561us 70.927us 22.367us 34.62% 26.463us 4.410us 6 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 11.936us 18.48% 11.936us 3.979us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.431us 16.15% 10.431us 3.477us 3 - Activity Buffer Request 26.58% 199.565us 26.58% 199.565us 199.565us 4.096us 6.34% 4.096us 4.096us 1 - aten::empty_strided 4.32% 32.460us 4.32% 32.460us 5.410us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 26.45% 198.565us 26.45% 198.565us 22.063us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 2.13% 16.001us 2.81% 21.091us 2.343us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 1.16% 8.690us 1.16% 8.690us 0.579us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 1.26% 9.490us 1.26% 9.490us 3.163us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 1.26% 9.440us 1.26% 9.440us 3.147us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.75% 5.611us 0.93% 6.981us 2.327us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 336.347us 525.30% 336.347us 336.347us 1 + torch_eager 15.07% 122.512us 99.37% 807.929us 807.929us 0.000us 0.00% 68.125us 68.125us 1 + aten::conv1d 0.67% 5.471us 14.06% 114.283us 38.094us 0.000us 0.00% 41.663us 13.888us 3 + aten::convolution 1.12% 9.100us 13.38% 108.812us 36.271us 0.000us 0.00% 41.663us 13.888us 3 + aten::_convolution 2.67% 21.730us 12.26% 99.712us 33.237us 0.000us 0.00% 41.663us 13.888us 3 + aten::_conv_depthwise2d 2.50% 20.351us 7.64% 62.102us 20.701us 41.663us 65.07% 41.663us 13.888us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 41.663us 65.07% 41.663us 13.888us 3 + aten::to 0.85% 6.891us 66.79% 543.023us 90.504us 0.000us 0.00% 26.462us 4.410us 6 + aten::_to_copy 2.85% 23.179us 65.94% 536.132us 89.355us 0.000us 0.00% 26.462us 4.410us 6 + aten::copy_ 6.23% 50.622us 59.33% 482.372us 80.395us 22.367us 34.93% 26.462us 4.410us 6 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 11.935us 18.64% 11.935us 3.978us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.432us 16.29% 10.432us 3.477us 3 + Activity Buffer Request 30.75% 250.036us 30.75% 250.036us 250.036us 4.095us 6.40% 4.095us 4.095us 1 + aten::empty_strided 3.76% 30.581us 3.76% 30.581us 5.097us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 25.31% 205.766us 25.31% 205.766us 22.863us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 2.22% 18.070us 2.95% 23.970us 2.663us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 1.13% 9.220us 1.13% 9.220us 0.615us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 1.19% 9.690us 1.19% 9.690us 3.230us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.99% 8.009us 0.99% 8.009us 2.670us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.69% 5.650us 0.84% 6.800us 2.267us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 750.709us -Self CUDA time total: 64.605us +Self CPU time total: 813.049us +Self CUDA time total: 64.030us @@ -4590,29 +4598,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S128_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 328.798us 467.68% 328.798us 328.798us 1 - torch_eager 14.69% 115.264us 99.37% 779.669us 779.669us 0.000us 0.00% 74.432us 74.432us 1 - aten::conv1d 0.75% 5.869us 14.89% 116.853us 38.951us 0.000us 0.00% 47.840us 15.947us 3 - aten::convolution 1.20% 9.412us 14.15% 110.984us 36.995us 0.000us 0.00% 47.840us 15.947us 3 - aten::_convolution 2.99% 23.451us 12.95% 101.572us 33.857us 0.000us 0.00% 47.840us 15.947us 3 - aten::_conv_depthwise2d 2.71% 21.281us 8.10% 63.532us 21.177us 47.840us 68.05% 47.840us 15.947us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 47.840us 68.05% 47.840us 15.947us 3 - aten::to 0.74% 5.828us 66.46% 521.411us 86.902us 0.000us 0.00% 26.592us 4.432us 6 - aten::_to_copy 3.27% 25.622us 65.71% 515.583us 85.931us 0.000us 0.00% 26.592us 4.432us 6 - aten::copy_ 6.42% 50.382us 58.46% 458.651us 76.442us 22.464us 31.95% 26.592us 4.432us 6 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 12.032us 17.11% 12.032us 4.011us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.432us 14.84% 10.432us 3.477us 3 - Activity Buffer Request 29.93% 234.846us 29.93% 234.846us 234.846us 4.128us 5.87% 4.128us 4.128us 1 - aten::empty_strided 3.99% 31.310us 3.99% 31.310us 5.218us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 24.83% 194.803us 24.83% 194.803us 21.645us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 2.07% 16.243us 2.72% 21.332us 2.370us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 1.07% 8.401us 1.07% 8.401us 0.560us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 1.35% 10.581us 1.35% 10.581us 3.527us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 1.31% 10.290us 1.31% 10.290us 3.430us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.69% 5.406us 0.84% 6.568us 2.189us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 325.852us 467.34% 325.852us 325.852us 1 + torch_eager 14.81% 118.946us 99.39% 798.399us 798.399us 0.000us 0.00% 73.789us 73.789us 1 + aten::conv1d 0.70% 5.610us 14.26% 114.513us 38.171us 0.000us 0.00% 47.294us 15.765us 3 + aten::convolution 1.17% 9.382us 13.56% 108.903us 36.301us 0.000us 0.00% 47.294us 15.765us 3 + aten::_convolution 2.75% 22.119us 12.39% 99.521us 33.174us 0.000us 0.00% 47.294us 15.765us 3 + aten::_conv_depthwise2d 2.53% 20.361us 7.64% 61.351us 20.450us 47.294us 67.83% 47.294us 15.765us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 47.294us 67.83% 47.294us 15.765us 3 + aten::to 0.79% 6.379us 67.07% 538.781us 89.797us 0.000us 0.00% 26.495us 4.416us 6 + aten::_to_copy 2.79% 22.401us 66.28% 532.402us 88.734us 0.000us 0.00% 26.495us 4.416us 6 + aten::copy_ 6.32% 50.749us 59.88% 480.970us 80.162us 22.431us 32.17% 26.495us 4.416us 6 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 12.031us 17.25% 12.031us 4.010us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.400us 14.92% 10.400us 3.467us 3 + Activity Buffer Request 31.04% 249.326us 31.04% 249.326us 249.326us 4.064us 5.83% 4.064us 4.064us 1 + aten::empty_strided 3.61% 29.031us 3.61% 29.031us 4.839us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 25.24% 202.725us 25.24% 202.725us 22.525us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 2.08% 16.713us 2.70% 21.680us 2.409us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 1.07% 8.568us 1.07% 8.568us 0.571us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 1.32% 10.569us 1.32% 10.569us 3.523us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 1.07% 8.591us 1.07% 8.591us 2.864us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.82% 6.609us 1.00% 8.010us 2.670us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 784.589us -Self CUDA time total: 70.304us +Self CPU time total: 803.260us +Self CUDA time total: 69.725us @@ -4622,29 +4630,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S512_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 341.882us 182.91% 341.882us 341.882us 1 - torch_eager 15.14% 117.185us 99.33% 768.879us 768.879us 0.000us 0.00% 197.117us 197.117us 1 - aten::conv1d 0.79% 6.110us 14.86% 114.993us 38.331us 0.000us 0.00% 134.270us 44.757us 3 - aten::convolution 1.22% 9.451us 14.07% 108.883us 36.294us 0.000us 0.00% 134.270us 44.757us 3 - aten::_convolution 2.87% 22.240us 12.85% 99.432us 33.144us 0.000us 0.00% 134.270us 44.757us 3 - aten::_conv_depthwise2d 2.84% 21.991us 8.04% 62.222us 20.741us 134.270us 71.84% 134.270us 44.757us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 134.270us 71.84% 134.270us 44.757us 3 - aten::to 0.77% 5.950us 65.77% 509.102us 84.850us 0.000us 0.00% 62.847us 10.474us 6 - aten::_to_copy 3.29% 25.489us 65.00% 503.152us 83.859us 0.000us 0.00% 62.847us 10.474us 6 - aten::copy_ 6.45% 49.889us 57.58% 445.721us 74.287us 52.639us 28.16% 62.847us 10.474us 6 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 29.728us 15.91% 29.728us 9.909us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.911us 12.26% 22.911us 7.637us 3 - Activity Buffer Request 28.61% 221.416us 28.61% 221.416us 221.416us 10.208us 5.46% 10.208us 10.208us 1 - aten::empty_strided 4.13% 31.942us 4.13% 31.942us 5.324us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 25.24% 195.386us 25.24% 195.386us 21.710us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 2.14% 16.602us 2.90% 22.460us 2.496us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 1.19% 9.247us 1.19% 9.247us 0.616us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 1.23% 9.500us 1.23% 9.500us 3.167us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 1.26% 9.761us 1.26% 9.761us 3.254us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.71% 5.470us 0.87% 6.700us 2.233us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 346.241us 186.20% 346.241us 346.241us 1 + torch_eager 7.14% 146.093us 99.75% 2.041ms 2.041ms 0.000us 0.00% 196.000us 196.000us 1 + aten::conv1d 0.31% 6.251us 5.79% 118.533us 39.511us 0.000us 0.00% 133.248us 44.416us 3 + aten::convolution 0.51% 10.359us 5.49% 112.282us 37.427us 0.000us 0.00% 133.248us 44.416us 3 + aten::_convolution 1.19% 24.280us 4.98% 101.923us 33.974us 0.000us 0.00% 133.248us 44.416us 3 + aten::_conv_depthwise2d 1.04% 21.191us 3.02% 61.762us 20.587us 133.248us 71.66% 133.248us 44.416us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 133.248us 71.66% 133.248us 44.416us 3 + aten::to 0.32% 6.489us 85.48% 1.749ms 291.443us 0.000us 0.00% 62.752us 10.459us 6 + aten::_to_copy 1.16% 23.832us 85.16% 1.742ms 290.362us 0.000us 0.00% 62.752us 10.459us 6 + aten::copy_ 2.53% 51.751us 82.59% 1.689ms 281.575us 52.704us 28.34% 62.752us 10.459us 6 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 29.760us 16.00% 29.760us 9.920us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.944us 12.34% 22.944us 7.648us 3 + Activity Buffer Request 71.04% 1.453ms 71.04% 1.453ms 1.453ms 10.048us 5.40% 10.048us 10.048us 1 + aten::empty_strided 1.41% 28.891us 1.41% 28.891us 4.815us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 10.04% 205.324us 10.04% 205.324us 22.814us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.86% 17.550us 1.11% 22.661us 2.518us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.42% 8.641us 0.42% 8.641us 0.576us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.53% 10.750us 0.53% 10.750us 3.583us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.44% 9.021us 0.44% 9.021us 3.007us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.30% 6.220us 0.37% 7.470us 2.490us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 774.039us -Self CUDA time total: 186.909us +Self CPU time total: 2.046ms +Self CUDA time total: 185.952us @@ -4654,29 +4662,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S512_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 349.277us 165.88% 349.277us 349.277us 1 - torch_eager 15.39% 117.165us 99.36% 756.609us 756.609us 0.000us 0.00% 224.029us 224.029us 1 - aten::conv1d 0.74% 5.661us 15.33% 116.734us 38.911us 0.000us 0.00% 154.686us 51.562us 3 - aten::convolution 1.20% 9.150us 14.59% 111.073us 37.024us 0.000us 0.00% 154.686us 51.562us 3 - aten::_convolution 2.96% 22.532us 13.38% 101.923us 33.974us 0.000us 0.00% 154.686us 51.562us 3 - aten::_conv_depthwise2d 2.86% 21.751us 8.47% 64.492us 21.497us 154.686us 73.47% 154.686us 51.562us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 154.686us 73.47% 154.686us 51.562us 3 - aten::to 0.84% 6.379us 65.15% 496.150us 82.692us 0.000us 0.00% 69.343us 11.557us 6 - aten::_to_copy 3.33% 25.371us 64.32% 489.771us 81.628us 0.000us 0.00% 69.343us 11.557us 6 - aten::copy_ 6.44% 49.031us 56.76% 432.240us 72.040us 55.871us 26.53% 69.343us 11.557us 6 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 32.831us 15.59% 32.831us 10.944us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 23.040us 10.94% 23.040us 7.680us 3 - Activity Buffer Request 27.33% 208.145us 27.33% 208.145us 208.145us 13.472us 6.40% 13.472us 13.472us 1 - aten::empty_strided 4.22% 32.160us 4.22% 32.160us 5.360us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 25.87% 197.025us 25.87% 197.025us 21.892us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 2.14% 16.329us 2.83% 21.520us 2.391us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 1.17% 8.932us 1.17% 8.932us 0.595us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 1.38% 10.500us 1.38% 10.500us 3.500us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 1.35% 10.280us 1.35% 10.280us 3.427us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.72% 5.468us 0.90% 6.839us 2.280us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 341.281us 162.73% 341.281us 341.281us 1 + torch_eager 15.25% 117.693us 99.36% 766.878us 766.878us 0.000us 0.00% 223.168us 223.168us 1 + aten::conv1d 0.68% 5.279us 14.60% 112.702us 37.567us 0.000us 0.00% 154.016us 51.339us 3 + aten::convolution 1.24% 9.560us 13.92% 107.423us 35.808us 0.000us 0.00% 154.016us 51.339us 3 + aten::_convolution 2.67% 20.611us 12.68% 97.863us 32.621us 0.000us 0.00% 154.016us 51.339us 3 + aten::_conv_depthwise2d 2.74% 21.170us 8.14% 62.852us 20.951us 154.016us 73.44% 154.016us 51.339us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 154.016us 73.44% 154.016us 51.339us 3 + aten::to 0.75% 5.750us 66.22% 511.121us 85.187us 0.000us 0.00% 69.152us 11.525us 6 + aten::_to_copy 2.86% 22.060us 65.48% 505.371us 84.228us 0.000us 0.00% 69.152us 11.525us 6 + aten::copy_ 6.81% 52.581us 58.74% 453.391us 75.565us 55.712us 26.56% 69.152us 11.525us 6 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 32.705us 15.59% 32.705us 10.902us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 23.007us 10.97% 23.007us 7.669us 3 + Activity Buffer Request 28.38% 219.045us 28.38% 219.045us 219.045us 13.440us 6.41% 13.440us 13.440us 1 + aten::empty_strided 3.88% 29.920us 3.88% 29.920us 4.987us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 26.50% 204.546us 26.50% 204.546us 22.727us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 2.10% 16.212us 2.69% 20.781us 2.309us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 1.01% 7.798us 1.01% 7.798us 0.520us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 1.33% 10.250us 1.33% 10.250us 3.417us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 1.12% 8.651us 1.12% 8.651us 2.884us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.71% 5.470us 0.87% 6.730us 2.243us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 761.499us -Self CUDA time total: 210.557us +Self CPU time total: 771.798us +Self CUDA time total: 209.728us @@ -4686,29 +4694,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S2048_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 6.72% 121.944us 52.58% 953.714us 953.714us 0.000us 0.00% 1.521ms 1.521ms 1 - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.421ms 100.41% 1.421ms 1.421ms 1 - aten::to 0.35% 6.300us 37.63% 682.555us 113.759us 0.000us 0.00% 824.097us 137.350us 6 - aten::_to_copy 1.68% 30.549us 37.28% 676.255us 112.709us 0.000us 0.00% 824.097us 137.350us 6 - aten::copy_ 2.98% 53.981us 24.83% 450.422us 75.070us 718.817us 50.79% 824.097us 137.350us 6 - aten::conv1d 0.35% 6.281us 6.65% 120.554us 40.185us 0.000us 0.00% 696.543us 232.181us 3 - aten::convolution 0.57% 10.251us 6.30% 114.273us 38.091us 0.000us 0.00% 696.543us 232.181us 3 - aten::_convolution 1.27% 23.111us 5.73% 104.022us 34.674us 0.000us 0.00% 696.543us 232.181us 3 - aten::_conv_depthwise2d 1.23% 22.359us 3.60% 65.321us 21.774us 696.543us 49.21% 696.543us 232.181us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 696.543us 49.21% 696.543us 232.181us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 409.920us 28.96% 409.920us 136.640us 3 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 308.897us 21.82% 308.897us 102.966us 3 - Activity Buffer Request 11.98% 217.246us 11.98% 217.246us 217.246us 105.280us 7.44% 105.280us 105.280us 1 - aten::empty_strided 2.17% 39.370us 10.77% 195.284us 32.547us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 11.13% 201.976us 11.13% 201.976us 22.442us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 0.99% 18.030us 1.31% 23.761us 2.640us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.53% 9.620us 0.53% 9.620us 0.641us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.59% 10.751us 0.59% 10.751us 3.584us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.52% 9.430us 0.52% 9.430us 3.143us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.31% 5.670us 0.39% 7.030us 2.343us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 6.80% 123.239us 52.01% 942.341us 942.341us 0.000us 0.00% 1.520ms 1.520ms 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.422ms 100.42% 1.422ms 1.422ms 1 + aten::to 0.34% 6.231us 36.98% 669.957us 111.660us 0.000us 0.00% 824.603us 137.434us 6 + aten::_to_copy 1.55% 28.122us 36.63% 663.726us 110.621us 0.000us 0.00% 824.603us 137.434us 6 + aten::copy_ 2.95% 53.430us 24.71% 447.748us 74.625us 720.572us 50.89% 824.603us 137.434us 6 + aten::conv1d 0.34% 6.111us 6.66% 120.744us 40.248us 0.000us 0.00% 695.357us 231.786us 3 + aten::convolution 0.56% 10.201us 6.33% 114.633us 38.211us 0.000us 0.00% 695.357us 231.786us 3 + aten::_convolution 1.36% 24.689us 5.76% 104.432us 34.811us 0.000us 0.00% 695.357us 231.786us 3 + aten::_conv_depthwise2d 1.22% 22.151us 3.50% 63.431us 21.144us 695.357us 49.11% 695.357us 231.786us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 695.357us 49.11% 695.357us 231.786us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 407.263us 28.76% 407.263us 135.754us 3 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 313.309us 22.13% 313.309us 104.436us 3 + Activity Buffer Request 11.46% 207.684us 11.46% 207.684us 207.684us 104.031us 7.35% 104.031us 104.031us 1 + aten::empty_strided 2.02% 36.603us 10.37% 187.856us 31.309us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 11.47% 207.874us 11.47% 207.874us 23.097us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.99% 18.011us 1.30% 23.581us 2.620us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.51% 9.270us 0.51% 9.270us 0.618us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.60% 10.830us 0.60% 10.830us 3.610us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.51% 9.210us 0.51% 9.210us 3.070us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.35% 6.401us 0.43% 7.711us 2.570us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.814ms -Self CUDA time total: 1.415ms +Self CPU time total: 1.812ms +Self CUDA time total: 1.416ms @@ -4718,29 +4726,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S2048_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 4.05% 123.714us 65.96% 2.016ms 2.016ms 0.000us 0.00% 1.502ms 1.502ms 1 - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.433ms 100.43% 1.433ms 1.433ms 1 - aten::to 0.21% 6.507us 56.82% 1.737ms 289.475us 0.000us 0.00% 764.927us 127.488us 6 - aten::_to_copy 0.85% 25.961us 56.61% 1.730ms 288.391us 0.000us 0.00% 764.927us 127.488us 6 - aten::copy_ 1.76% 53.800us 54.73% 1.673ms 278.832us 689.887us 48.36% 764.927us 127.488us 6 - aten::conv1d 0.20% 6.220us 4.18% 127.663us 42.554us 0.000us 0.00% 736.735us 245.578us 3 - aten::convolution 0.34% 10.420us 3.97% 121.443us 40.481us 0.000us 0.00% 736.735us 245.578us 3 - aten::_convolution 0.75% 22.860us 3.63% 111.023us 37.008us 0.000us 0.00% 736.735us 245.578us 3 - aten::_conv_depthwise2d 0.96% 29.441us 2.37% 72.583us 24.194us 736.735us 51.64% 736.735us 245.578us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 736.735us 51.64% 736.735us 245.578us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 397.471us 27.86% 397.471us 132.490us 3 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 292.416us 20.50% 292.416us 97.472us 3 - Activity Buffer Request 47.26% 1.445ms 47.26% 1.445ms 1.445ms 75.040us 5.26% 75.040us 75.040us 1 - aten::empty_strided 1.03% 31.391us 1.03% 31.391us 5.232us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 6.45% 197.169us 6.45% 197.169us 21.908us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 0.57% 17.300us 0.75% 22.850us 2.539us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.30% 9.200us 0.30% 9.200us 0.613us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.32% 9.780us 0.32% 9.780us 3.260us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.36% 10.870us 0.36% 10.870us 3.623us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.19% 5.770us 0.23% 7.180us 2.393us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 6.34% 116.852us 42.22% 778.698us 778.698us 0.000us 0.00% 1.501ms 1.501ms 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.431ms 100.41% 1.431ms 1.431ms 1 + aten::to 0.32% 5.860us 28.27% 521.352us 86.892us 0.000us 0.00% 762.491us 127.082us 6 + aten::_to_copy 1.22% 22.580us 27.95% 515.492us 85.915us 0.000us 0.00% 762.491us 127.082us 6 + aten::copy_ 2.82% 52.081us 25.06% 462.219us 77.037us 686.779us 48.19% 762.491us 127.082us 6 + aten::conv1d 0.30% 5.601us 6.22% 114.743us 38.248us 0.000us 0.00% 738.362us 246.121us 3 + aten::convolution 0.51% 9.380us 5.92% 109.142us 36.381us 0.000us 0.00% 738.362us 246.121us 3 + aten::_convolution 1.20% 22.202us 5.41% 99.762us 33.254us 0.000us 0.00% 738.362us 246.121us 3 + aten::_conv_depthwise2d 1.10% 20.251us 3.34% 61.660us 20.553us 738.362us 51.81% 738.362us 246.121us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 738.362us 51.81% 738.362us 246.121us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 398.333us 27.95% 398.333us 132.778us 3 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 288.446us 20.24% 288.446us 96.149us 3 + Activity Buffer Request 12.41% 228.855us 12.41% 228.855us 228.855us 75.712us 5.31% 75.712us 75.712us 1 + aten::empty_strided 1.66% 30.693us 1.66% 30.693us 5.115us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 11.01% 202.993us 11.01% 202.993us 22.555us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.90% 16.659us 1.18% 21.780us 2.420us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.46% 8.512us 0.46% 8.512us 0.567us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.53% 9.739us 0.53% 9.739us 3.246us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.54% 9.960us 0.54% 9.960us 3.320us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.33% 6.150us 0.40% 7.440us 2.480us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 3.057ms -Self CUDA time total: 1.427ms +Self CPU time total: 1.844ms +Self CUDA time total: 1.425ms impl wl p50(ms) ok @@ -4765,16 +4773,10 @@ torch_eager cuda_B4_D2048_S512_W4 0.10 True torch_eager cuda_B4_D64_S128_W2 0.08 True torch_eager cuda_B4_D64_S128_W4 0.08 True torch_eager cuda_B4_D64_S2048_W2 0.08 True -torch_eager cuda_B4_D64_S2048_W4 0.09 True +torch_eager cuda_B4_D64_S2048_W4 0.08 True torch_eager cuda_B4_D64_S512_W2 0.08 True torch_eager cuda_B4_D64_S512_W4 0.08 True-▶ UV Install Logs- -Artifacts:
causal_conv1d.jsonl