diff --git "a/rotary/impls/torch_rotary.html" "b/rotary/impls/torch_rotary.html" --- "a/rotary/impls/torch_rotary.html" +++ "b/rotary/impls/torch_rotary.html" @@ -3904,7 +3904,7 @@ Cell: nv | 0.25s
Fri Dec 19 19:54:55 2025 +Fri Dec 19 23:00:45 2025 +-----------------------------------------------------------------------------------------+ | NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 | +-----------------------------------------+------------------------+----------------------+ @@ -3913,7 +3913,7 @@ Cell: nv | 0.25s | | | MIG M. | |=========================================+========================+======================| | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 | -| N/A 36C P0 90W / 350W | 0MiB / 46068MiB | 17% Default | +| N/A 40C P0 85W / 350W | 0MiB / 46068MiB | 24% Default | | | | N/A | +-----------------------------------------+------------------------+----------------------+ @@ -3937,7 +3937,7 @@ Cell: nv | 0.25s ▼ output ▶ uv-logs | -Cell: benchmark | 7.93s +Cell: benchmark | 7.89s | Raw @@ -4016,27 +4016,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S128_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.121ms 1256.65% 1.121ms 1.121ms 1 - torch_eager 12.91% 396.632us 99.49% 3.057ms 3.057ms 0.000us 0.00% 90.428us 90.428us 1 - aten::mul 6.11% 187.745us 10.64% 326.977us 13.624us 47.168us 52.87% 47.168us 1.965us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 47.168us 52.87% 47.168us 1.965us 24 - aten::copy_ 3.73% 114.507us 64.51% 1.982ms 110.114us 28.957us 32.46% 30.173us 1.676us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.335us 25.04% 22.335us 1.861us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.087us 14.67% 13.087us 1.091us 12 - aten::clone 1.26% 38.732us 62.92% 1.933ms 322.183us 0.000us 0.00% 7.838us 1.306us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.622us 7.42% 6.622us 1.104us 6 - aten::sub 1.53% 47.130us 2.58% 79.281us 13.214us 6.559us 7.35% 6.559us 1.093us 6 - aten::add 1.27% 38.901us 2.21% 67.791us 11.299us 6.528us 7.32% 6.528us 1.088us 6 - Activity Buffer Request 55.82% 1.715ms 55.82% 1.715ms 1.715ms 1.216us 1.36% 1.216us 1.216us 1 - aten::empty_strided 2.00% 61.422us 2.00% 61.422us 10.237us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 2.49% 76.434us 2.49% 76.434us 12.739us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.64% 81.233us 3.38% 103.763us 4.323us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.73% 22.530us 0.73% 22.530us 0.939us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 9.00% 276.368us 9.00% 276.368us 5.758us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.51% 15.650us 0.51% 15.650us 15.650us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.009ms 1132.93% 1.009ms 1.009ms 1 + torch_eager 12.41% 369.428us 99.52% 2.961ms 2.961ms 0.000us 0.00% 90.371us 90.371us 1 + aten::mul 5.71% 169.883us 9.58% 285.017us 11.876us 47.039us 52.80% 47.039us 1.960us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 47.039us 52.80% 47.039us 1.960us 24 + aten::copy_ 3.61% 107.364us 66.50% 1.979ms 109.941us 28.963us 32.51% 30.243us 1.680us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.243us 24.97% 22.243us 1.854us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.089us 14.69% 13.089us 1.091us 12 + aten::clone 1.40% 41.760us 65.34% 1.944ms 324.049us 0.000us 0.00% 8.000us 1.333us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.720us 7.54% 6.720us 1.120us 6 + aten::sub 1.56% 46.400us 2.42% 72.061us 12.010us 6.561us 7.36% 6.561us 1.094us 6 + aten::add 1.27% 37.811us 2.01% 59.841us 9.973us 6.528us 7.33% 6.528us 1.088us 6 + Activity Buffer Request 58.48% 1.740ms 58.48% 1.740ms 1.740ms 1.280us 1.44% 1.280us 1.280us 1 + aten::empty_strided 1.77% 52.541us 1.77% 52.541us 8.757us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 2.39% 71.252us 2.39% 71.252us 11.875us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.63% 78.291us 3.42% 101.892us 4.245us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.79% 23.601us 0.79% 23.601us 0.983us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 7.49% 222.745us 7.49% 222.745us 4.641us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.48% 14.390us 0.48% 14.390us 14.390us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 3.072ms -Self CUDA time total: 89.212us +Self CPU time total: 2.976ms +Self CUDA time total: 89.091us @@ -4046,27 +4046,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S128_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.007ms 1113.49% 1.007ms 1.007ms 1 - torch_eager 10.54% 298.489us 99.78% 2.827ms 2.827ms 0.000us 0.00% 91.582us 91.582us 1 - aten::mul 6.23% 176.544us 10.66% 301.947us 12.581us 47.549us 52.58% 47.549us 1.981us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 47.549us 52.58% 47.549us 1.981us 24 - aten::copy_ 3.68% 104.131us 68.91% 1.952ms 108.446us 29.376us 32.49% 30.529us 1.696us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.591us 24.98% 22.591us 1.883us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.504us 14.93% 13.504us 1.125us 12 - aten::clone 0.77% 21.889us 65.76% 1.863ms 310.451us 0.000us 0.00% 7.938us 1.323us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.785us 7.50% 6.785us 1.131us 6 - aten::sub 1.41% 40.059us 2.39% 67.780us 11.297us 6.784us 7.50% 6.784us 1.131us 6 - aten::add 1.28% 36.151us 2.32% 65.853us 10.975us 6.720us 7.43% 6.720us 1.120us 6 - Activity Buffer Request 60.61% 1.717ms 60.61% 1.717ms 1.717ms 1.153us 1.28% 1.153us 1.153us 1 - aten::empty_strided 1.12% 31.742us 1.12% 31.742us 5.290us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 2.04% 57.701us 2.04% 57.701us 9.617us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.45% 69.273us 3.06% 86.813us 3.617us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.62% 17.540us 0.62% 17.540us 0.731us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 9.05% 256.228us 9.05% 256.228us 5.338us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.22% 6.150us 0.22% 6.150us 6.150us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 914.978us 1014.29% 914.978us 914.978us 1 + torch_eager 10.68% 295.102us 99.80% 2.758ms 2.758ms 0.000us 0.00% 91.361us 91.361us 1 + aten::mul 5.65% 156.202us 9.56% 264.265us 11.011us 47.519us 52.68% 47.519us 1.980us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 47.519us 52.68% 47.519us 1.980us 24 + aten::copy_ 3.57% 98.672us 70.68% 1.953ms 108.523us 29.314us 32.50% 30.466us 1.693us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.529us 24.97% 22.529us 1.877us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.376us 14.83% 13.376us 1.115us 12 + aten::clone 0.80% 22.200us 68.13% 1.883ms 313.805us 0.000us 0.00% 7.937us 1.323us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.785us 7.52% 6.785us 1.131us 6 + aten::add 1.12% 30.913us 1.88% 51.842us 8.640us 6.688us 7.41% 6.688us 1.115us 6 + aten::sub 1.32% 36.532us 2.17% 59.963us 9.994us 6.688us 7.41% 6.688us 1.115us 6 + Activity Buffer Request 63.06% 1.743ms 63.06% 1.743ms 1.743ms 1.152us 1.28% 1.152us 1.152us 1 + aten::empty_strided 1.09% 30.200us 1.09% 30.200us 5.033us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 1.91% 52.752us 1.91% 52.752us 8.792us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.31% 63.802us 2.94% 81.195us 3.383us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.63% 17.393us 0.63% 17.393us 0.725us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 7.65% 211.533us 7.65% 211.533us 4.407us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.20% 5.491us 0.20% 5.491us 5.491us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.833ms -Self CUDA time total: 90.429us +Self CPU time total: 2.764ms +Self CUDA time total: 90.209us @@ -4076,27 +4076,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S128_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.006ms 1072.44% 1.006ms 1.006ms 1 - torch_eager 10.52% 298.395us 99.78% 2.831ms 2.831ms 0.000us 0.00% 95.037us 95.037us 1 - aten::mul 6.11% 173.334us 10.70% 303.588us 12.649us 48.703us 51.91% 48.703us 2.029us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 48.703us 51.91% 48.703us 2.029us 24 - aten::copy_ 3.65% 103.459us 69.19% 1.963ms 109.044us 30.654us 32.67% 31.870us 1.771us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.975us 24.49% 22.975us 1.915us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 14.464us 15.42% 14.464us 1.205us 12 - aten::clone 0.76% 21.692us 65.97% 1.871ms 311.909us 0.000us 0.00% 8.895us 1.482us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.679us 8.18% 7.679us 1.280us 6 - aten::add 1.20% 34.161us 2.12% 60.242us 10.040us 7.233us 7.71% 7.233us 1.206us 6 - aten::sub 1.38% 39.061us 2.38% 67.492us 11.249us 7.231us 7.71% 7.231us 1.205us 6 - Activity Buffer Request 60.90% 1.728ms 60.90% 1.728ms 1.728ms 1.216us 1.30% 1.216us 1.216us 1 - aten::empty_strided 1.08% 30.530us 1.08% 30.530us 5.088us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 2.06% 58.531us 2.06% 58.531us 9.755us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.41% 68.445us 3.03% 85.834us 3.576us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.61% 17.389us 0.61% 17.389us 0.725us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 9.10% 258.010us 9.10% 258.010us 5.375us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.22% 6.240us 0.22% 6.240us 6.240us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 896.092us 952.15% 896.092us 896.092us 1 + torch_eager 10.34% 284.358us 99.81% 2.744ms 2.744ms 0.000us 0.00% 95.424us 95.424us 1 + aten::mul 5.45% 149.791us 9.32% 256.084us 10.670us 48.736us 51.79% 48.736us 2.031us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 48.736us 51.79% 48.736us 2.031us 24 + aten::copy_ 3.52% 96.699us 71.27% 1.959ms 108.841us 30.943us 32.88% 32.255us 1.792us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 23.071us 24.51% 23.071us 1.923us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 14.433us 15.34% 14.433us 1.203us 12 + aten::clone 0.75% 20.741us 68.58% 1.885ms 314.190us 0.000us 0.00% 9.184us 1.531us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.872us 8.36% 7.872us 1.312us 6 + aten::sub 1.32% 36.203us 2.15% 58.992us 9.832us 7.264us 7.72% 7.264us 1.211us 6 + aten::add 1.12% 30.822us 1.89% 51.942us 8.657us 7.169us 7.62% 7.169us 1.195us 6 + Activity Buffer Request 63.70% 1.751ms 63.70% 1.751ms 1.751ms 1.312us 1.39% 1.312us 1.312us 1 + aten::empty_strided 1.08% 29.590us 1.08% 29.590us 4.932us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 1.85% 50.982us 1.85% 50.982us 8.497us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.36% 64.969us 3.01% 82.862us 3.453us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.65% 17.893us 0.65% 17.893us 0.746us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 7.66% 210.524us 7.66% 210.524us 4.386us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.19% 5.270us 0.19% 5.270us 5.270us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.837ms -Self CUDA time total: 93.821us +Self CPU time total: 2.749ms +Self CUDA time total: 94.112us @@ -4106,27 +4106,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S128_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.008ms 991.12% 1.008ms 1.008ms 1 - torch_eager 11.39% 291.220us 99.77% 2.552ms 2.552ms 0.000us 0.00% 103.069us 103.069us 1 - aten::mul 6.75% 172.580us 11.78% 301.232us 12.551us 52.797us 51.90% 52.797us 2.200us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 52.797us 51.90% 52.797us 2.200us 24 - aten::copy_ 3.99% 102.084us 65.85% 1.684ms 93.565us 32.544us 31.99% 33.888us 1.883us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.736us 24.32% 24.736us 2.061us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.384us 16.11% 16.384us 1.365us 12 - aten::clone 0.85% 21.760us 62.39% 1.596ms 265.957us 0.000us 0.00% 9.152us 1.525us 6 - aten::sub 1.61% 41.170us 2.75% 70.342us 11.724us 8.256us 8.12% 8.256us 1.376us 6 - aten::add 1.37% 35.121us 2.54% 64.851us 10.809us 8.128us 7.99% 8.128us 1.355us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.808us 7.68% 7.808us 1.301us 6 - Activity Buffer Request 49.27% 1.260ms 49.27% 1.260ms 1.260ms 1.344us 1.32% 1.344us 1.344us 1 - aten::empty_strided 1.23% 31.560us 1.23% 31.560us 5.260us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 9.74% 249.077us 9.74% 249.077us 41.513us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.69% 68.742us 3.39% 86.703us 3.613us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.70% 17.961us 0.70% 17.961us 0.748us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 10.18% 260.466us 10.18% 260.466us 5.426us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.23% 5.870us 0.23% 5.870us 5.870us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 914.202us 899.81% 914.202us 914.202us 1 + torch_eager 9.98% 278.657us 99.81% 2.786ms 2.786ms 0.000us 0.00% 102.911us 102.911us 1 + aten::mul 5.55% 155.014us 9.50% 265.176us 11.049us 52.799us 51.97% 52.799us 2.200us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 52.799us 51.97% 52.799us 2.200us 24 + aten::copy_ 3.52% 98.370us 71.20% 1.987ms 110.396us 32.288us 31.78% 33.600us 1.867us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.544us 24.16% 24.544us 2.045us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.512us 16.25% 16.512us 1.376us 12 + aten::clone 0.72% 20.142us 68.59% 1.914ms 319.034us 0.000us 0.00% 9.056us 1.509us 6 + aten::sub 1.33% 37.171us 2.19% 61.241us 10.207us 8.320us 8.19% 8.320us 1.387us 6 + aten::add 1.42% 39.541us 2.20% 61.471us 10.245us 8.192us 8.06% 8.192us 1.365us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.744us 7.62% 7.744us 1.291us 6 + Activity Buffer Request 57.78% 1.613ms 57.78% 1.613ms 1.613ms 1.312us 1.29% 1.312us 1.312us 1 + aten::empty_strided 1.10% 30.730us 1.10% 30.730us 5.122us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 7.81% 218.005us 7.81% 218.005us 36.334us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.28% 63.724us 2.90% 81.002us 3.375us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.62% 17.278us 0.62% 17.278us 0.720us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 7.68% 214.384us 7.68% 214.384us 4.466us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.19% 5.360us 0.19% 5.360us 5.360us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.558ms -Self CUDA time total: 101.725us +Self CPU time total: 2.791ms +Self CUDA time total: 101.599us @@ -4136,27 +4136,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S512_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.014ms 1081.02% 1.014ms 1.014ms 1 - torch_eager 10.25% 325.467us 99.80% 3.170ms 3.170ms 0.000us 0.00% 95.040us 95.040us 1 - aten::mul 5.52% 175.344us 9.66% 306.970us 12.790us 48.800us 52.05% 48.800us 2.033us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 48.800us 52.05% 48.800us 2.033us 24 - aten::copy_ 3.31% 105.233us 71.23% 2.263ms 125.701us 30.784us 32.83% 32.064us 1.781us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 23.041us 24.57% 23.041us 1.920us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 14.176us 15.12% 14.176us 1.181us 12 - aten::clone 0.88% 27.881us 68.64% 2.180ms 363.408us 0.000us 0.00% 9.023us 1.504us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.743us 8.26% 7.743us 1.290us 6 - aten::sub 1.21% 38.440us 2.12% 67.480us 11.247us 7.136us 7.61% 7.136us 1.189us 6 - aten::add 1.04% 32.931us 1.90% 60.251us 10.042us 7.040us 7.51% 7.040us 1.173us 6 - Activity Buffer Request 54.81% 1.741ms 54.81% 1.741ms 1.741ms 1.280us 1.37% 1.280us 1.280us 1 - aten::empty_strided 1.02% 32.521us 1.02% 32.521us 5.420us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 10.79% 342.870us 10.79% 342.870us 57.145us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.15% 68.383us 2.74% 86.883us 3.620us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.58% 18.500us 0.58% 18.500us 0.771us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 8.23% 261.446us 8.23% 261.446us 5.447us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.20% 6.450us 0.20% 6.450us 6.450us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 915.332us 974.61% 915.332us 915.332us 1 + torch_eager 10.01% 309.953us 99.82% 3.091ms 3.091ms 0.000us 0.00% 95.198us 95.198us 1 + aten::mul 4.96% 153.567us 8.55% 264.920us 11.038us 48.833us 52.00% 48.833us 2.035us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 48.833us 52.00% 48.833us 2.035us 24 + aten::copy_ 3.18% 98.342us 72.96% 2.259ms 125.517us 30.876us 32.88% 32.156us 1.786us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 23.070us 24.56% 23.070us 1.923us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 14.209us 15.13% 14.209us 1.184us 12 + aten::clone 0.91% 28.102us 71.00% 2.199ms 366.430us 0.000us 0.00% 9.086us 1.514us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.806us 8.31% 7.806us 1.301us 6 + aten::sub 1.22% 37.870us 1.99% 61.740us 10.290us 7.168us 7.63% 7.168us 1.195us 6 + aten::add 0.97% 30.001us 1.65% 51.171us 8.529us 7.041us 7.50% 7.041us 1.173us 6 + Activity Buffer Request 56.99% 1.765ms 56.99% 1.765ms 1.765ms 1.280us 1.36% 1.280us 1.280us 1 + aten::empty_strided 1.03% 31.751us 1.03% 31.751us 5.292us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 10.91% 337.859us 10.91% 337.859us 56.310us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.16% 66.933us 2.72% 84.261us 3.511us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.56% 17.328us 0.56% 17.328us 0.722us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 6.93% 214.663us 6.93% 214.663us 4.472us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.18% 5.580us 0.18% 5.580us 5.580us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 3.177ms -Self CUDA time total: 93.760us +Self CPU time total: 3.097ms +Self CUDA time total: 93.918us @@ -4166,26 +4166,26 @@ PROFILE TRACE: torch_eager | cuda_B1_S512_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.001ms 983.06% 1.001ms 1.001ms 1 - torch_eager 9.84% 295.255us 99.81% 2.995ms 2.995ms 0.000us 0.00% 103.136us 103.136us 1 - aten::mul 5.72% 171.754us 9.96% 298.939us 12.456us 52.896us 51.95% 52.896us 2.204us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 52.896us 51.95% 52.896us 2.204us 24 - aten::copy_ 3.46% 103.695us 71.00% 2.131ms 118.374us 32.417us 31.84% 33.729us 1.874us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.673us 24.23% 24.673us 2.056us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.511us 16.22% 16.511us 1.376us 12 - aten::clone 0.71% 21.410us 67.93% 2.039ms 339.780us 0.000us 0.00% 9.056us 1.509us 6 - aten::sub 1.35% 40.422us 2.36% 70.682us 11.780us 8.319us 8.17% 8.319us 1.386us 6 - aten::add 1.19% 35.599us 2.15% 64.481us 10.747us 8.192us 8.05% 8.192us 1.365us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.744us 7.61% 7.744us 1.291us 6 - Activity Buffer Request 57.66% 1.731ms 57.66% 1.731ms 1.731ms 1.312us 1.29% 1.312us 1.312us 1 - aten::empty_strided 1.04% 31.291us 1.04% 31.291us 5.215us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 7.41% 222.325us 7.41% 222.325us 37.054us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.17% 65.101us 2.75% 82.681us 3.445us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.59% 17.580us 0.59% 17.580us 0.733us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 8.68% 260.519us 8.68% 260.519us 5.427us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.19% 5.671us 0.19% 5.671us 5.671us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 991.450us 973.69% 991.450us 991.450us 1 + torch_eager 12.00% 374.920us 99.82% 3.119ms 3.119ms 0.000us 0.00% 103.167us 103.167us 1 + aten::mul 4.76% 148.675us 8.21% 256.446us 10.685us 52.898us 51.95% 52.898us 2.204us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 52.898us 51.95% 52.898us 2.204us 24 + aten::copy_ 3.10% 96.880us 71.33% 2.229ms 123.830us 32.414us 31.83% 33.757us 1.875us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.575us 24.13% 24.575us 2.048us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.512us 16.22% 16.512us 1.376us 12 + aten::clone 0.70% 21.890us 69.11% 2.160ms 359.941us 0.000us 0.00% 9.182us 1.530us 6 + aten::sub 1.16% 36.128us 1.91% 59.601us 9.934us 8.320us 8.17% 8.320us 1.387us 6 + aten::add 1.18% 36.821us 1.99% 62.122us 10.354us 8.192us 8.05% 8.192us 1.365us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.839us 7.70% 7.839us 1.307us 6 + Activity Buffer Request 56.19% 1.756ms 56.19% 1.756ms 1.756ms 1.343us 1.32% 1.343us 1.343us 1 + aten::empty_strided 0.98% 30.741us 0.98% 30.741us 5.124us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 10.19% 318.287us 10.19% 318.287us 53.048us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.12% 66.300us 2.71% 84.551us 3.523us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.58% 18.251us 0.58% 18.251us 0.760us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 6.87% 214.557us 6.87% 214.557us 4.470us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.18% 5.580us 0.18% 5.580us 5.580us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 3.001ms +Self CPU time total: 3.125ms Self CUDA time total: 101.824us @@ -4196,27 +4196,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S512_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.015ms 840.95% 1.015ms 1.015ms 1 - torch_eager 9.74% 291.664us 99.80% 2.987ms 2.987ms 0.000us 0.00% 122.530us 122.530us 1 - aten::mul 5.84% 174.903us 10.28% 307.588us 12.816us 62.144us 51.48% 62.144us 2.589us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 62.144us 51.48% 62.144us 2.589us 24 - aten::copy_ 3.43% 102.713us 70.67% 2.115ms 117.501us 39.265us 32.53% 41.090us 2.283us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.864us 23.91% 28.864us 2.405us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.296us 15.99% 19.296us 1.608us 12 - aten::clone 0.71% 21.240us 67.46% 2.019ms 336.492us 0.000us 0.00% 12.226us 2.038us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.401us 8.62% 10.401us 1.733us 6 - aten::add 1.16% 34.581us 2.15% 64.442us 10.740us 9.696us 8.03% 9.696us 1.616us 6 - aten::sub 1.36% 40.831us 2.36% 70.613us 11.769us 9.600us 7.95% 9.600us 1.600us 6 - Activity Buffer Request 57.99% 1.736ms 57.99% 1.736ms 1.736ms 1.825us 1.51% 1.825us 1.825us 1 - aten::empty_strided 1.04% 31.082us 1.04% 31.082us 5.180us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 6.60% 197.545us 6.60% 197.545us 32.924us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.24% 66.920us 2.85% 85.310us 3.555us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.61% 18.390us 0.61% 18.390us 0.766us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 9.07% 271.392us 9.07% 271.392us 5.654us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.20% 6.001us 0.20% 6.001us 6.001us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 898.909us 743.56% 898.909us 898.909us 1 + torch_eager 9.91% 292.126us 99.82% 2.942ms 2.942ms 0.000us 0.00% 122.717us 122.717us 1 + aten::mul 5.16% 152.179us 8.78% 258.933us 10.789us 62.079us 51.35% 62.079us 2.587us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 62.079us 51.35% 62.079us 2.587us 24 + aten::copy_ 3.24% 95.439us 72.74% 2.144ms 119.116us 39.487us 32.66% 41.311us 2.295us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.927us 23.93% 28.927us 2.411us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.327us 15.99% 19.327us 1.611us 12 + aten::clone 0.77% 22.660us 70.42% 2.076ms 345.928us 0.000us 0.00% 12.384us 2.064us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.560us 8.73% 10.560us 1.760us 6 + aten::sub 1.22% 36.000us 2.00% 58.931us 9.822us 9.664us 7.99% 9.664us 1.611us 6 + aten::add 1.08% 31.800us 1.80% 53.111us 8.852us 9.663us 7.99% 9.663us 1.611us 6 + Activity Buffer Request 57.43% 1.693ms 57.43% 1.693ms 1.693ms 1.824us 1.51% 1.824us 1.824us 1 + aten::empty_strided 1.01% 29.882us 1.01% 29.882us 4.980us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 10.11% 297.916us 10.11% 297.916us 49.653us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.20% 64.806us 2.80% 82.537us 3.439us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.60% 17.731us 0.60% 17.731us 0.739us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 7.09% 208.898us 7.09% 208.898us 4.352us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.18% 5.241us 0.18% 5.241us 5.241us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.993ms -Self CUDA time total: 120.705us +Self CPU time total: 2.948ms +Self CUDA time total: 120.893us @@ -4226,27 +4226,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S512_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.002ms 581.09% 1.002ms 1.002ms 1 - torch_eager 20.60% 295.691us 99.58% 1.430ms 1.430ms 0.000us 0.00% 175.327us 175.327us 1 - aten::mul 11.87% 170.476us 20.75% 297.916us 12.413us 89.503us 51.89% 89.503us 3.729us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 89.503us 51.89% 89.503us 3.729us 24 - aten::copy_ 7.24% 103.879us 39.87% 572.479us 31.804us 57.887us 33.56% 60.735us 3.374us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.832us 23.67% 40.832us 3.403us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 25.089us 14.55% 25.089us 2.091us 12 - aten::clone 1.51% 21.660us 33.62% 482.661us 80.443us 0.000us 0.00% 19.903us 3.317us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.055us 9.89% 17.055us 2.842us 6 - aten::sub 2.80% 40.184us 4.70% 67.454us 11.242us 12.546us 7.27% 12.546us 2.091us 6 - aten::add 2.41% 34.531us 4.25% 60.992us 10.165us 12.543us 7.27% 12.543us 2.090us 6 - Activity Buffer Request 14.04% 201.545us 14.04% 201.545us 201.545us 2.848us 1.65% 2.848us 2.848us 1 - aten::empty_strided 2.07% 29.712us 2.07% 29.712us 4.952us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 13.60% 195.253us 13.60% 195.253us 32.542us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.57% 65.572us 5.83% 83.731us 3.489us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 1.26% 18.159us 1.26% 18.159us 0.757us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 17.62% 252.973us 17.62% 252.973us 5.270us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.42% 6.061us 0.42% 6.061us 6.061us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 904.612us 526.12% 904.612us 904.612us 1 + torch_eager 19.42% 290.431us 99.64% 1.490ms 1.490ms 0.000us 0.00% 174.789us 174.789us 1 + aten::mul 10.19% 152.404us 17.63% 263.706us 10.988us 89.218us 51.89% 89.218us 3.717us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 89.218us 51.89% 89.218us 3.717us 24 + aten::copy_ 6.40% 95.669us 46.12% 689.622us 38.312us 57.665us 33.54% 60.513us 3.362us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.577us 23.60% 40.577us 3.381us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 25.058us 14.57% 25.058us 2.088us 12 + aten::clone 1.32% 19.751us 41.22% 616.413us 102.735us 0.000us 0.00% 19.936us 3.323us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.088us 9.94% 17.088us 2.848us 6 + aten::add 2.23% 33.380us 3.73% 55.850us 9.308us 12.577us 7.31% 12.577us 2.096us 6 + aten::sub 2.48% 37.041us 4.08% 60.981us 10.164us 12.481us 7.26% 12.481us 2.080us 6 + Activity Buffer Request 17.03% 254.616us 17.03% 254.616us 254.616us 2.848us 1.66% 2.848us 2.848us 1 + aten::empty_strided 1.98% 29.551us 1.98% 29.551us 4.925us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 18.81% 281.226us 18.81% 281.226us 46.871us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.20% 62.788us 5.36% 80.180us 3.341us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 1.16% 17.392us 1.16% 17.392us 0.725us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 14.43% 215.823us 14.43% 215.823us 4.496us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.36% 5.360us 0.36% 5.360us 5.360us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.436ms -Self CUDA time total: 172.479us +Self CPU time total: 1.495ms +Self CUDA time total: 171.941us @@ -4256,27 +4256,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S2048_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 998.580us 827.30% 998.580us 998.580us 1 - torch_eager 20.47% 288.290us 99.58% 1.403ms 1.403ms 0.000us 0.00% 122.464us 122.464us 1 - aten::mul 12.45% 175.343us 21.53% 303.228us 12.634us 62.175us 51.51% 62.175us 2.591us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 62.175us 51.51% 62.175us 2.591us 24 - aten::copy_ 7.38% 103.973us 38.88% 547.625us 30.424us 39.265us 32.53% 41.025us 2.279us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.800us 23.86% 28.800us 2.400us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.264us 15.96% 19.264us 1.605us 12 - aten::clone 1.40% 19.710us 32.30% 454.921us 75.820us 0.000us 0.00% 12.225us 2.038us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.465us 8.67% 10.465us 1.744us 6 - aten::add 2.42% 34.049us 4.36% 61.370us 10.228us 9.695us 8.03% 9.695us 1.616us 6 - aten::sub 2.76% 38.918us 4.74% 66.750us 11.125us 9.569us 7.93% 9.569us 1.595us 6 - Activity Buffer Request 13.00% 183.145us 13.00% 183.145us 183.145us 1.760us 1.46% 1.760us 1.760us 1 - aten::empty_strided 2.13% 30.010us 2.13% 30.010us 5.002us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 13.33% 187.794us 13.33% 187.794us 31.299us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.79% 67.456us 6.09% 85.722us 3.572us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 1.30% 18.266us 1.30% 18.266us 0.761us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 18.16% 255.751us 18.16% 255.751us 5.328us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.42% 5.851us 0.42% 5.851us 5.851us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 891.715us 739.56% 891.715us 891.715us 1 + torch_eager 18.90% 276.136us 99.61% 1.455ms 1.455ms 0.000us 0.00% 122.334us 122.334us 1 + aten::mul 10.73% 156.714us 18.01% 263.146us 10.964us 61.921us 51.36% 61.921us 2.580us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 61.921us 51.36% 61.921us 2.580us 24 + aten::copy_ 6.60% 96.484us 46.13% 674.035us 37.446us 39.325us 32.61% 41.085us 2.283us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.894us 23.96% 28.894us 2.408us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.328us 16.03% 19.328us 1.611us 12 + aten::clone 1.42% 20.789us 41.25% 602.732us 100.455us 0.000us 0.00% 12.191us 2.032us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.431us 8.65% 10.431us 1.738us 6 + aten::add 2.19% 32.032us 3.63% 53.102us 8.850us 9.696us 8.04% 9.696us 1.616us 6 + aten::sub 2.44% 35.611us 4.02% 58.732us 9.789us 9.632us 7.99% 9.632us 1.605us 6 + Activity Buffer Request 17.44% 254.736us 17.44% 254.736us 254.736us 1.760us 1.46% 1.760us 1.760us 1 + aten::empty_strided 2.08% 30.391us 2.08% 30.391us 5.065us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 18.06% 263.885us 18.06% 263.885us 43.981us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.23% 61.792us 5.41% 79.021us 3.293us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 1.18% 17.229us 1.18% 17.229us 0.718us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 14.34% 209.553us 14.34% 209.553us 4.366us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.39% 5.690us 0.39% 5.690us 5.690us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.409ms -Self CUDA time total: 120.704us +Self CPU time total: 1.461ms +Self CUDA time total: 120.574us @@ -4286,27 +4286,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S2048_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 990.551us 575.17% 990.551us 990.551us 1 - torch_eager 9.55% 281.107us 99.81% 2.938ms 2.938ms 0.000us 0.00% 175.100us 175.100us 1 - aten::mul 5.81% 170.942us 10.23% 301.167us 12.549us 89.468us 51.95% 89.468us 3.728us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 89.468us 51.95% 89.468us 3.728us 24 - aten::copy_ 3.50% 102.971us 70.92% 2.088ms 115.984us 57.728us 33.52% 60.608us 3.367us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.800us 23.69% 40.800us 3.400us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 25.024us 14.53% 25.024us 2.085us 12 - aten::clone 0.79% 23.370us 68.06% 2.004ms 333.942us 0.000us 0.00% 19.808us 3.301us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.928us 9.83% 16.928us 2.821us 6 - aten::add 1.19% 35.020us 2.08% 61.181us 10.197us 12.513us 7.27% 12.513us 2.085us 6 - aten::sub 1.27% 37.451us 2.29% 67.451us 11.242us 12.511us 7.26% 12.511us 2.085us 6 - Activity Buffer Request 58.47% 1.721ms 58.47% 1.721ms 1.721ms 2.880us 1.67% 2.880us 2.880us 1 - aten::empty_strided 1.04% 30.722us 1.04% 30.722us 5.120us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 6.59% 193.904us 6.59% 193.904us 32.317us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.29% 67.323us 2.91% 85.623us 3.568us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.62% 18.300us 0.62% 18.300us 0.762us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 8.69% 255.869us 8.69% 255.869us 5.331us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.19% 5.631us 0.19% 5.631us 5.631us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 898.269us 521.86% 898.269us 898.269us 1 + torch_eager 9.37% 284.685us 99.81% 3.032ms 3.032ms 0.000us 0.00% 174.974us 174.974us 1 + aten::mul 4.95% 150.441us 8.39% 254.798us 10.617us 89.503us 52.00% 89.503us 3.729us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 89.503us 52.00% 89.503us 3.729us 24 + aten::copy_ 3.17% 96.407us 73.91% 2.246ms 124.750us 57.409us 33.35% 60.256us 3.348us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.513us 23.54% 40.513us 3.376us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 25.215us 14.65% 25.215us 2.101us 12 + aten::clone 0.72% 21.929us 71.62% 2.176ms 362.673us 0.000us 0.00% 19.743us 3.291us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.896us 9.82% 16.896us 2.816us 6 + aten::add 1.01% 30.763us 1.71% 51.873us 8.645us 12.671us 7.36% 12.671us 2.112us 6 + aten::sub 1.24% 37.781us 2.01% 61.191us 10.198us 12.544us 7.29% 12.544us 2.091us 6 + Activity Buffer Request 60.27% 1.831ms 60.27% 1.831ms 1.831ms 2.847us 1.65% 2.847us 2.847us 1 + aten::empty_strided 0.99% 30.010us 0.99% 30.010us 5.002us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 8.59% 260.978us 8.59% 260.978us 43.496us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.13% 64.731us 2.72% 82.502us 3.438us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.58% 17.771us 0.58% 17.771us 0.740us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 6.78% 205.930us 6.78% 205.930us 4.290us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.19% 5.660us 0.19% 5.660us 5.660us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.944ms -Self CUDA time total: 172.220us +Self CPU time total: 3.038ms +Self CUDA time total: 172.127us @@ -4316,27 +4316,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S2048_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 996.043us 348.79% 996.043us 996.043us 1 - torch_eager 20.87% 290.654us 99.57% 1.387ms 1.387ms 0.000us 0.00% 304.030us 304.030us 1 - aten::mul 12.33% 171.709us 21.56% 300.326us 12.514us 133.152us 46.63% 133.152us 5.548us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 133.152us 46.63% 133.152us 5.548us 24 - aten::copy_ 7.20% 100.319us 38.33% 533.940us 29.663us 111.295us 38.97% 129.758us 7.209us 18 - aten::clone 1.43% 19.920us 31.74% 442.109us 73.685us 0.000us 0.00% 72.510us 12.085us 6 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 57.248us 20.05% 57.248us 4.771us 12 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 54.047us 18.93% 54.047us 9.008us 6 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 41.120us 14.40% 41.120us 3.427us 12 - aten::sub 2.81% 39.111us 4.76% 66.281us 11.047us 20.672us 7.24% 20.672us 3.445us 6 - aten::add 2.49% 34.670us 4.42% 61.560us 10.260us 20.448us 7.16% 20.448us 3.408us 6 - Activity Buffer Request 12.17% 169.484us 12.17% 169.484us 169.484us 18.463us 6.47% 18.463us 18.463us 1 - aten::empty_strided 2.13% 29.640us 2.13% 29.640us 4.940us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 13.75% 191.524us 13.75% 191.524us 31.921us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.84% 67.441us 6.08% 84.703us 3.529us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 1.24% 17.262us 1.24% 17.262us 0.719us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 18.33% 255.290us 18.33% 255.290us 5.319us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.43% 5.940us 0.43% 5.940us 5.940us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 909.882us 318.77% 909.882us 909.882us 1 + torch_eager 19.71% 283.050us 99.60% 1.431ms 1.431ms 0.000us 0.00% 303.835us 303.835us 1 + aten::mul 10.48% 150.585us 17.83% 256.036us 10.668us 133.276us 46.69% 133.276us 5.553us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 133.276us 46.69% 133.276us 5.553us 24 + aten::copy_ 6.71% 96.363us 44.37% 637.354us 35.409us 111.072us 38.91% 129.472us 7.193us 18 + aten::clone 1.41% 20.282us 39.28% 564.182us 94.030us 0.000us 0.00% 72.352us 12.059us 6 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 57.120us 20.01% 57.120us 4.760us 12 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 53.952us 18.90% 53.952us 8.992us 6 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 41.087us 14.39% 41.087us 3.424us 12 + aten::sub 3.01% 43.193us 4.71% 67.582us 11.264us 20.671us 7.24% 20.671us 3.445us 6 + aten::add 2.32% 33.393us 3.83% 55.073us 9.179us 20.416us 7.15% 20.416us 3.403us 6 + Activity Buffer Request 16.10% 231.245us 16.10% 231.245us 231.245us 18.400us 6.45% 18.400us 18.400us 1 + aten::empty_strided 2.13% 30.620us 2.13% 30.620us 5.103us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 17.40% 249.916us 17.40% 249.916us 41.653us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.43% 63.582us 5.61% 80.624us 3.359us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 1.19% 17.042us 1.19% 17.042us 0.710us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 14.71% 211.350us 14.71% 211.350us 4.403us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.40% 5.701us 0.40% 5.701us 5.701us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.393ms -Self CUDA time total: 285.567us +Self CPU time total: 1.436ms +Self CUDA time total: 285.435us @@ -4346,27 +4346,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S2048_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.018ms 178.84% 1.018ms 1.018ms 1 - torch_eager 20.93% 294.497us 99.60% 1.402ms 1.402ms 0.000us 0.00% 592.727us 592.727us 1 - aten::copy_ 7.21% 101.432us 37.46% 527.132us 29.285us 273.662us 48.09% 297.374us 16.521us 18 - aten::mul 12.77% 179.663us 21.96% 309.058us 12.877us 229.816us 40.39% 229.816us 9.576us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 229.816us 40.39% 229.816us 9.576us 24 - aten::clone 1.44% 20.252us 30.89% 434.772us 72.462us 0.000us 0.00% 206.559us 34.426us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 182.847us 32.13% 182.847us 30.474us 6 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 90.815us 15.96% 90.815us 7.568us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 65.537us 11.52% 65.537us 5.461us 12 - aten::sub 2.76% 38.901us 5.06% 71.272us 11.879us 32.865us 5.78% 32.865us 5.477us 6 - aten::add 2.62% 36.830us 4.51% 63.532us 10.589us 32.672us 5.74% 32.672us 5.445us 6 - Activity Buffer Request 11.90% 167.504us 11.90% 167.504us 167.504us 23.712us 4.17% 23.712us 23.712us 1 - aten::empty_strided 2.15% 30.291us 2.15% 30.291us 5.049us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 13.10% 184.385us 13.10% 184.385us 30.731us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.77% 67.152us 6.08% 85.571us 3.565us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 1.31% 18.419us 1.31% 18.419us 0.767us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 18.64% 262.279us 18.64% 262.279us 5.464us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.40% 5.670us 0.40% 5.670us 5.670us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 912.081us 160.07% 912.081us 912.081us 1 + torch_eager 19.62% 281.157us 99.59% 1.427ms 1.427ms 0.000us 0.00% 593.529us 593.529us 1 + aten::copy_ 6.77% 97.080us 44.43% 636.741us 35.375us 273.659us 48.03% 297.370us 16.521us 18 + aten::mul 10.89% 156.021us 18.62% 266.885us 11.120us 230.237us 40.41% 230.237us 9.593us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 230.237us 40.41% 230.237us 9.593us 24 + aten::clone 1.32% 18.940us 39.02% 559.152us 93.192us 0.000us 0.00% 206.843us 34.474us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 183.132us 32.14% 183.132us 30.522us 6 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 90.527us 15.89% 90.527us 7.544us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 65.922us 11.57% 65.922us 5.494us 12 + aten::sub 2.63% 37.723us 4.26% 61.043us 10.174us 33.089us 5.81% 33.089us 5.515us 6 + aten::add 2.15% 30.871us 3.64% 52.150us 8.692us 32.833us 5.76% 32.833us 5.472us 6 + Activity Buffer Request 16.39% 234.945us 16.39% 234.945us 234.945us 23.711us 4.16% 23.711us 23.711us 1 + aten::empty_strided 2.06% 29.522us 2.06% 29.522us 4.920us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 17.05% 244.375us 17.05% 244.375us 40.729us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.41% 63.274us 5.65% 80.913us 3.371us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 1.23% 17.639us 1.23% 17.639us 0.735us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 15.06% 215.804us 15.06% 215.804us 4.496us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.41% 5.810us 0.41% 5.810us 5.810us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.407ms -Self CUDA time total: 569.015us +Self CPU time total: 1.433ms +Self CUDA time total: 569.818us @@ -4376,27 +4376,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S128_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.000ms 1080.76% 1.000ms 1.000ms 1 - torch_eager 9.93% 294.202us 99.82% 2.956ms 2.956ms 0.000us 0.00% 93.659us 93.659us 1 - aten::mul 5.79% 171.584us 10.10% 299.275us 12.470us 49.565us 53.56% 49.565us 2.065us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 49.565us 53.56% 49.565us 2.065us 24 - aten::copy_ 3.59% 106.321us 70.88% 2.099ms 116.627us 29.406us 31.78% 30.526us 1.696us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.623us 24.45% 22.623us 1.885us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.568us 14.66% 13.568us 1.131us 12 - aten::clone 0.71% 20.970us 67.84% 2.009ms 334.883us 0.000us 0.00% 7.903us 1.317us 6 - aten::sub 1.35% 39.881us 2.33% 69.031us 11.505us 6.784us 7.33% 6.784us 1.131us 6 - aten::add 1.15% 34.189us 1.99% 58.861us 9.810us 6.784us 7.33% 6.784us 1.131us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.783us 7.33% 6.783us 1.130us 6 - Activity Buffer Request 58.33% 1.728ms 58.33% 1.728ms 1.728ms 1.120us 1.21% 1.120us 1.120us 1 - aten::empty_strided 1.04% 30.880us 1.04% 30.880us 5.147us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 6.53% 193.506us 6.53% 193.506us 32.251us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.25% 66.634us 2.83% 83.844us 3.493us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.58% 17.210us 0.58% 17.210us 0.717us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 8.56% 253.394us 8.56% 253.394us 5.279us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.18% 5.350us 0.18% 5.350us 5.350us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 982.303us 1061.08% 982.303us 982.303us 1 + torch_eager 9.91% 295.071us 99.80% 2.971ms 2.971ms 0.000us 0.00% 93.696us 93.696us 1 + aten::mul 5.34% 158.961us 9.11% 271.063us 11.294us 49.631us 53.61% 49.631us 2.068us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 49.631us 53.61% 49.631us 2.068us 24 + aten::copy_ 3.28% 97.679us 70.68% 2.104ms 116.897us 29.410us 31.77% 30.530us 1.696us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.593us 24.40% 22.593us 1.883us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.535us 14.62% 13.535us 1.128us 12 + aten::clone 0.75% 22.189us 68.29% 2.033ms 338.797us 0.000us 0.00% 7.937us 1.323us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.817us 7.36% 6.817us 1.136us 6 + aten::sub 1.25% 37.104us 2.05% 60.974us 10.162us 6.784us 7.33% 6.784us 1.131us 6 + aten::add 1.06% 31.689us 1.78% 52.921us 8.820us 6.751us 7.29% 6.751us 1.125us 6 + Activity Buffer Request 57.10% 1.700ms 57.10% 1.700ms 1.700ms 1.120us 1.21% 1.120us 1.120us 1 + aten::empty_strided 1.03% 30.660us 1.03% 30.660us 5.110us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 8.33% 247.996us 8.33% 247.996us 41.333us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.24% 66.630us 4.50% 133.983us 5.583us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 2.26% 67.353us 2.26% 67.353us 2.806us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 7.25% 215.936us 7.25% 215.936us 4.499us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.20% 5.820us 0.20% 5.820us 5.820us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.962ms -Self CUDA time total: 92.539us +Self CPU time total: 2.977ms +Self CUDA time total: 92.576us @@ -4406,27 +4406,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S128_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.005ms 1046.18% 1.005ms 1.005ms 1 - torch_eager 20.70% 290.067us 99.57% 1.395ms 1.395ms 0.000us 0.00% 97.374us 97.374us 1 - aten::mul 12.93% 181.162us 22.03% 308.587us 12.858us 51.138us 53.23% 51.138us 2.131us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 51.138us 53.23% 51.138us 2.131us 24 - aten::copy_ 7.20% 100.934us 37.57% 526.405us 29.245us 30.750us 32.01% 32.061us 1.781us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.975us 23.92% 22.975us 1.915us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 14.175us 14.76% 14.175us 1.181us 12 - aten::clone 1.53% 21.448us 31.30% 438.559us 73.093us 0.000us 0.00% 9.086us 1.514us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.775us 8.09% 7.775us 1.296us 6 - aten::sub 2.81% 39.410us 4.84% 67.761us 11.294us 7.103us 7.39% 7.103us 1.184us 6 - aten::add 2.52% 35.299us 4.51% 63.202us 10.534us 7.072us 7.36% 7.072us 1.179us 6 - Activity Buffer Request 11.90% 166.735us 11.90% 166.735us 166.735us 1.311us 1.36% 1.311us 1.311us 1 - aten::empty_strided 2.22% 31.071us 2.22% 31.071us 5.178us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 13.33% 186.813us 13.33% 186.813us 31.136us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.90% 68.622us 6.17% 86.384us 3.599us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 1.27% 17.762us 1.27% 17.762us 0.740us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 18.24% 255.602us 18.24% 255.602us 5.325us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.43% 6.050us 0.43% 6.050us 6.050us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 884.987us 920.94% 884.987us 884.987us 1 + torch_eager 19.85% 274.487us 99.61% 1.377ms 1.377ms 0.000us 0.00% 97.408us 97.408us 1 + aten::mul 10.71% 148.094us 18.37% 254.035us 10.585us 51.231us 53.31% 51.231us 2.135us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 51.231us 53.31% 51.231us 2.135us 24 + aten::copy_ 6.85% 94.742us 43.76% 605.074us 33.615us 30.720us 31.97% 32.032us 1.780us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.881us 23.81% 22.881us 1.907us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 14.145us 14.72% 14.145us 1.179us 12 + aten::clone 1.40% 19.328us 38.56% 533.201us 88.867us 0.000us 0.00% 9.151us 1.525us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.839us 8.16% 7.839us 1.306us 6 + aten::sub 2.67% 36.902us 4.36% 60.321us 10.054us 7.073us 7.36% 7.073us 1.179us 6 + aten::add 2.24% 30.942us 3.77% 52.122us 8.687us 7.072us 7.36% 7.072us 1.179us 6 + Activity Buffer Request 15.48% 213.995us 15.48% 213.995us 213.995us 1.312us 1.37% 1.312us 1.312us 1 + aten::empty_strided 2.17% 30.062us 2.17% 30.062us 5.010us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 17.28% 238.866us 17.28% 238.866us 39.811us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.64% 64.092us 5.92% 81.901us 3.413us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 1.29% 17.809us 1.29% 17.809us 0.742us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 15.04% 208.011us 15.04% 208.011us 4.334us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.39% 5.351us 0.39% 5.351us 5.351us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.401ms -Self CUDA time total: 96.063us +Self CPU time total: 1.383ms +Self CUDA time total: 96.096us @@ -4436,27 +4436,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S128_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.008ms 966.66% 1.008ms 1.008ms 1 - torch_eager 19.87% 287.726us 99.59% 1.442ms 1.442ms 0.000us 0.00% 105.564us 105.564us 1 - aten::mul 12.48% 180.745us 21.62% 312.998us 13.042us 55.327us 53.07% 55.327us 2.305us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 55.327us 53.07% 55.327us 2.305us 24 - aten::copy_ 7.05% 102.025us 39.57% 572.964us 31.831us 32.416us 31.09% 33.727us 1.874us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.737us 23.73% 24.737us 2.061us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.510us 15.84% 16.510us 1.376us 12 - aten::clone 1.40% 20.271us 33.26% 481.521us 80.254us 0.000us 0.00% 8.990us 1.498us 6 - aten::sub 2.64% 38.220us 4.70% 68.062us 11.344us 8.351us 8.01% 8.351us 1.392us 6 - aten::add 2.49% 36.083us 4.34% 62.773us 10.462us 8.159us 7.83% 8.159us 1.360us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.679us 7.37% 7.679us 1.280us 6 - Activity Buffer Request 13.36% 193.394us 13.36% 193.394us 193.394us 1.311us 1.26% 1.311us 1.311us 1 - aten::empty_strided 2.11% 30.520us 2.11% 30.520us 5.087us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 14.10% 204.105us 14.10% 204.105us 34.017us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.72% 68.322us 5.99% 86.681us 3.612us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 1.27% 18.359us 1.27% 18.359us 0.765us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 18.11% 262.225us 18.11% 262.225us 5.463us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.41% 5.880us 0.41% 5.880us 5.880us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 884.827us 849.48% 884.827us 884.827us 1 + torch_eager 19.57% 271.768us 99.62% 1.383ms 1.383ms 0.000us 0.00% 105.505us 105.505us 1 + aten::mul 10.84% 150.541us 18.55% 257.594us 10.733us 55.295us 53.09% 55.295us 2.304us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 55.295us 53.09% 55.295us 2.304us 24 + aten::copy_ 7.04% 97.750us 44.26% 614.622us 34.146us 32.417us 31.12% 33.761us 1.876us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.609us 23.63% 24.609us 2.051us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.449us 15.79% 16.449us 1.371us 12 + aten::clone 1.38% 19.214us 38.80% 538.794us 89.799us 0.000us 0.00% 9.152us 1.525us 6 + aten::sub 2.56% 35.499us 4.18% 58.050us 9.675us 8.289us 7.96% 8.289us 1.381us 6 + aten::add 2.19% 30.379us 3.71% 51.460us 8.577us 8.160us 7.83% 8.160us 1.360us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.808us 7.50% 7.808us 1.301us 6 + Activity Buffer Request 16.03% 222.545us 16.03% 222.545us 222.545us 1.344us 1.29% 1.344us 1.344us 1 + aten::empty_strided 2.12% 29.421us 2.12% 29.421us 4.904us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 16.95% 235.404us 16.95% 235.404us 39.234us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.56% 63.330us 5.86% 81.351us 3.390us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 1.30% 18.021us 1.30% 18.021us 0.751us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 15.09% 209.608us 15.09% 209.608us 4.367us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.38% 5.250us 0.38% 5.250us 5.250us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.448ms -Self CUDA time total: 104.253us +Self CPU time total: 1.389ms +Self CUDA time total: 104.161us @@ -4466,27 +4466,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S128_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.006ms 813.90% 1.006ms 1.006ms 1 - torch_eager 9.94% 295.168us 99.80% 2.964ms 2.964ms 0.000us 0.00% 125.309us 125.309us 1 - aten::mul 5.76% 171.113us 10.09% 299.797us 12.492us 65.183us 52.76% 65.183us 2.716us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 65.183us 52.76% 65.183us 2.716us 24 - aten::copy_ 3.36% 99.933us 70.61% 2.097ms 116.499us 39.102us 31.65% 40.862us 2.270us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.702us 23.23% 28.702us 2.392us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.264us 15.59% 19.264us 1.605us 12 - aten::clone 0.72% 21.400us 67.60% 2.008ms 334.638us 0.000us 0.00% 12.160us 2.027us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.400us 8.42% 10.400us 1.733us 6 - aten::sub 1.36% 40.342us 2.40% 71.402us 11.900us 9.664us 7.82% 9.664us 1.611us 6 - aten::add 1.19% 35.292us 2.11% 62.642us 10.440us 9.600us 7.77% 9.600us 1.600us 6 - Activity Buffer Request 58.22% 1.729ms 58.22% 1.729ms 1.729ms 1.760us 1.42% 1.760us 1.760us 1 - aten::empty_strided 1.03% 30.470us 1.03% 30.470us 5.078us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 6.55% 194.565us 6.55% 194.565us 32.427us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.28% 67.682us 2.90% 86.242us 3.593us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.62% 18.560us 0.62% 18.560us 0.773us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 8.77% 260.395us 8.77% 260.395us 5.425us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.20% 5.880us 0.20% 5.880us 5.880us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 919.709us 742.30% 919.709us 919.709us 1 + torch_eager 9.60% 287.984us 99.81% 2.995ms 2.995ms 0.000us 0.00% 125.756us 125.756us 1 + aten::mul 5.08% 152.565us 8.72% 261.687us 10.904us 65.119us 52.56% 65.119us 2.713us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 65.119us 52.56% 65.119us 2.713us 24 + aten::copy_ 3.44% 103.340us 73.23% 2.197ms 122.068us 39.390us 31.79% 41.246us 2.291us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.863us 23.30% 28.863us 2.405us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.391us 15.65% 19.391us 1.616us 12 + aten::clone 0.71% 21.441us 70.60% 2.118ms 353.061us 0.000us 0.00% 12.383us 2.064us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.527us 8.50% 10.527us 1.754us 6 + aten::add 1.05% 31.401us 1.75% 52.512us 8.752us 9.728us 7.85% 9.728us 1.621us 6 + aten::sub 1.23% 37.000us 2.00% 60.082us 10.014us 9.663us 7.80% 9.663us 1.611us 6 + Activity Buffer Request 59.85% 1.796ms 59.85% 1.796ms 1.796ms 1.856us 1.50% 1.856us 1.856us 1 + aten::empty_strided 1.02% 30.521us 1.02% 30.521us 5.087us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 7.96% 238.696us 7.96% 238.696us 39.783us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.18% 65.422us 2.78% 83.321us 3.472us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.60% 17.899us 0.60% 17.899us 0.746us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 7.09% 212.797us 7.09% 212.797us 4.433us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.19% 5.701us 0.19% 5.701us 5.701us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.970ms -Self CUDA time total: 123.549us +Self CPU time total: 3.000ms +Self CUDA time total: 123.900us @@ -4496,27 +4496,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S512_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 983.089us 943.25% 983.089us 983.089us 1 - torch_eager 19.98% 288.033us 99.62% 1.436ms 1.436ms 0.000us 0.00% 105.536us 105.536us 1 - aten::mul 11.74% 169.216us 20.40% 294.119us 12.255us 55.424us 53.18% 55.424us 2.309us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 55.424us 53.18% 55.424us 2.309us 24 - aten::copy_ 7.16% 103.226us 41.05% 591.856us 32.881us 32.352us 31.04% 33.664us 1.870us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.640us 23.64% 24.640us 2.053us 12 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 877.937us 842.38% 877.937us 877.937us 1 + torch_eager 19.95% 270.846us 99.61% 1.353ms 1.353ms 0.000us 0.00% 105.532us 105.532us 1 + aten::mul 11.07% 150.304us 18.76% 254.767us 10.615us 55.358us 53.12% 55.358us 2.307us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 55.358us 53.12% 55.358us 2.307us 24 + aten::copy_ 7.05% 95.673us 43.37% 588.992us 32.722us 32.415us 31.10% 33.726us 1.874us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.672us 23.67% 24.672us 2.056us 12 void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.448us 15.78% 16.448us 1.371us 12 - aten::clone 1.52% 21.981us 35.14% 506.672us 84.445us 0.000us 0.00% 9.024us 1.504us 6 - aten::sub 2.73% 39.361us 4.47% 64.512us 10.752us 8.289us 7.95% 8.289us 1.381us 6 - aten::add 2.40% 34.591us 4.34% 62.531us 10.422us 8.159us 7.83% 8.159us 1.360us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.712us 7.40% 7.712us 1.285us 6 - Activity Buffer Request 15.86% 228.735us 15.86% 228.735us 228.735us 1.312us 1.26% 1.312us 1.312us 1 - aten::empty_strided 2.19% 31.580us 2.19% 31.580us 5.263us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 13.29% 191.574us 13.29% 191.574us 31.929us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.47% 64.452us 5.67% 81.763us 3.407us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 1.20% 17.311us 1.20% 17.311us 0.721us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 17.08% 246.315us 17.08% 246.315us 5.132us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.38% 5.431us 0.38% 5.431us 5.431us 0.000us 0.00% 0.000us 0.000us 1 + aten::clone 1.39% 18.922us 38.11% 517.523us 86.254us 0.000us 0.00% 9.054us 1.509us 6 + aten::sub 2.65% 35.970us 4.36% 59.161us 9.860us 8.288us 7.95% 8.288us 1.381us 6 + aten::add 2.30% 31.190us 3.85% 52.221us 8.703us 8.160us 7.83% 8.160us 1.360us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.743us 7.43% 7.743us 1.290us 6 + Activity Buffer Request 14.68% 199.344us 14.68% 199.344us 199.344us 1.311us 1.26% 1.311us 1.311us 1 + aten::empty_strided 2.13% 28.940us 2.13% 28.940us 4.823us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 17.49% 237.454us 17.49% 237.454us 39.576us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.56% 61.882us 5.80% 78.761us 3.282us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 1.24% 16.879us 1.24% 16.879us 0.703us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 15.11% 205.206us 15.11% 205.206us 4.275us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.39% 5.300us 0.39% 5.300us 5.300us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.442ms -Self CUDA time total: 104.224us +Self CPU time total: 1.358ms +Self CUDA time total: 104.221us @@ -4526,27 +4526,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S512_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.004ms 809.35% 1.004ms 1.004ms 1 - torch_eager 20.77% 293.259us 99.59% 1.406ms 1.406ms 0.000us 0.00% 125.785us 125.785us 1 - aten::mul 11.90% 168.115us 21.00% 296.496us 12.354us 65.470us 52.80% 65.470us 2.728us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 65.470us 52.80% 65.470us 2.728us 24 - aten::copy_ 7.29% 102.893us 38.63% 545.534us 30.307us 39.326us 31.72% 41.117us 2.284us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.831us 23.25% 28.831us 2.403us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.198us 15.48% 19.198us 1.600us 12 - aten::clone 1.46% 20.620us 31.97% 451.491us 75.249us 0.000us 0.00% 12.286us 2.048us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.495us 8.46% 10.495us 1.749us 6 - aten::add 2.69% 38.013us 4.51% 63.752us 10.625us 9.599us 7.74% 9.599us 1.600us 6 - aten::sub 3.27% 46.223us 5.18% 73.182us 12.197us 9.599us 7.74% 9.599us 1.600us 6 - Activity Buffer Request 12.70% 179.315us 12.70% 179.315us 179.315us 1.791us 1.44% 1.791us 1.791us 1 - aten::empty_strided 2.12% 29.941us 2.12% 29.941us 4.990us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 13.35% 188.535us 13.35% 188.535us 31.423us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.72% 66.700us 5.92% 83.651us 3.485us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 1.20% 16.951us 1.20% 16.951us 0.706us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 18.12% 255.870us 18.12% 255.870us 5.331us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.41% 5.720us 0.41% 5.720us 5.720us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 922.549us 744.77% 922.549us 922.549us 1 + torch_eager 21.60% 303.651us 99.57% 1.400ms 1.400ms 0.000us 0.00% 125.727us 125.727us 1 + aten::mul 10.95% 153.925us 18.62% 261.836us 10.910us 65.152us 52.60% 65.152us 2.715us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 65.152us 52.60% 65.152us 2.715us 24 + aten::copy_ 6.95% 97.683us 42.26% 594.153us 33.008us 39.455us 31.85% 41.311us 2.295us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.863us 23.30% 28.863us 2.405us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.264us 15.55% 19.264us 1.605us 12 + aten::clone 1.38% 19.461us 36.95% 519.462us 86.577us 0.000us 0.00% 12.448us 2.075us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.592us 8.55% 10.592us 1.765us 6 + aten::add 2.18% 30.620us 3.65% 51.250us 8.542us 9.664us 7.80% 9.664us 1.611us 6 + aten::sub 2.57% 36.162us 4.29% 60.263us 10.044us 9.600us 7.75% 9.600us 1.600us 6 + Activity Buffer Request 14.58% 204.944us 14.58% 204.944us 204.944us 1.856us 1.50% 1.856us 1.856us 1 + aten::empty_strided 2.07% 29.171us 2.07% 29.171us 4.862us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 16.64% 233.894us 16.64% 233.894us 38.982us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.45% 62.564us 5.70% 80.065us 3.336us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 1.24% 17.501us 1.24% 17.501us 0.729us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 14.96% 210.274us 14.96% 210.274us 4.381us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.43% 5.990us 0.43% 5.990us 5.990us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.412ms -Self CUDA time total: 123.994us +Self CPU time total: 1.406ms +Self CUDA time total: 123.871us @@ -4556,27 +4556,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S512_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.022ms 574.36% 1.022ms 1.022ms 1 - torch_eager 9.88% 300.789us 99.79% 3.040ms 3.040ms 0.000us 0.00% 180.741us 180.741us 1 - aten::mul 5.87% 178.699us 10.24% 311.746us 12.989us 95.331us 53.59% 95.331us 3.972us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 95.331us 53.59% 95.331us 3.972us 24 - aten::copy_ 3.42% 104.263us 70.83% 2.158ms 119.861us 57.731us 32.45% 60.579us 3.366us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.803us 22.94% 40.803us 3.400us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 24.831us 13.96% 24.831us 2.069us 12 - aten::clone 0.72% 21.809us 67.85% 2.067ms 344.453us 0.000us 0.00% 19.776us 3.296us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.928us 9.52% 16.928us 2.821us 6 - aten::add 1.17% 35.582us 2.06% 62.693us 10.449us 12.480us 7.02% 12.480us 2.080us 6 - aten::sub 1.34% 40.961us 2.23% 67.853us 11.309us 12.351us 6.94% 12.351us 2.059us 6 - Activity Buffer Request 58.86% 1.793ms 58.86% 1.793ms 1.793ms 2.848us 1.60% 2.848us 2.848us 1 - aten::empty_strided 1.00% 30.461us 1.00% 30.461us 5.077us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 6.14% 187.135us 6.14% 187.135us 31.189us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.22% 67.750us 2.85% 86.690us 3.612us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.62% 18.940us 0.62% 18.940us 0.789us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 8.55% 260.511us 8.55% 260.511us 5.427us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.21% 6.321us 0.21% 6.321us 6.321us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 897.539us 506.00% 897.539us 897.539us 1 + torch_eager 9.74% 279.839us 99.80% 2.869ms 2.869ms 0.000us 0.00% 180.226us 180.226us 1 + aten::mul 5.31% 152.543us 9.01% 258.914us 10.788us 94.688us 53.38% 94.688us 3.945us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 94.688us 53.38% 94.688us 3.945us 24 + aten::copy_ 3.38% 97.055us 72.52% 2.084ms 115.804us 57.633us 32.49% 60.481us 3.360us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.641us 22.91% 40.641us 3.387us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 25.057us 14.13% 25.057us 2.088us 12 + aten::clone 0.77% 22.210us 70.07% 2.014ms 335.666us 0.000us 0.00% 19.840us 3.307us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.992us 9.58% 16.992us 2.832us 6 + aten::add 1.09% 31.359us 1.84% 52.920us 8.820us 12.577us 7.09% 12.577us 2.096us 6 + aten::sub 1.28% 36.869us 2.08% 59.850us 9.975us 12.480us 7.04% 12.480us 2.080us 6 + Activity Buffer Request 59.04% 1.697ms 59.04% 1.697ms 1.697ms 2.848us 1.61% 2.848us 2.848us 1 + aten::empty_strided 1.03% 29.731us 1.03% 29.731us 4.955us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 8.09% 232.664us 8.09% 232.664us 38.777us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.21% 63.552us 2.81% 80.641us 3.360us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.59% 17.089us 0.59% 17.089us 0.712us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 7.26% 208.724us 7.26% 208.724us 4.348us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.20% 5.620us 0.20% 5.620us 5.620us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 3.046ms -Self CUDA time total: 177.893us +Self CPU time total: 2.874ms +Self CUDA time total: 177.378us @@ -4586,27 +4586,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S512_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.032ms 346.83% 1.032ms 1.032ms 1 - torch_eager 10.94% 330.454us 99.81% 3.014ms 3.014ms 0.000us 0.00% 315.361us 315.361us 1 - aten::mul 5.75% 173.643us 10.12% 305.657us 12.736us 145.696us 48.95% 145.696us 6.071us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 145.696us 48.95% 145.696us 6.071us 24 - aten::copy_ 3.49% 105.289us 69.61% 2.102ms 116.764us 110.818us 37.23% 128.546us 7.141us 18 - aten::clone 0.91% 27.529us 66.74% 2.015ms 335.845us 0.000us 0.00% 71.233us 11.872us 6 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 57.313us 19.26% 57.313us 4.776us 12 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 53.505us 17.98% 53.505us 8.918us 6 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 41.119us 13.82% 41.119us 3.427us 12 - aten::sub 1.33% 40.040us 2.29% 68.991us 11.499us 20.607us 6.92% 20.607us 3.434us 6 - aten::add 1.18% 35.771us 2.04% 61.541us 10.257us 20.512us 6.89% 20.512us 3.419us 6 - Activity Buffer Request 56.83% 1.716ms 56.83% 1.716ms 1.716ms 17.728us 5.96% 17.728us 17.728us 1 - aten::empty_strided 1.04% 31.352us 1.04% 31.352us 5.225us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 6.87% 207.436us 6.87% 207.436us 34.573us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.27% 68.592us 2.86% 86.271us 3.595us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.59% 17.679us 0.59% 17.679us 0.737us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 8.61% 259.847us 8.61% 259.847us 5.413us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.19% 5.690us 0.19% 5.690us 5.690us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 898.810us 301.67% 898.810us 898.810us 1 + torch_eager 9.55% 282.366us 99.82% 2.951ms 2.951ms 0.000us 0.00% 315.933us 315.933us 1 + aten::mul 5.12% 151.293us 8.70% 257.314us 10.721us 145.309us 48.77% 145.309us 6.055us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 145.309us 48.77% 145.309us 6.055us 24 + aten::copy_ 3.22% 95.316us 73.31% 2.167ms 120.400us 111.488us 37.42% 129.472us 7.193us 18 + aten::clone 0.72% 21.328us 70.93% 2.097ms 349.496us 0.000us 0.00% 72.192us 12.032us 6 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 57.280us 19.22% 57.280us 4.773us 12 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 54.208us 18.19% 54.208us 9.035us 6 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 41.152us 13.81% 41.152us 3.429us 12 + aten::sub 1.21% 35.903us 1.98% 58.432us 9.739us 20.607us 6.92% 20.607us 3.434us 6 + aten::add 1.05% 31.001us 1.75% 51.651us 8.608us 20.545us 6.90% 20.545us 3.424us 6 + Activity Buffer Request 60.43% 1.786ms 60.43% 1.786ms 1.786ms 17.984us 6.04% 17.984us 17.984us 1 + aten::empty_strided 1.00% 29.601us 1.00% 29.601us 4.934us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 7.72% 228.244us 7.72% 228.244us 38.041us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.20% 64.933us 2.81% 83.072us 3.461us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.61% 18.139us 0.61% 18.139us 0.756us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 6.98% 206.421us 6.98% 206.421us 4.300us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.18% 5.270us 0.18% 5.270us 5.270us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 3.019ms -Self CUDA time total: 297.633us +Self CPU time total: 2.956ms +Self CUDA time total: 297.949us @@ -4616,27 +4616,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S2048_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 997.224us 560.81% 997.224us 997.224us 1 - torch_eager 20.41% 294.666us 99.60% 1.438ms 1.438ms 0.000us 0.00% 180.699us 180.699us 1 - aten::mul 11.81% 170.426us 20.82% 300.510us 12.521us 95.263us 53.57% 95.263us 3.969us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 95.263us 53.57% 95.263us 3.969us 24 - aten::copy_ 6.98% 100.734us 40.22% 580.546us 32.253us 57.725us 32.46% 60.604us 3.367us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.895us 23.00% 40.895us 3.408us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 24.832us 13.96% 24.832us 2.069us 12 - aten::clone 1.48% 21.402us 34.03% 491.202us 81.867us 0.000us 0.00% 19.709us 3.285us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.830us 9.46% 16.830us 2.805us 6 - aten::sub 2.72% 39.260us 4.81% 69.410us 11.568us 12.416us 6.98% 12.416us 2.069us 6 - aten::add 2.46% 35.491us 4.17% 60.132us 10.022us 12.416us 6.98% 12.416us 2.069us 6 - Activity Buffer Request 14.76% 213.066us 14.76% 213.066us 213.066us 2.879us 1.62% 2.879us 2.879us 1 - aten::empty_strided 2.03% 29.329us 2.03% 29.329us 4.888us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 13.55% 195.534us 13.55% 195.534us 32.589us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.52% 65.271us 5.66% 81.651us 3.402us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 1.13% 16.380us 1.13% 16.380us 0.683us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 17.74% 256.087us 17.74% 256.087us 5.335us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.40% 5.801us 0.40% 5.801us 5.801us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 892.604us 503.24% 892.604us 892.604us 1 + torch_eager 19.40% 275.276us 99.62% 1.413ms 1.413ms 0.000us 0.00% 180.252us 180.252us 1 + aten::mul 10.76% 152.603us 18.14% 257.417us 10.726us 94.815us 53.46% 94.815us 3.951us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 94.815us 53.46% 94.815us 3.951us 24 + aten::copy_ 6.67% 94.615us 44.72% 634.467us 35.248us 57.598us 32.47% 60.478us 3.360us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.510us 22.84% 40.510us 3.376us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 24.959us 14.07% 24.959us 2.080us 12 + aten::clone 1.36% 19.279us 39.62% 562.091us 93.682us 0.000us 0.00% 19.968us 3.328us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.088us 9.63% 17.088us 2.848us 6 + aten::add 2.18% 30.992us 4.25% 60.342us 10.057us 12.607us 7.11% 12.607us 2.101us 6 + aten::sub 2.49% 35.261us 4.05% 57.481us 9.580us 12.352us 6.96% 12.352us 2.059us 6 + Activity Buffer Request 18.10% 256.816us 18.10% 256.816us 256.816us 2.880us 1.62% 2.880us 2.880us 1 + aten::empty_strided 2.07% 29.380us 2.07% 29.380us 4.897us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 15.91% 225.774us 15.91% 225.774us 37.629us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.38% 62.100us 5.62% 79.779us 3.324us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 1.25% 17.679us 1.25% 17.679us 0.737us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 15.06% 213.646us 15.06% 213.646us 4.451us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.38% 5.461us 0.38% 5.461us 5.461us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.443ms -Self CUDA time total: 177.820us +Self CPU time total: 1.419ms +Self CUDA time total: 177.372us @@ -4646,27 +4646,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S2048_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.006ms 337.57% 1.006ms 1.006ms 1 - torch_eager 19.59% 286.301us 99.64% 1.456ms 1.456ms 0.000us 0.00% 316.065us 316.065us 1 - aten::mul 11.95% 174.684us 20.79% 303.846us 12.660us 145.439us 48.82% 145.439us 6.060us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 145.439us 48.82% 145.439us 6.060us 24 - aten::copy_ 7.17% 104.791us 40.98% 598.785us 33.266us 111.649us 37.48% 129.826us 7.213us 18 - aten::clone 1.35% 19.700us 34.46% 503.492us 83.915us 0.000us 0.00% 72.450us 12.075us 6 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 57.376us 19.26% 57.376us 4.781us 12 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 54.273us 18.22% 54.273us 9.045us 6 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 40.800us 13.70% 40.800us 3.400us 12 - aten::sub 2.64% 38.590us 4.63% 67.691us 11.282us 20.448us 6.86% 20.448us 3.408us 6 - aten::add 2.53% 37.009us 4.42% 64.522us 10.754us 20.352us 6.83% 20.352us 3.392us 6 - Activity Buffer Request 15.77% 230.486us 15.77% 230.486us 230.486us 18.177us 6.10% 18.177us 18.177us 1 - aten::empty_strided 2.02% 29.450us 2.02% 29.450us 4.908us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 12.98% 189.676us 12.98% 189.676us 31.613us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.60% 67.290us 5.86% 85.691us 3.570us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 1.26% 18.401us 1.26% 18.401us 0.767us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 17.77% 259.608us 17.77% 259.608us 5.409us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.36% 5.300us 0.36% 5.300us 5.300us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 993.373us 332.44% 993.373us 993.373us 1 + torch_eager 19.67% 287.571us 99.61% 1.456ms 1.456ms 0.000us 0.00% 317.118us 317.118us 1 + aten::mul 10.73% 156.833us 18.24% 266.665us 11.111us 145.536us 48.70% 145.536us 6.064us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 145.536us 48.70% 145.536us 6.064us 24 + aten::copy_ 6.97% 101.849us 44.88% 655.991us 36.444us 112.414us 37.62% 130.718us 7.262us 18 + aten::clone 1.31% 19.190us 34.67% 506.790us 84.465us 0.000us 0.00% 73.472us 12.245us 6 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 57.246us 19.16% 57.246us 4.770us 12 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 55.168us 18.46% 55.168us 9.195us 6 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 40.864us 13.68% 40.864us 3.405us 12 + aten::sub 2.57% 37.523us 4.19% 61.312us 10.219us 20.447us 6.84% 20.447us 3.408us 6 + aten::add 2.11% 30.781us 3.53% 51.651us 8.608us 20.417us 6.83% 20.417us 3.403us 6 + Activity Buffer Request 13.85% 202.395us 13.85% 202.395us 202.395us 18.304us 6.13% 18.304us 18.304us 1 + aten::empty_strided 2.07% 30.250us 2.07% 30.250us 5.042us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 15.36% 224.584us 15.36% 224.584us 37.431us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.45% 65.071us 5.71% 83.402us 3.475us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 1.25% 18.331us 1.25% 18.331us 0.764us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 19.27% 281.654us 19.27% 281.654us 5.868us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.39% 5.741us 0.39% 5.741us 5.741us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.461ms -Self CUDA time total: 297.888us +Self CPU time total: 1.462ms +Self CUDA time total: 298.814us @@ -4676,27 +4676,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S2048_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.028ms 173.98% 1.028ms 1.028ms 1 - torch_eager 20.54% 307.941us 99.60% 1.493ms 1.493ms 0.000us 0.00% 614.720us 614.720us 1 - aten::copy_ 6.96% 104.343us 39.80% 596.653us 33.147us 277.825us 47.01% 301.537us 16.752us 18 - aten::mul 11.79% 176.823us 20.94% 313.949us 13.081us 247.103us 41.81% 247.103us 10.296us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 247.103us 41.81% 247.103us 10.296us 24 - aten::clone 1.37% 20.599us 33.38% 500.402us 83.400us 0.000us 0.00% 210.816us 35.136us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 187.104us 31.66% 187.104us 31.184us 6 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 90.721us 15.35% 90.721us 7.560us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 66.080us 11.18% 66.080us 5.507us 12 - aten::add 2.50% 37.532us 4.42% 66.263us 11.044us 33.056us 5.59% 33.056us 5.509us 6 - aten::sub 2.78% 41.622us 4.80% 72.031us 12.005us 33.024us 5.59% 33.024us 5.504us 6 - Activity Buffer Request 15.35% 230.085us 15.35% 230.085us 230.085us 23.712us 4.01% 23.712us 23.712us 1 - aten::empty_strided 1.95% 29.191us 1.95% 29.191us 4.865us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 12.52% 187.674us 12.52% 187.674us 31.279us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.56% 68.431us 5.78% 86.660us 3.611us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 1.22% 18.229us 1.22% 18.229us 0.760us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 18.06% 270.817us 18.06% 270.817us 5.642us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.40% 6.020us 0.40% 6.020us 6.020us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 897.247us 151.96% 897.247us 897.247us 1 + torch_eager 20.14% 274.634us 99.62% 1.358ms 1.358ms 0.000us 0.00% 614.305us 614.305us 1 + aten::copy_ 6.96% 94.863us 42.61% 580.932us 32.274us 278.975us 47.25% 302.847us 16.825us 18 + aten::mul 11.17% 152.301us 19.13% 260.794us 10.866us 245.758us 41.62% 245.758us 10.240us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 245.758us 41.62% 245.758us 10.240us 24 + aten::clone 1.47% 20.049us 37.19% 507.071us 84.512us 0.000us 0.00% 212.225us 35.371us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 188.353us 31.90% 188.353us 31.392us 6 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 90.622us 15.35% 90.622us 7.552us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 65.700us 11.13% 65.700us 5.475us 12 + aten::add 2.26% 30.880us 3.85% 52.533us 8.756us 32.930us 5.58% 32.930us 5.488us 6 + aten::sub 2.73% 37.222us 4.45% 60.643us 10.107us 32.770us 5.55% 32.770us 5.462us 6 + Activity Buffer Request 15.15% 206.524us 15.15% 206.524us 206.524us 23.872us 4.04% 23.872us 23.872us 1 + aten::empty_strided 2.13% 29.022us 2.13% 29.022us 4.837us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 16.16% 220.325us 16.16% 220.325us 36.721us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.58% 62.481us 5.83% 79.533us 3.314us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 1.25% 17.052us 1.25% 17.052us 0.711us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 15.61% 212.787us 15.61% 212.787us 4.433us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.38% 5.230us 0.38% 5.230us 5.230us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.499ms -Self CUDA time total: 591.008us +Self CPU time total: 1.363ms +Self CUDA time total: 590.433us @@ -4706,59 +4706,59 @@ PROFILE TRACE: torch_eager | cuda_B2_S2048_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 13.10% 290.757us 64.42% 1.430ms 1.430ms 0.000us 0.00% 1.858ms 1.858ms 1 - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.829ms 102.08% 1.829ms 1.829ms 1 - aten::copy_ 4.50% 99.996us 26.00% 577.207us 32.067us 806.683us 45.02% 872.922us 48.496us 18 - aten::mul 7.46% 165.620us 13.35% 296.315us 12.346us 837.369us 46.73% 837.369us 34.890us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 837.369us 46.73% 837.369us 34.890us 24 - aten::clone 0.93% 20.729us 21.91% 486.420us 81.070us 0.000us 0.00% 620.507us 103.418us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 554.268us 30.93% 554.268us 92.378us 6 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 252.415us 14.09% 252.415us 21.035us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 147.935us 8.26% 147.935us 12.328us 12 - aten::sub 1.77% 39.361us 2.98% 66.203us 11.034us 89.536us 5.00% 89.536us 14.923us 6 - Activity Buffer Request 8.80% 195.395us 8.80% 195.395us 195.395us 66.239us 3.70% 66.239us 66.239us 1 - aten::add 1.55% 34.513us 2.81% 62.423us 10.404us 58.399us 3.26% 58.399us 9.733us 6 - aten::empty_strided 1.34% 29.799us 1.34% 29.799us 4.967us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 9.39% 208.344us 9.39% 208.344us 34.724us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 3.12% 69.193us 3.90% 86.553us 3.606us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.78% 17.360us 0.78% 17.360us 0.723us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 11.66% 258.919us 11.66% 258.919us 5.394us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 35.58% 789.949us 35.58% 789.949us 789.949us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 12.28% 274.098us 60.28% 1.346ms 1.346ms 0.000us 0.00% 1.856ms 1.856ms 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.827ms 102.10% 1.827ms 1.827ms 1 + aten::copy_ 4.28% 95.634us 25.22% 562.931us 31.274us 804.864us 44.98% 871.488us 48.416us 18 + aten::mul 6.88% 153.564us 11.78% 262.937us 10.956us 837.983us 46.83% 837.983us 34.916us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 837.983us 46.83% 837.983us 34.916us 24 + aten::clone 0.88% 19.740us 22.00% 491.160us 81.860us 0.000us 0.00% 619.552us 103.259us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 552.928us 30.90% 552.928us 92.155us 6 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 251.936us 14.08% 251.936us 20.995us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 146.496us 8.19% 146.496us 12.208us 12 + aten::sub 1.66% 37.140us 2.70% 60.281us 10.047us 88.032us 4.92% 88.032us 14.672us 6 + Activity Buffer Request 8.33% 186.034us 8.33% 186.034us 186.034us 66.624us 3.72% 66.624us 66.624us 1 + aten::add 1.41% 31.471us 2.39% 53.240us 8.873us 58.464us 3.27% 58.464us 9.744us 6 + aten::empty_strided 1.36% 30.441us 1.36% 30.441us 5.074us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 10.01% 223.534us 10.01% 223.534us 37.256us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.89% 64.468us 3.67% 81.941us 3.414us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.78% 17.473us 0.78% 17.473us 0.728us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 9.50% 212.012us 9.50% 212.012us 4.417us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 39.72% 886.609us 39.72% 886.609us 886.609us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.220ms -Self CUDA time total: 1.792ms +Self CPU time total: 2.232ms +Self CUDA time total: 1.789ms impl wl p50(ms) ok -torch_eager cuda_B1_S128_H32_D128_R64 0.27 True -torch_eager cuda_B1_S128_H32_D64_R32 0.25 True -torch_eager cuda_B1_S128_H8_D128_R64 0.25 True -torch_eager cuda_B1_S128_H8_D64_R32 0.19 True -torch_eager cuda_B1_S2048_H32_D128_R64 0.25 True -torch_eager cuda_B1_S2048_H32_D64_R32 0.25 True -torch_eager cuda_B1_S2048_H8_D128_R64 0.24 True -torch_eager cuda_B1_S2048_H8_D64_R32 0.25 True -torch_eager cuda_B1_S512_H32_D128_R64 0.24 True -torch_eager cuda_B1_S512_H32_D64_R32 0.24 True -torch_eager cuda_B1_S512_H8_D128_R64 0.24 True -torch_eager cuda_B1_S512_H8_D64_R32 0.25 True -torch_eager cuda_B2_S128_H32_D128_R64 0.24 True -torch_eager cuda_B2_S128_H32_D64_R32 0.25 True -torch_eager cuda_B2_S128_H8_D128_R64 0.25 True -torch_eager cuda_B2_S128_H8_D64_R32 0.24 True -torch_eager cuda_B2_S2048_H32_D128_R64 0.65 True -torch_eager cuda_B2_S2048_H32_D64_R32 0.25 True -torch_eager cuda_B2_S2048_H8_D128_R64 0.25 True -torch_eager cuda_B2_S2048_H8_D64_R32 0.24 True -torch_eager cuda_B2_S512_H32_D128_R64 0.25 True -torch_eager cuda_B2_S512_H32_D64_R32 0.24 True -torch_eager cuda_B2_S512_H8_D128_R64 0.24 True -torch_eager cuda_B2_S512_H8_D64_R32 0.24 True +torch_eager cuda_B1_S128_H32_D128_R64 0.22 True +torch_eager cuda_B1_S128_H32_D64_R32 0.22 True +torch_eager cuda_B1_S128_H8_D128_R64 0.22 True +torch_eager cuda_B1_S128_H8_D64_R32 0.17 True +torch_eager cuda_B1_S2048_H32_D128_R64 0.23 True +torch_eager cuda_B1_S2048_H32_D64_R32 0.22 True +torch_eager cuda_B1_S2048_H8_D128_R64 0.22 True +torch_eager cuda_B1_S2048_H8_D64_R32 0.22 True +torch_eager cuda_B1_S512_H32_D128_R64 0.22 True +torch_eager cuda_B1_S512_H32_D64_R32 0.21 True +torch_eager cuda_B1_S512_H8_D128_R64 0.21 True +torch_eager cuda_B1_S512_H8_D64_R32 0.22 True +torch_eager cuda_B2_S128_H32_D128_R64 0.21 True +torch_eager cuda_B2_S128_H32_D64_R32 0.21 True +torch_eager cuda_B2_S128_H8_D128_R64 0.22 True +torch_eager cuda_B2_S128_H8_D64_R32 0.21 True +torch_eager cuda_B2_S2048_H32_D128_R64 0.64 True +torch_eager cuda_B2_S2048_H32_D64_R32 0.23 True +torch_eager cuda_B2_S2048_H8_D128_R64 0.22 True +torch_eager cuda_B2_S2048_H8_D64_R32 0.22 True +torch_eager cuda_B2_S512_H32_D128_R64 0.22 True +torch_eager cuda_B2_S512_H32_D64_R32 0.22 True +torch_eager cuda_B2_S512_H8_D128_R64 0.21 True +torch_eager cuda_B2_S512_H8_D64_R32 0.21 True▶ UV Install Logs