diff --git "a/rotary/impls/torch_rotary.html" "b/rotary/impls/torch_rotary.html" --- "a/rotary/impls/torch_rotary.html" +++ "b/rotary/impls/torch_rotary.html" @@ -809,6 +809,14 @@ .artifact-preview svg { background: transparent; } + /* Invert SVG images in dark mode */ + :root[data-theme="dark"] .artifact-preview img[src$=".svg"] { + filter: invert(0.9) hue-rotate(180deg); + } + /* Keep SVG images readable in monocolor mode */ + :root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] { + filter: none; + } /* CSV table styling */ .artifact-csv { margin-top: 1rem; @@ -3871,7 +3879,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: ▼ output ▶ uv-logs | -Cell: nv | 0.20s +Cell: nv | 0.23s | Raw @@ -3887,7 +3895,7 @@ Cell: nv | 0.20s
Wed Oct 29 14:26:51 2025 ++Wed Oct 29 15:50:24 2025 +-----------------------------------------------------------------------------------------+ | NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 | |-----------------------------------------+------------------------+----------------------+ @@ -3896,7 +3904,7 @@ Cell: nv | 0.20s | | | MIG M. | |=========================================+========================+======================| | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 | -| N/A 32C P0 76W / 350W | 0MiB / 46068MiB | 11% Default | +| N/A 29C P0 88W / 350W | 0MiB / 46068MiB | 0% Default | | | | N/A | +-----------------------------------------+------------------------+----------------------+ @@ -3918,9 +3926,9 @@ Cell: nv | 0.20s ▼ code ▼ output - ▶ uv-logs + ▶ uv-logs | -Cell: benchmark | 3.84s +Cell: benchmark | 7.50s | Raw @@ -3999,27 +4007,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S128_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.124ms 1261.56% 1.124ms 1.124ms 1 - torch_eager 14.73% 412.767us 99.72% 2.794ms 2.794ms 0.000us 0.00% 90.337us 90.337us 1 - aten::mul 6.25% 175.043us 11.07% 310.105us 12.921us 46.912us 52.64% 46.912us 1.955us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 46.912us 52.64% 46.912us 1.955us 24 - aten::copy_ 4.12% 115.463us 61.76% 1.730ms 96.132us 28.993us 32.53% 30.210us 1.678us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.368us 25.10% 22.368us 1.864us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.215us 14.83% 13.215us 1.101us 12 - aten::clone 1.31% 36.692us 59.66% 1.671ms 278.565us 0.000us 0.00% 7.842us 1.307us 6 - aten::sub 1.68% 47.063us 2.72% 76.213us 12.702us 6.655us 7.47% 6.655us 1.109us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.625us 7.43% 6.625us 1.104us 6 - aten::add 1.39% 39.044us 2.34% 65.583us 10.930us 6.560us 7.36% 6.560us 1.093us 6 - Activity Buffer Request 52.45% 1.470ms 52.45% 1.470ms 1.470ms 1.217us 1.37% 1.217us 1.217us 1 - aten::empty_strided 1.99% 55.621us 1.99% 55.621us 9.270us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 2.66% 74.431us 2.66% 74.431us 12.405us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.98% 83.492us 3.80% 106.494us 4.437us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.82% 23.002us 0.82% 23.002us 0.958us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 9.34% 261.675us 9.34% 261.675us 5.452us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.28% 7.890us 0.28% 7.890us 7.890us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.033ms 1157.58% 1.033ms 1.033ms 1 + torch_eager 14.26% 386.998us 99.70% 2.705ms 2.705ms 0.000us 0.00% 90.431us 90.431us 1 + aten::mul 6.08% 164.867us 10.45% 283.577us 11.816us 46.976us 52.65% 46.976us 1.957us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 46.976us 52.65% 46.976us 1.957us 24 + aten::copy_ 3.96% 107.533us 62.14% 1.686ms 93.665us 28.959us 32.46% 30.175us 1.676us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.303us 25.00% 22.303us 1.859us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.280us 14.89% 13.280us 1.107us 12 + aten::clone 1.58% 42.971us 61.19% 1.660ms 276.703us 0.000us 0.00% 7.872us 1.312us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.656us 7.46% 6.656us 1.109us 6 + aten::sub 1.73% 46.871us 2.69% 72.911us 12.152us 6.656us 7.46% 6.656us 1.109us 6 + aten::add 1.35% 36.531us 2.16% 58.672us 9.779us 6.624us 7.42% 6.624us 1.104us 6 + Activity Buffer Request 53.14% 1.442ms 53.14% 1.442ms 1.442ms 1.216us 1.36% 1.216us 1.216us 1 + aten::empty_strided 2.28% 61.772us 2.28% 61.772us 10.295us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 2.73% 74.144us 2.73% 74.144us 12.357us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.20% 86.920us 4.13% 112.081us 4.670us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.93% 25.161us 0.93% 25.161us 1.048us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 8.45% 229.371us 8.45% 229.371us 4.779us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.30% 8.270us 0.30% 8.270us 8.270us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.802ms -Self CUDA time total: 89.120us +Self CPU time total: 2.713ms +Self CUDA time total: 89.215us @@ -4029,27 +4037,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S128_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 968.092us 1071.28% 968.092us 968.092us 1 - torch_eager 12.50% 317.076us 99.79% 2.532ms 2.532ms 0.000us 0.00% 91.488us 91.488us 1 - aten::mul 6.07% 153.959us 10.35% 262.528us 10.939us 47.648us 52.73% 47.648us 1.985us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 47.648us 52.73% 47.648us 1.985us 24 - aten::copy_ 4.16% 105.603us 65.14% 1.653ms 91.828us 29.344us 32.47% 30.464us 1.692us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.592us 25.00% 22.592us 1.883us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.376us 14.80% 13.376us 1.115us 12 - aten::clone 1.12% 28.391us 62.74% 1.592ms 265.351us 0.000us 0.00% 7.872us 1.312us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.752us 7.47% 6.752us 1.125us 6 - aten::sub 1.55% 39.261us 2.49% 63.132us 10.522us 6.688us 7.40% 6.688us 1.115us 6 - aten::add 1.47% 37.180us 2.35% 59.741us 9.957us 6.688us 7.40% 6.688us 1.115us 6 - Activity Buffer Request 56.17% 1.425ms 56.17% 1.425ms 1.425ms 1.120us 1.24% 1.120us 1.120us 1 - aten::empty_strided 2.04% 51.662us 2.04% 51.662us 8.610us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 2.12% 53.792us 2.12% 53.792us 8.965us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 3.04% 77.153us 3.82% 96.932us 4.039us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.78% 19.779us 0.78% 19.779us 0.824us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 8.79% 223.101us 8.79% 223.101us 4.648us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.21% 5.210us 0.21% 5.210us 5.210us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 920.346us 1019.88% 920.346us 920.346us 1 + torch_eager 11.67% 287.669us 99.75% 2.459ms 2.459ms 0.000us 0.00% 91.392us 91.392us 1 + aten::mul 5.97% 147.150us 10.47% 258.131us 10.755us 47.681us 52.84% 47.681us 1.987us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 47.681us 52.84% 47.681us 1.987us 24 + aten::copy_ 4.01% 98.743us 66.94% 1.650ms 91.665us 29.184us 32.34% 30.335us 1.685us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.433us 24.86% 22.433us 1.869us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.376us 14.82% 13.376us 1.115us 12 + aten::clone 0.96% 23.772us 64.13% 1.581ms 263.446us 0.000us 0.00% 7.902us 1.317us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.751us 7.48% 6.751us 1.125us 6 + aten::sub 1.51% 37.314us 2.51% 61.954us 10.326us 6.720us 7.45% 6.720us 1.120us 6 + aten::add 1.33% 32.821us 2.21% 54.451us 9.075us 6.656us 7.38% 6.656us 1.109us 6 + Activity Buffer Request 58.20% 1.434ms 58.20% 1.434ms 1.434ms 1.151us 1.28% 1.151us 1.151us 1 + aten::empty_strided 1.33% 32.830us 1.33% 32.830us 5.472us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 2.21% 54.420us 2.21% 54.420us 9.070us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.84% 69.900us 3.65% 89.853us 3.744us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.81% 19.953us 0.81% 19.953us 0.831us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 8.92% 219.731us 8.92% 219.731us 4.578us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.25% 6.050us 0.25% 6.050us 6.050us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.538ms -Self CUDA time total: 90.368us +Self CPU time total: 2.465ms +Self CUDA time total: 90.241us @@ -4059,27 +4067,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S128_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.007ms 1071.77% 1.007ms 1.007ms 1 - torch_eager 12.81% 333.813us 99.77% 2.600ms 2.600ms 0.000us 0.00% 95.234us 95.234us 1 - aten::mul 6.17% 160.752us 10.75% 280.063us 11.669us 48.706us 51.86% 48.706us 2.029us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 48.706us 51.86% 48.706us 2.029us 24 - aten::copy_ 4.30% 112.081us 64.85% 1.690ms 93.891us 30.753us 32.74% 32.065us 1.781us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 23.009us 24.50% 23.009us 1.917us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 14.463us 15.40% 14.463us 1.205us 12 - aten::clone 1.08% 28.070us 62.18% 1.621ms 270.093us 0.000us 0.00% 9.056us 1.509us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.744us 8.25% 7.744us 1.291us 6 - aten::sub 1.50% 39.201us 2.50% 65.063us 10.844us 7.263us 7.73% 7.263us 1.211us 6 - aten::add 1.40% 36.592us 2.30% 59.882us 9.980us 7.200us 7.67% 7.200us 1.200us 6 - Activity Buffer Request 55.61% 1.449ms 55.61% 1.449ms 1.449ms 1.312us 1.40% 1.312us 1.312us 1 - aten::empty_strided 1.87% 48.773us 1.87% 48.773us 8.129us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 2.21% 57.593us 2.21% 57.593us 9.599us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.85% 74.230us 3.62% 94.450us 3.935us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.78% 20.220us 0.78% 20.220us 0.842us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 9.19% 239.464us 9.19% 239.464us 4.989us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.23% 5.970us 0.23% 5.970us 5.970us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 909.568us 966.47% 909.568us 909.568us 1 + torch_eager 11.23% 276.876us 99.79% 2.460ms 2.460ms 0.000us 0.00% 95.424us 95.424us 1 + aten::mul 6.27% 154.461us 10.66% 262.794us 10.950us 48.800us 51.85% 48.800us 2.033us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 48.800us 51.85% 48.800us 2.033us 24 + aten::copy_ 4.02% 99.094us 67.67% 1.668ms 92.677us 30.912us 32.85% 32.224us 1.790us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 23.008us 24.45% 23.008us 1.917us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 14.400us 15.30% 14.400us 1.200us 12 + aten::clone 0.93% 22.950us 64.64% 1.593ms 265.583us 0.000us 0.00% 9.216us 1.536us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.904us 8.40% 7.904us 1.317us 6 + aten::sub 1.56% 38.564us 2.52% 62.034us 10.339us 7.200us 7.65% 7.200us 1.200us 6 + aten::add 1.24% 30.660us 2.12% 52.250us 8.708us 7.200us 7.65% 7.200us 1.200us 6 + Activity Buffer Request 58.87% 1.451ms 58.87% 1.451ms 1.451ms 1.312us 1.39% 1.312us 1.312us 1 + aten::empty_strided 1.24% 30.531us 1.24% 30.531us 5.089us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 2.20% 54.240us 2.20% 54.240us 9.040us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.65% 65.401us 3.42% 84.323us 3.513us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.77% 18.922us 0.77% 18.922us 0.788us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 8.80% 216.993us 8.80% 216.993us 4.521us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.21% 5.190us 0.21% 5.190us 5.190us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.606ms -Self CUDA time total: 93.922us +Self CPU time total: 2.465ms +Self CUDA time total: 94.112us @@ -4089,27 +4097,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S128_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 976.889us 967.02% 976.889us 976.889us 1 - torch_eager 12.01% 329.416us 99.82% 2.739ms 2.739ms 0.000us 0.00% 102.333us 102.333us 1 - aten::mul 5.67% 155.545us 9.73% 266.927us 11.122us 52.800us 52.27% 52.800us 2.200us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 52.800us 52.27% 52.800us 2.200us 24 - aten::copy_ 3.82% 104.765us 68.18% 1.871ms 103.922us 32.349us 32.02% 33.661us 1.870us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.574us 24.33% 24.574us 2.048us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 15.872us 15.71% 15.872us 1.323us 12 - aten::clone 1.07% 29.290us 65.23% 1.790ms 298.277us 0.000us 0.00% 9.087us 1.515us 6 - aten::sub 1.39% 38.150us 2.28% 62.431us 10.405us 7.936us 7.86% 7.936us 1.323us 6 - aten::add 1.24% 34.113us 2.07% 56.743us 9.457us 7.936us 7.86% 7.936us 1.323us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.775us 7.70% 7.775us 1.296us 6 - Activity Buffer Request 52.33% 1.436ms 52.33% 1.436ms 1.436ms 1.312us 1.30% 1.312us 1.312us 1 - aten::empty_strided 1.16% 31.821us 1.16% 31.821us 5.304us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 9.42% 258.335us 9.42% 258.335us 43.056us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.63% 72.071us 3.33% 91.411us 3.809us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.70% 19.340us 0.70% 19.340us 0.806us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 8.39% 230.176us 8.39% 230.176us 4.795us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.18% 5.010us 0.18% 5.010us 5.010us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 892.572us 880.74% 892.572us 892.572us 1 + torch_eager 11.35% 283.366us 99.78% 2.492ms 2.492ms 0.000us 0.00% 102.687us 102.687us 1 + aten::mul 5.93% 148.202us 10.19% 254.513us 10.605us 52.956us 52.25% 52.956us 2.207us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 52.956us 52.25% 52.956us 2.207us 24 + aten::copy_ 3.94% 98.395us 68.27% 1.705ms 94.725us 32.482us 32.05% 33.826us 1.879us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.641us 24.31% 24.641us 2.053us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 15.905us 15.69% 15.905us 1.325us 12 + aten::clone 0.86% 21.380us 65.50% 1.636ms 272.651us 0.000us 0.00% 9.185us 1.531us 6 + aten::add 1.24% 31.000us 2.12% 53.041us 8.840us 8.032us 7.93% 8.032us 1.339us 6 + aten::sub 1.40% 35.052us 2.32% 58.022us 9.670us 7.873us 7.77% 7.873us 1.312us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.841us 7.74% 7.841us 1.307us 6 + Activity Buffer Request 52.43% 1.309ms 52.43% 1.309ms 1.309ms 1.344us 1.33% 1.344us 1.344us 1 + aten::empty_strided 1.32% 33.071us 1.32% 33.071us 5.512us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 9.52% 237.764us 9.52% 237.764us 39.627us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.60% 64.825us 3.35% 83.624us 3.484us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.75% 18.799us 0.75% 18.799us 0.783us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 8.44% 210.793us 8.44% 210.793us 4.392us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.22% 5.611us 0.22% 5.611us 5.611us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.744ms -Self CUDA time total: 101.021us +Self CPU time total: 2.498ms +Self CUDA time total: 101.343us @@ -4119,27 +4127,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S512_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 972.954us 1035.95% 972.954us 972.954us 1 - torch_eager 11.82% 323.628us 99.83% 2.734ms 2.734ms 0.000us 0.00% 95.231us 95.231us 1 - aten::mul 5.48% 150.092us 9.71% 265.906us 11.079us 48.958us 52.13% 48.958us 2.040us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 48.958us 52.13% 48.958us 2.040us 24 - aten::copy_ 4.01% 109.805us 68.55% 1.878ms 104.307us 30.784us 32.78% 32.096us 1.783us 18 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 907.478us 966.25% 907.478us 907.478us 1 + torch_eager 11.02% 305.318us 99.81% 2.765ms 2.765ms 0.000us 0.00% 95.230us 95.230us 1 + aten::mul 5.24% 145.172us 9.20% 254.787us 10.616us 49.023us 52.20% 49.023us 2.043us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 49.023us 52.20% 49.023us 2.043us 24 + aten::copy_ 3.74% 103.536us 70.23% 1.945ms 108.067us 30.719us 32.71% 32.031us 1.779us 18 void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.912us 24.40% 22.912us 1.909us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 14.177us 15.09% 14.177us 1.181us 12 - aten::clone 0.98% 26.740us 65.50% 1.794ms 299.012us 0.000us 0.00% 9.184us 1.531us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.872us 8.38% 7.872us 1.312us 6 - aten::sub 1.35% 37.100us 2.22% 60.781us 10.130us 7.106us 7.57% 7.106us 1.184us 6 - aten::add 1.26% 34.471us 2.07% 56.641us 9.440us 7.071us 7.53% 7.071us 1.178us 6 - Activity Buffer Request 53.28% 1.459ms 53.28% 1.459ms 1.459ms 1.312us 1.40% 1.312us 1.312us 1 - aten::empty_strided 1.12% 30.591us 1.12% 30.591us 5.098us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 8.84% 242.034us 8.84% 242.034us 40.339us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.64% 72.284us 3.37% 92.363us 3.848us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.73% 20.079us 0.73% 20.079us 0.837us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 8.33% 228.067us 8.33% 228.067us 4.751us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.17% 4.701us 0.17% 4.701us 4.701us 0.000us 0.00% 0.000us 0.000us 1 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 14.176us 15.09% 14.176us 1.181us 12 + aten::clone 1.09% 30.110us 67.87% 1.880ms 313.329us 0.000us 0.00% 9.119us 1.520us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.807us 8.31% 7.807us 1.301us 6 + aten::sub 1.24% 34.480us 2.10% 58.270us 9.712us 7.104us 7.56% 7.104us 1.184us 6 + aten::add 1.09% 30.091us 1.87% 51.880us 8.647us 7.072us 7.53% 7.072us 1.179us 6 + Activity Buffer Request 52.12% 1.444ms 52.12% 1.444ms 1.444ms 1.312us 1.40% 1.312us 1.312us 1 + aten::empty_strided 1.13% 31.430us 1.13% 31.430us 5.238us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 12.15% 336.439us 12.15% 336.439us 56.073us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.48% 68.768us 3.17% 87.719us 3.655us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.68% 18.951us 0.68% 18.951us 0.790us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 7.82% 216.674us 7.82% 216.674us 4.514us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.19% 5.210us 0.19% 5.210us 5.210us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.739ms -Self CUDA time total: 93.919us +Self CPU time total: 2.770ms +Self CUDA time total: 93.918us @@ -4149,27 +4157,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S512_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 940.506us 929.78% 940.506us 940.506us 1 - torch_eager 10.47% 280.203us 99.80% 2.672ms 2.672ms 0.000us 0.00% 102.466us 102.466us 1 - aten::mul 5.68% 151.942us 9.93% 265.874us 11.078us 52.767us 52.17% 52.767us 2.199us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 52.767us 52.17% 52.767us 2.199us 24 - aten::copy_ 3.99% 106.699us 69.68% 1.866ms 103.641us 32.384us 32.01% 33.696us 1.872us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.672us 24.39% 24.672us 2.056us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.003us 15.82% 16.003us 1.334us 12 - aten::clone 0.80% 21.540us 66.42% 1.778ms 296.379us 0.000us 0.00% 9.024us 1.504us 6 - aten::sub 1.42% 38.052us 2.40% 64.133us 10.689us 8.002us 7.91% 8.002us 1.334us 6 - aten::add 1.23% 32.860us 2.10% 56.182us 9.364us 8.001us 7.91% 8.001us 1.333us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.712us 7.62% 7.712us 1.285us 6 - Activity Buffer Request 54.45% 1.458ms 54.45% 1.458ms 1.458ms 1.312us 1.30% 1.312us 1.312us 1 - aten::empty_strided 1.14% 30.450us 1.14% 30.450us 5.075us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 8.74% 234.006us 8.74% 234.006us 39.001us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.58% 69.109us 3.28% 87.850us 3.660us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.70% 18.741us 0.70% 18.741us 0.781us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 8.61% 230.527us 8.61% 230.527us 4.803us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.20% 5.400us 0.20% 5.400us 5.400us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 917.786us 906.49% 917.786us 917.786us 1 + torch_eager 10.59% 290.695us 99.81% 2.741ms 2.741ms 0.000us 0.00% 102.558us 102.558us 1 + aten::mul 5.39% 148.136us 9.30% 255.477us 10.645us 52.735us 52.09% 52.735us 2.197us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 52.735us 52.09% 52.735us 2.197us 24 + aten::copy_ 4.15% 114.085us 70.69% 1.941ms 107.839us 32.512us 32.11% 33.824us 1.879us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.704us 24.40% 24.704us 2.059us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 15.999us 15.80% 15.999us 1.333us 12 + aten::clone 0.78% 21.500us 67.65% 1.858ms 309.627us 0.000us 0.00% 9.120us 1.520us 6 + aten::sub 1.39% 38.270us 2.26% 62.070us 10.345us 8.063us 7.96% 8.063us 1.344us 6 + aten::add 1.13% 31.111us 1.93% 52.881us 8.813us 7.936us 7.84% 7.936us 1.323us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.808us 7.71% 7.808us 1.301us 6 + Activity Buffer Request 52.71% 1.447ms 52.71% 1.447ms 1.447ms 1.312us 1.30% 1.312us 1.312us 1 + aten::empty_strided 1.19% 32.762us 1.19% 32.762us 5.460us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 11.56% 317.516us 11.56% 317.516us 52.919us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.38% 65.270us 3.07% 84.260us 3.511us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.69% 18.990us 0.69% 18.990us 0.791us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 7.83% 214.935us 7.83% 214.935us 4.478us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.19% 5.200us 0.19% 5.200us 5.200us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.677ms -Self CUDA time total: 101.154us +Self CPU time total: 2.746ms +Self CUDA time total: 101.246us @@ -4179,27 +4187,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S512_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.015ms 844.44% 1.015ms 1.015ms 1 - torch_eager 10.99% 299.529us 99.80% 2.720ms 2.720ms 0.000us 0.00% 122.045us 122.045us 1 - aten::mul 5.97% 162.734us 10.28% 280.227us 11.676us 61.856us 51.45% 61.856us 2.577us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 61.856us 51.45% 61.856us 2.577us 24 - aten::copy_ 4.97% 135.364us 68.63% 1.870ms 103.912us 39.199us 32.61% 41.023us 2.279us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.704us 23.88% 28.704us 2.392us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.166us 15.94% 19.166us 1.597us 12 - aten::clone 0.84% 22.992us 64.39% 1.755ms 292.512us 0.000us 0.00% 12.319us 2.053us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.495us 8.73% 10.495us 1.749us 6 - aten::add 1.19% 32.530us 2.08% 56.691us 9.448us 9.598us 7.98% 9.598us 1.600us 6 - aten::sub 1.40% 38.111us 2.30% 62.811us 10.468us 9.568us 7.96% 9.568us 1.595us 6 - Activity Buffer Request 52.53% 1.432ms 52.53% 1.432ms 1.432ms 1.824us 1.52% 1.824us 1.824us 1 - aten::empty_strided 1.18% 32.290us 1.18% 32.290us 5.382us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 8.53% 232.585us 8.53% 232.585us 38.764us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.71% 73.938us 3.49% 95.000us 3.958us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.77% 21.062us 0.77% 21.062us 0.878us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 8.70% 237.086us 8.70% 237.086us 4.939us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.20% 5.570us 0.20% 5.570us 5.570us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 896.601us 744.17% 896.601us 896.601us 1 + torch_eager 10.66% 286.835us 99.81% 2.687ms 2.687ms 0.000us 0.00% 122.275us 122.275us 1 + aten::mul 5.47% 147.118us 9.41% 253.291us 10.554us 61.985us 51.45% 61.985us 2.583us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 61.985us 51.45% 61.985us 2.583us 24 + aten::copy_ 3.72% 100.260us 70.38% 1.894ms 105.246us 39.265us 32.59% 41.057us 2.281us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.834us 23.93% 28.834us 2.403us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.233us 15.96% 19.233us 1.603us 12 + aten::clone 0.83% 22.211us 67.89% 1.827ms 304.542us 0.000us 0.00% 12.223us 2.037us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.431us 8.66% 10.431us 1.738us 6 + aten::add 1.14% 30.799us 1.94% 52.140us 8.690us 9.632us 7.99% 9.632us 1.605us 6 + aten::sub 1.37% 36.770us 2.23% 59.970us 9.995us 9.601us 7.97% 9.601us 1.600us 6 + Activity Buffer Request 53.18% 1.431ms 53.18% 1.431ms 1.431ms 1.792us 1.49% 1.792us 1.792us 1 + aten::empty_strided 1.21% 32.491us 1.21% 32.491us 5.415us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 11.26% 303.147us 11.26% 303.147us 50.525us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.49% 66.932us 3.17% 85.280us 3.553us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.68% 18.348us 0.68% 18.348us 0.765us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 7.81% 210.347us 7.81% 210.347us 4.382us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.19% 5.020us 0.19% 5.020us 5.020us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.726ms -Self CUDA time total: 120.221us +Self CPU time total: 2.692ms +Self CUDA time total: 120.483us @@ -4209,27 +4217,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S512_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 951.101us 552.87% 951.101us 951.101us 1 - torch_eager 11.67% 313.772us 99.81% 2.683ms 2.683ms 0.000us 0.00% 174.878us 174.878us 1 - aten::mul 5.73% 154.081us 9.89% 265.836us 11.076us 89.599us 52.08% 89.599us 3.733us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 89.599us 52.08% 89.599us 3.733us 24 - aten::copy_ 3.89% 104.453us 68.40% 1.838ms 102.128us 57.664us 33.52% 60.512us 3.362us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.832us 23.74% 40.832us 3.403us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 24.767us 14.40% 24.767us 2.064us 12 - aten::clone 1.01% 27.120us 65.39% 1.758ms 292.937us 0.000us 0.00% 19.680us 3.280us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.832us 9.78% 16.832us 2.805us 6 - aten::add 1.27% 34.231us 2.14% 57.531us 9.588us 12.416us 7.22% 12.416us 2.069us 6 - aten::sub 1.34% 36.001us 2.22% 59.581us 9.930us 12.351us 7.18% 12.351us 2.059us 6 - Activity Buffer Request 53.45% 1.437ms 53.45% 1.437ms 1.437ms 2.848us 1.66% 2.848us 2.848us 1 - aten::empty_strided 1.13% 30.290us 1.13% 30.290us 5.048us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 8.55% 229.865us 8.55% 229.865us 38.311us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.63% 70.721us 3.36% 90.322us 3.763us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.73% 19.601us 0.73% 19.601us 0.817us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 8.41% 225.976us 8.41% 225.976us 4.708us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.19% 5.001us 0.19% 5.001us 5.001us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 885.202us 514.56% 885.202us 885.202us 1 + torch_eager 18.81% 279.303us 99.64% 1.480ms 1.480ms 0.000us 0.00% 174.944us 174.944us 1 + aten::mul 9.70% 144.115us 16.98% 252.116us 10.505us 89.439us 51.99% 89.439us 3.727us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 89.439us 51.99% 89.439us 3.727us 24 + aten::copy_ 6.85% 101.723us 47.28% 702.206us 39.011us 57.632us 33.50% 60.544us 3.364us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.608us 23.60% 40.608us 3.384us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 24.961us 14.51% 24.961us 2.080us 12 + aten::clone 1.41% 20.892us 42.46% 630.635us 105.106us 0.000us 0.00% 19.936us 3.323us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.024us 9.90% 17.024us 2.837us 6 + aten::add 2.07% 30.702us 3.51% 52.142us 8.690us 12.545us 7.29% 12.545us 2.091us 6 + aten::sub 2.41% 35.732us 4.00% 59.442us 9.907us 12.416us 7.22% 12.416us 2.069us 6 + Activity Buffer Request 17.15% 254.675us 17.15% 254.675us 254.675us 2.912us 1.69% 2.912us 2.912us 1 + aten::empty_strided 2.07% 30.780us 2.07% 30.780us 5.130us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 19.36% 287.456us 19.36% 287.456us 47.909us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.32% 64.164us 5.58% 82.803us 3.450us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 1.26% 18.639us 1.26% 18.639us 0.777us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 14.24% 211.503us 14.24% 211.503us 4.406us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.36% 5.410us 0.36% 5.410us 5.410us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.688ms -Self CUDA time total: 172.030us +Self CPU time total: 1.485ms +Self CUDA time total: 172.032us @@ -4239,27 +4247,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S2048_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 927.996us 768.63% 927.996us 927.996us 1 - torch_eager 20.13% 284.369us 99.65% 1.408ms 1.408ms 0.000us 0.00% 122.557us 122.557us 1 - aten::mul 10.77% 152.163us 18.72% 264.405us 11.017us 62.048us 51.39% 62.048us 2.585us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 62.048us 51.39% 62.048us 2.585us 24 - aten::copy_ 7.56% 106.823us 43.43% 613.475us 34.082us 39.390us 32.63% 41.213us 2.290us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.864us 23.91% 28.864us 2.405us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.296us 15.98% 19.296us 1.608us 12 - aten::clone 1.39% 19.620us 37.04% 523.281us 87.213us 0.000us 0.00% 12.349us 2.058us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.526us 8.72% 10.526us 1.754us 6 - aten::add 2.28% 32.232us 3.86% 54.523us 9.087us 9.696us 8.03% 9.696us 1.616us 6 - aten::sub 2.48% 35.082us 4.10% 57.982us 9.664us 9.600us 7.95% 9.600us 1.600us 6 - Activity Buffer Request 14.96% 211.375us 14.96% 211.375us 211.375us 1.823us 1.51% 1.823us 1.823us 1 - aten::empty_strided 2.07% 29.290us 2.07% 29.290us 4.882us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 16.27% 229.815us 16.27% 229.815us 38.302us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.68% 66.168us 5.95% 84.051us 3.502us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 1.27% 17.883us 1.27% 17.883us 0.745us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 15.78% 222.895us 15.78% 222.895us 4.644us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.35% 4.970us 0.35% 4.970us 4.970us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 907.735us 751.64% 907.735us 907.735us 1 + torch_eager 18.35% 272.536us 99.65% 1.480ms 1.480ms 0.000us 0.00% 122.527us 122.527us 1 + aten::mul 9.89% 146.883us 17.48% 259.553us 10.815us 62.078us 51.40% 62.078us 2.587us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 62.078us 51.40% 62.078us 2.587us 24 + aten::copy_ 6.65% 98.730us 45.99% 682.885us 37.938us 39.328us 32.57% 41.088us 2.283us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.832us 23.87% 28.832us 2.403us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.361us 16.03% 19.361us 1.613us 12 + aten::clone 2.58% 38.249us 42.54% 631.763us 105.294us 0.000us 0.00% 12.256us 2.043us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.496us 8.69% 10.496us 1.749us 6 + aten::add 2.13% 31.663us 3.60% 53.483us 8.914us 9.728us 8.06% 9.728us 1.621us 6 + aten::sub 2.35% 34.954us 3.91% 58.043us 9.674us 9.633us 7.98% 9.633us 1.605us 6 + Activity Buffer Request 16.88% 250.706us 16.88% 250.706us 250.706us 1.760us 1.46% 1.760us 1.760us 1 + aten::empty_strided 2.15% 31.912us 2.15% 31.912us 5.319us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 18.48% 274.437us 18.48% 274.437us 45.739us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.31% 63.964us 5.59% 83.053us 3.461us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 1.29% 19.089us 1.29% 19.089us 0.795us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 14.59% 216.591us 14.59% 216.591us 4.512us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.35% 5.220us 0.35% 5.220us 5.220us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.413ms -Self CUDA time total: 120.734us +Self CPU time total: 1.485ms +Self CUDA time total: 120.767us @@ -4269,27 +4277,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S2048_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 941.367us 547.21% 941.367us 941.367us 1 - torch_eager 19.36% 280.543us 99.66% 1.444ms 1.444ms 0.000us 0.00% 174.877us 174.877us 1 - aten::mul 10.67% 154.592us 18.48% 267.677us 11.153us 89.535us 52.05% 89.535us 3.731us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 89.535us 52.05% 89.535us 3.731us 24 - aten::copy_ 7.38% 106.934us 44.27% 641.329us 35.629us 57.694us 33.54% 60.542us 3.363us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.701us 23.66% 40.701us 3.392us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 24.800us 14.42% 24.800us 2.067us 12 - aten::clone 1.44% 20.830us 37.97% 550.103us 91.684us 0.000us 0.00% 19.841us 3.307us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.993us 9.88% 16.993us 2.832us 6 - aten::add 2.36% 34.121us 3.90% 56.522us 9.420us 12.448us 7.24% 12.448us 2.075us 6 - aten::sub 2.56% 37.161us 4.27% 61.881us 10.313us 12.352us 7.18% 12.352us 2.059us 6 - Activity Buffer Request 16.20% 234.686us 16.20% 234.686us 234.686us 2.848us 1.66% 2.848us 2.848us 1 - aten::empty_strided 2.02% 29.270us 2.02% 29.270us 4.878us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 15.95% 231.027us 15.95% 231.027us 38.505us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.63% 67.091us 5.92% 85.764us 3.573us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 1.29% 18.673us 1.29% 18.673us 0.778us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 15.80% 228.888us 15.80% 228.888us 4.768us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.34% 4.980us 0.34% 4.980us 4.980us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 894.399us 519.81% 894.399us 894.399us 1 + torch_eager 10.51% 278.801us 99.79% 2.648ms 2.648ms 0.000us 0.00% 174.911us 174.911us 1 + aten::mul 5.47% 145.104us 9.49% 251.734us 10.489us 89.535us 52.04% 89.535us 3.731us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 89.535us 52.04% 89.535us 3.731us 24 + aten::copy_ 3.73% 98.901us 70.34% 1.866ms 103.682us 57.696us 33.53% 60.544us 3.364us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.704us 23.66% 40.704us 3.392us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 24.832us 14.43% 24.832us 2.069us 12 + aten::clone 0.84% 22.190us 67.69% 1.796ms 299.337us 0.000us 0.00% 19.840us 3.307us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.992us 9.88% 16.992us 2.832us 6 + aten::sub 1.44% 38.162us 2.33% 61.942us 10.324us 12.448us 7.23% 12.448us 2.075us 6 + aten::add 1.15% 30.549us 1.97% 52.171us 8.695us 12.384us 7.20% 12.384us 2.064us 6 + Activity Buffer Request 54.02% 1.433ms 54.02% 1.433ms 1.433ms 2.848us 1.66% 2.848us 2.848us 1 + aten::empty_strided 1.13% 30.052us 1.13% 30.052us 5.009us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 10.37% 275.065us 10.37% 275.065us 45.844us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.49% 65.991us 3.19% 84.601us 3.525us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.70% 18.610us 0.70% 18.610us 0.775us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 7.95% 211.023us 7.95% 211.023us 4.396us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.21% 5.640us 0.21% 5.640us 5.640us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.449ms -Self CUDA time total: 172.029us +Self CPU time total: 2.653ms +Self CUDA time total: 172.063us @@ -4299,27 +4307,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S2048_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 950.141us 334.64% 950.141us 950.141us 1 - torch_eager 11.47% 310.562us 99.82% 2.702ms 2.702ms 0.000us 0.00% 302.012us 302.012us 1 - aten::mul 5.57% 150.802us 9.64% 260.955us 10.873us 133.822us 47.13% 133.822us 5.576us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 133.822us 47.13% 133.822us 5.576us 24 - aten::copy_ 3.88% 105.155us 69.00% 1.868ms 103.782us 109.151us 38.44% 127.231us 7.068us 18 - aten::clone 0.99% 26.749us 66.03% 1.788ms 297.926us 0.000us 0.00% 69.886us 11.648us 6 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 57.345us 20.20% 57.345us 4.779us 12 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 51.806us 18.25% 51.806us 8.634us 6 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 40.959us 14.43% 40.959us 3.413us 12 - aten::sub 1.29% 34.831us 2.15% 58.172us 9.695us 20.607us 7.26% 20.607us 3.435us 6 - aten::add 1.26% 34.242us 2.11% 57.104us 9.517us 20.352us 7.17% 20.352us 3.392us 6 - Activity Buffer Request 54.34% 1.471ms 54.34% 1.471ms 1.471ms 18.080us 6.37% 18.080us 18.080us 1 - aten::empty_strided 1.13% 30.492us 1.13% 30.492us 5.082us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 8.33% 225.535us 8.33% 225.535us 37.589us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.63% 71.143us 3.33% 90.164us 3.757us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.70% 19.021us 0.70% 19.021us 0.793us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 8.22% 222.598us 8.22% 222.598us 4.637us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.18% 4.920us 0.18% 4.920us 4.920us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 888.595us 313.84% 888.595us 888.595us 1 + torch_eager 18.64% 271.692us 99.64% 1.452ms 1.452ms 0.000us 0.00% 301.536us 301.536us 1 + aten::mul 9.98% 145.418us 17.29% 252.060us 10.503us 132.896us 46.94% 132.896us 5.537us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 132.896us 46.94% 132.896us 5.537us 24 + aten::copy_ 6.89% 100.362us 46.38% 676.084us 37.560us 109.376us 38.63% 127.776us 7.099us 18 + aten::clone 1.48% 21.511us 41.22% 600.853us 100.142us 0.000us 0.00% 70.560us 11.760us 6 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 57.216us 20.21% 57.216us 4.768us 12 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 52.160us 18.42% 52.160us 8.693us 6 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 40.864us 14.43% 40.864us 3.405us 12 + aten::sub 2.41% 35.143us 4.02% 58.572us 9.762us 20.512us 7.24% 20.512us 3.419us 6 + aten::add 2.12% 30.932us 3.62% 52.783us 8.797us 20.352us 7.19% 20.352us 3.392us 6 + Activity Buffer Request 16.97% 247.406us 16.97% 247.406us 247.406us 18.400us 6.50% 18.400us 18.400us 1 + aten::empty_strided 2.15% 31.370us 2.15% 31.370us 5.228us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 18.35% 267.496us 18.35% 267.496us 44.583us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.85% 70.742us 6.06% 88.302us 3.679us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 1.20% 17.560us 1.20% 17.560us 0.732us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 14.59% 212.742us 14.59% 212.742us 4.432us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.36% 5.280us 0.36% 5.280us 5.280us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.707ms -Self CUDA time total: 283.932us +Self CPU time total: 1.458ms +Self CUDA time total: 283.136us @@ -4329,27 +4337,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S2048_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 966.098us 169.87% 966.098us 966.098us 1 - torch_eager 20.40% 290.715us 99.64% 1.420ms 1.420ms 0.000us 0.00% 592.377us 592.377us 1 - aten::copy_ 7.41% 105.615us 41.73% 594.574us 33.032us 275.293us 48.40% 298.941us 16.608us 18 - aten::mul 10.90% 155.244us 18.92% 269.648us 11.235us 227.071us 39.93% 227.071us 9.461us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 227.071us 39.93% 227.071us 9.461us 24 - aten::clone 1.44% 20.483us 35.30% 502.923us 83.821us 0.000us 0.00% 207.134us 34.522us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 183.486us 32.26% 183.486us 30.581us 6 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 91.807us 16.14% 91.807us 7.651us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 66.365us 11.67% 66.365us 5.530us 12 - aten::sub 2.66% 37.929us 4.43% 63.131us 10.522us 33.790us 5.94% 33.790us 5.632us 6 - aten::add 2.47% 35.251us 4.15% 59.172us 9.862us 32.575us 5.73% 32.575us 5.429us 6 - Activity Buffer Request 13.81% 196.814us 13.81% 196.814us 196.814us 23.648us 4.16% 23.648us 23.648us 1 - aten::empty_strided 2.02% 28.790us 2.02% 28.790us 4.798us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 15.63% 222.685us 15.63% 222.685us 37.114us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 5.20% 74.092us 6.55% 93.282us 3.887us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 1.35% 19.190us 1.35% 19.190us 0.800us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 16.35% 232.987us 16.35% 232.987us 4.854us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.36% 5.080us 0.36% 5.080us 5.080us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 944.856us 167.33% 944.856us 944.856us 1 + torch_eager 19.10% 286.874us 99.66% 1.497ms 1.497ms 0.000us 0.00% 588.218us 588.218us 1 + aten::copy_ 6.48% 97.352us 44.49% 668.224us 37.124us 273.885us 48.50% 297.437us 16.524us 18 + aten::mul 11.54% 173.280us 19.20% 288.361us 12.015us 224.990us 39.84% 224.990us 9.375us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 224.990us 39.84% 224.990us 9.375us 24 + aten::clone 1.34% 20.121us 39.51% 593.393us 98.899us 0.000us 0.00% 206.910us 34.485us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 183.358us 32.47% 183.358us 30.560us 6 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 90.527us 16.03% 90.527us 7.544us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 65.791us 11.65% 65.791us 5.483us 12 + aten::sub 2.45% 36.872us 4.07% 61.073us 10.179us 33.407us 5.92% 33.407us 5.568us 6 + aten::add 2.13% 32.018us 3.64% 54.631us 9.105us 32.384us 5.74% 32.384us 5.397us 6 + Activity Buffer Request 16.63% 249.816us 16.63% 249.816us 249.816us 23.552us 4.17% 23.552us 23.552us 1 + aten::empty_strided 2.02% 30.350us 2.02% 30.350us 5.058us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 17.28% 259.545us 17.28% 259.545us 43.258us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.52% 67.913us 5.81% 87.211us 3.634us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 1.28% 19.298us 1.28% 19.298us 0.804us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 14.87% 223.406us 14.87% 223.406us 4.654us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.34% 5.141us 0.34% 5.141us 5.141us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.425ms -Self CUDA time total: 568.729us +Self CPU time total: 1.502ms +Self CUDA time total: 564.666us @@ -4359,27 +4367,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S128_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 975.032us 1053.20% 975.032us 975.032us 1 - torch_eager 19.78% 289.798us 99.66% 1.460ms 1.460ms 0.000us 0.00% 93.698us 93.698us 1 - aten::mul 11.08% 162.260us 19.21% 281.475us 11.728us 49.665us 53.65% 49.665us 2.069us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 49.665us 53.65% 49.665us 2.069us 24 - aten::copy_ 7.16% 104.830us 42.02% 615.673us 34.204us 29.441us 31.80% 30.561us 1.698us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.657us 24.47% 22.657us 1.888us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.472us 14.55% 13.472us 1.123us 12 - aten::clone 1.39% 20.311us 36.25% 531.032us 88.505us 0.000us 0.00% 7.904us 1.317us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.784us 7.33% 6.784us 1.131us 6 - aten::add 2.30% 33.730us 3.98% 58.302us 9.717us 6.752us 7.29% 6.752us 1.125us 6 - aten::sub 2.57% 37.640us 4.45% 65.262us 10.877us 6.720us 7.26% 6.720us 1.120us 6 - Activity Buffer Request 14.75% 216.135us 14.75% 216.135us 216.135us 1.120us 1.21% 1.120us 1.120us 1 - aten::empty_strided 2.59% 37.931us 2.59% 37.931us 6.322us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 15.29% 223.986us 15.29% 223.986us 37.331us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.89% 71.623us 6.23% 91.274us 3.803us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 1.34% 19.651us 1.34% 19.651us 0.819us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 16.53% 242.131us 16.53% 242.131us 5.044us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.34% 5.040us 0.34% 5.040us 5.040us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 916.448us 990.96% 916.448us 916.448us 1 + torch_eager 10.63% 281.892us 99.80% 2.647ms 2.647ms 0.000us 0.00% 93.601us 93.601us 1 + aten::mul 5.58% 148.028us 9.67% 256.571us 10.690us 49.634us 53.67% 49.634us 2.068us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 49.634us 53.67% 49.634us 2.068us 24 + aten::copy_ 3.99% 105.971us 69.88% 1.854ms 102.991us 29.439us 31.83% 30.559us 1.698us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.655us 24.50% 22.655us 1.888us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.408us 14.50% 13.408us 1.117us 12 + aten::clone 0.82% 21.802us 66.79% 1.772ms 295.325us 0.000us 0.00% 7.904us 1.317us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.784us 7.34% 6.784us 1.131us 6 + aten::sub 1.36% 36.061us 2.24% 59.441us 9.907us 6.720us 7.27% 6.720us 1.120us 6 + aten::add 1.25% 33.260us 2.10% 55.590us 9.265us 6.688us 7.23% 6.688us 1.115us 6 + Activity Buffer Request 54.00% 1.433ms 54.00% 1.433ms 1.433ms 1.120us 1.21% 1.120us 1.120us 1 + aten::empty_strided 1.13% 29.861us 1.13% 29.861us 4.977us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 9.52% 252.488us 9.52% 252.488us 42.081us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.59% 68.801us 3.33% 88.471us 3.686us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.74% 19.670us 0.74% 19.670us 0.820us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 8.18% 216.965us 8.18% 216.965us 4.520us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.20% 5.410us 0.20% 5.410us 5.410us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.465ms -Self CUDA time total: 92.578us +Self CPU time total: 2.653ms +Self CUDA time total: 92.481us @@ -4389,27 +4397,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S128_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 963.643us 1001.81% 963.643us 963.643us 1 - torch_eager 11.60% 311.071us 99.82% 2.676ms 2.676ms 0.000us 0.00% 97.534us 97.534us 1 - aten::mul 5.66% 151.593us 10.00% 268.127us 11.172us 51.103us 53.13% 51.103us 2.129us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 51.103us 53.13% 51.103us 2.129us 24 - aten::copy_ 3.93% 105.441us 68.13% 1.826ms 101.459us 30.911us 32.14% 32.255us 1.792us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 23.007us 23.92% 23.007us 1.917us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 14.176us 14.74% 14.176us 1.181us 12 - aten::clone 1.04% 27.830us 65.21% 1.748ms 291.325us 0.000us 0.00% 9.248us 1.541us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.904us 8.22% 7.904us 1.317us 6 - aten::sub 1.38% 37.040us 2.30% 61.581us 10.264us 7.103us 7.38% 7.103us 1.184us 6 - aten::add 1.19% 32.000us 2.05% 54.860us 9.143us 7.073us 7.35% 7.073us 1.179us 6 - Activity Buffer Request 53.57% 1.436ms 53.57% 1.436ms 1.436ms 1.344us 1.40% 1.344us 1.344us 1 - aten::empty_strided 1.19% 31.921us 1.19% 31.921us 5.320us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 8.14% 218.236us 8.14% 218.236us 36.373us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.76% 74.059us 3.52% 94.290us 3.929us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.75% 20.231us 0.75% 20.231us 0.843us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 8.60% 230.408us 8.60% 230.408us 4.800us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.18% 4.700us 0.18% 4.700us 4.700us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 888.809us 923.35% 888.809us 888.809us 1 + torch_eager 19.05% 273.129us 99.67% 1.429ms 1.429ms 0.000us 0.00% 97.571us 97.571us 1 + aten::mul 10.09% 144.695us 17.61% 252.506us 10.521us 51.232us 53.22% 51.232us 2.135us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 51.232us 53.22% 51.232us 2.135us 24 + aten::copy_ 6.72% 96.301us 45.37% 650.385us 36.132us 30.786us 31.98% 32.098us 1.783us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.944us 23.84% 22.944us 1.912us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 14.241us 14.79% 14.241us 1.187us 12 + aten::clone 1.39% 19.911us 40.43% 579.513us 96.586us 0.000us 0.00% 9.154us 1.526us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.842us 8.15% 7.842us 1.307us 6 + aten::add 2.26% 32.360us 3.79% 54.320us 9.053us 7.136us 7.41% 7.136us 1.189us 6 + aten::sub 2.55% 36.551us 4.17% 59.791us 9.965us 7.105us 7.38% 7.105us 1.184us 6 + Activity Buffer Request 16.56% 237.415us 16.56% 237.415us 237.415us 1.312us 1.36% 1.312us 1.312us 1 + aten::empty_strided 2.18% 31.230us 2.18% 31.230us 5.205us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 17.96% 257.447us 17.96% 257.447us 42.908us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.71% 67.539us 6.11% 87.581us 3.649us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 1.40% 20.042us 1.40% 20.042us 0.835us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 14.80% 212.233us 14.80% 212.233us 4.422us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.33% 4.690us 0.33% 4.690us 4.690us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.681ms -Self CUDA time total: 96.190us +Self CPU time total: 1.434ms +Self CUDA time total: 96.259us @@ -4419,27 +4427,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S128_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 984.120us 950.08% 984.120us 984.120us 1 - torch_eager 21.32% 307.609us 99.66% 1.438ms 1.438ms 0.000us 0.00% 104.863us 104.863us 1 - aten::mul 11.11% 160.241us 19.03% 274.535us 11.439us 55.232us 53.32% 55.232us 2.301us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 55.232us 53.32% 55.232us 2.301us 24 - aten::copy_ 7.56% 109.063us 40.34% 581.983us 32.332us 32.383us 31.26% 33.663us 1.870us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.639us 23.79% 24.639us 2.053us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 15.968us 15.42% 15.968us 1.331us 12 - aten::clone 1.50% 21.672us 34.18% 493.044us 82.174us 0.000us 0.00% 9.024us 1.504us 6 - aten::add 2.60% 37.520us 4.33% 62.511us 10.418us 8.031us 7.75% 8.031us 1.339us 6 - aten::sub 2.72% 39.231us 4.56% 65.841us 10.973us 7.937us 7.66% 7.937us 1.323us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.744us 7.48% 7.744us 1.291us 6 - Activity Buffer Request 13.05% 188.244us 13.05% 188.244us 188.244us 1.280us 1.24% 1.280us 1.280us 1 - aten::empty_strided 2.28% 32.882us 2.28% 32.882us 5.480us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 14.94% 215.555us 14.94% 215.555us 35.926us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.93% 71.162us 6.28% 90.612us 3.776us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 1.35% 19.450us 1.35% 19.450us 0.810us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 16.29% 235.016us 16.29% 235.016us 4.896us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.34% 4.880us 0.34% 4.880us 4.880us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 903.536us 870.95% 903.536us 903.536us 1 + torch_eager 18.87% 271.956us 99.65% 1.436ms 1.436ms 0.000us 0.00% 105.053us 105.053us 1 + aten::mul 10.20% 146.935us 17.83% 256.897us 10.704us 55.262us 53.27% 55.262us 2.303us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 55.262us 53.27% 55.262us 2.303us 24 + aten::copy_ 6.83% 98.437us 45.05% 649.198us 36.067us 32.478us 31.31% 33.790us 1.877us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.670us 23.78% 24.670us 2.056us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.001us 15.42% 16.001us 1.333us 12 + aten::clone 1.50% 21.580us 40.06% 577.333us 96.222us 0.000us 0.00% 9.120us 1.520us 6 + aten::sub 2.49% 35.841us 4.72% 67.992us 11.332us 8.001us 7.71% 8.001us 1.333us 6 + aten::add 2.31% 33.350us 3.86% 55.670us 9.278us 8.000us 7.71% 8.000us 1.333us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.808us 7.53% 7.808us 1.301us 6 + Activity Buffer Request 16.46% 237.265us 16.46% 237.265us 237.265us 1.312us 1.26% 1.312us 1.312us 1 + aten::empty_strided 2.16% 31.090us 2.16% 31.090us 5.182us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 17.50% 252.196us 17.50% 252.196us 42.033us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.40% 63.461us 5.67% 81.650us 3.402us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 1.26% 18.189us 1.26% 18.189us 0.758us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 15.66% 225.733us 15.66% 225.733us 4.703us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.35% 5.060us 0.35% 5.060us 5.060us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.443ms -Self CUDA time total: 103.583us +Self CPU time total: 1.441ms +Self CUDA time total: 103.741us @@ -4449,27 +4457,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S128_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 935.122us 757.84% 935.122us 935.122us 1 - torch_eager 19.99% 283.519us 99.60% 1.412ms 1.412ms 0.000us 0.00% 125.153us 125.153us 1 - aten::mul 10.97% 155.634us 18.77% 266.135us 11.089us 65.024us 52.70% 65.024us 2.709us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 65.024us 52.70% 65.024us 2.709us 24 - aten::copy_ 7.53% 106.809us 43.10% 611.203us 33.956us 39.201us 31.77% 40.961us 2.276us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.769us 23.31% 28.769us 2.397us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.168us 15.53% 19.168us 1.597us 12 - aten::clone 1.50% 21.262us 37.00% 524.722us 87.454us 0.000us 0.00% 12.192us 2.032us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.432us 8.45% 10.432us 1.739us 6 - aten::add 2.41% 34.151us 3.94% 55.922us 9.320us 9.664us 7.83% 9.664us 1.611us 6 - aten::sub 2.49% 35.371us 4.21% 59.711us 9.952us 9.504us 7.70% 9.504us 1.584us 6 - Activity Buffer Request 14.55% 206.375us 14.55% 206.375us 206.375us 1.760us 1.43% 1.760us 1.760us 1 - aten::empty_strided 2.12% 30.049us 2.12% 30.049us 5.008us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 16.20% 229.735us 16.20% 229.735us 38.289us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.63% 65.693us 5.97% 84.623us 3.526us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 1.33% 18.930us 1.33% 18.930us 0.789us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 15.86% 224.896us 15.86% 224.896us 4.685us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.40% 5.729us 0.40% 5.729us 5.729us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 903.548us 729.80% 903.548us 903.548us 1 + torch_eager 10.56% 280.674us 99.81% 2.652ms 2.652ms 0.000us 0.00% 125.567us 125.567us 1 + aten::mul 5.49% 145.805us 9.46% 251.467us 10.478us 65.184us 52.65% 65.184us 2.716us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 65.184us 52.65% 65.184us 2.716us 24 + aten::copy_ 3.75% 99.563us 70.08% 1.862ms 103.468us 39.422us 31.84% 41.182us 2.288us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.928us 23.37% 28.928us 2.411us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.201us 15.51% 19.201us 1.600us 12 + aten::clone 0.92% 24.379us 67.48% 1.793ms 298.872us 0.000us 0.00% 12.254us 2.042us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.494us 8.48% 10.494us 1.749us 6 + aten::add 1.15% 30.622us 1.96% 52.162us 8.694us 9.633us 7.78% 9.633us 1.606us 6 + aten::sub 1.45% 38.422us 2.36% 62.661us 10.443us 9.568us 7.73% 9.568us 1.595us 6 + Activity Buffer Request 54.94% 1.460ms 54.94% 1.460ms 1.460ms 1.760us 1.42% 1.760us 1.760us 1 + aten::empty_strided 1.16% 30.801us 1.16% 30.801us 5.133us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 9.14% 242.866us 9.14% 242.866us 40.478us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.56% 67.990us 3.30% 87.783us 3.658us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.74% 19.793us 0.74% 19.793us 0.825us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 7.96% 211.432us 7.96% 211.432us 4.405us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.19% 5.160us 0.19% 5.160us 5.160us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.418ms -Self CUDA time total: 123.393us +Self CPU time total: 2.658ms +Self CUDA time total: 123.807us @@ -4479,27 +4487,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S512_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 964.101us 931.86% 964.101us 964.101us 1 - torch_eager 11.58% 311.269us 99.80% 2.682ms 2.682ms 0.000us 0.00% 104.772us 104.772us 1 - aten::mul 5.74% 154.165us 9.94% 267.067us 11.128us 55.236us 53.39% 55.236us 2.301us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 55.236us 53.39% 55.236us 2.301us 24 - aten::copy_ 4.07% 109.351us 68.30% 1.836ms 101.989us 32.287us 31.21% 33.599us 1.867us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.511us 23.69% 24.511us 2.043us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 15.937us 15.40% 15.937us 1.328us 12 - aten::clone 1.02% 27.532us 65.06% 1.749ms 291.482us 0.000us 0.00% 9.088us 1.515us 6 - aten::add 1.31% 35.310us 2.20% 59.141us 9.857us 7.969us 7.70% 7.969us 1.328us 6 - aten::sub 1.38% 37.131us 2.33% 62.602us 10.434us 7.968us 7.70% 7.968us 1.328us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.776us 7.52% 7.776us 1.296us 6 - Activity Buffer Request 53.54% 1.439ms 53.54% 1.439ms 1.439ms 1.312us 1.27% 1.312us 1.312us 1 - aten::empty_strided 1.12% 30.190us 1.12% 30.190us 5.032us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 8.09% 217.335us 8.09% 217.335us 36.223us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.62% 70.291us 3.31% 88.901us 3.704us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.69% 18.610us 0.69% 18.610us 0.775us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 8.64% 232.137us 8.64% 232.137us 4.836us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.20% 5.481us 0.20% 5.481us 5.481us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 889.436us 855.74% 889.436us 889.436us 1 + torch_eager 19.42% 274.045us 99.59% 1.406ms 1.406ms 0.000us 0.00% 105.282us 105.282us 1 + aten::mul 10.41% 146.921us 18.18% 256.563us 10.690us 55.486us 53.38% 55.486us 2.312us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 55.486us 53.38% 55.486us 2.312us 24 + aten::copy_ 6.82% 96.302us 44.56% 628.895us 34.939us 32.513us 31.28% 33.857us 1.881us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.705us 23.77% 24.705us 2.059us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 15.939us 15.34% 15.939us 1.328us 12 + aten::clone 1.41% 19.928us 39.46% 556.871us 92.812us 0.000us 0.00% 9.152us 1.525us 6 + aten::sub 2.56% 36.082us 4.16% 58.744us 9.791us 7.970us 7.67% 7.970us 1.328us 6 + aten::add 2.23% 31.511us 3.85% 54.282us 9.047us 7.969us 7.67% 7.969us 1.328us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.808us 7.51% 7.808us 1.301us 6 + Activity Buffer Request 15.99% 225.676us 15.99% 225.676us 225.676us 1.344us 1.29% 1.344us 1.344us 1 + aten::empty_strided 2.17% 30.631us 2.17% 30.631us 5.105us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 17.52% 247.335us 17.52% 247.335us 41.223us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.52% 63.850us 5.84% 82.475us 3.436us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 1.32% 18.625us 1.32% 18.625us 0.776us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 15.21% 214.657us 15.21% 214.657us 4.472us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.41% 5.810us 0.41% 5.810us 5.810us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.688ms -Self CUDA time total: 103.460us +Self CPU time total: 1.411ms +Self CUDA time total: 103.938us @@ -4509,27 +4517,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S512_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 964.072us 780.68% 964.072us 964.072us 1 - torch_eager 11.45% 316.268us 99.81% 2.758ms 2.758ms 0.000us 0.00% 125.283us 125.283us 1 - aten::mul 5.46% 150.776us 9.46% 261.336us 10.889us 65.090us 52.71% 65.090us 2.712us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 65.090us 52.71% 65.090us 2.712us 24 - aten::copy_ 3.85% 106.511us 68.83% 1.902ms 105.647us 39.266us 31.80% 41.058us 2.281us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.802us 23.32% 28.802us 2.400us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.135us 15.50% 19.135us 1.595us 12 - aten::clone 1.09% 30.231us 66.11% 1.827ms 304.441us 0.000us 0.00% 12.256us 2.043us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.464us 8.47% 10.464us 1.744us 6 - aten::add 1.22% 33.650us 2.08% 57.431us 9.572us 9.599us 7.77% 9.599us 1.600us 6 - aten::sub 1.35% 37.292us 2.48% 68.652us 11.442us 9.536us 7.72% 9.536us 1.589us 6 - Activity Buffer Request 54.53% 1.507ms 54.53% 1.507ms 1.507ms 1.792us 1.45% 1.792us 1.792us 1 - aten::empty_strided 1.19% 32.821us 1.19% 32.821us 5.470us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 8.01% 221.424us 8.01% 221.424us 36.904us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.55% 70.592us 3.23% 89.363us 3.723us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.68% 18.771us 0.68% 18.771us 0.782us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 8.42% 232.664us 8.42% 232.664us 4.847us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.19% 5.190us 0.19% 5.190us 5.190us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 888.135us 717.15% 888.135us 888.135us 1 + torch_eager 18.91% 268.465us 99.65% 1.415ms 1.415ms 0.000us 0.00% 125.666us 125.666us 1 + aten::mul 10.15% 144.114us 17.70% 251.265us 10.469us 65.346us 52.77% 65.346us 2.723us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 65.346us 52.77% 65.346us 2.723us 24 + aten::copy_ 6.90% 97.992us 45.41% 644.725us 35.818us 39.328us 31.76% 41.152us 2.286us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.800us 23.26% 28.800us 2.400us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.168us 15.48% 19.168us 1.597us 12 + aten::clone 1.46% 20.690us 40.33% 572.532us 95.422us 0.000us 0.00% 12.352us 2.059us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.528us 8.50% 10.528us 1.755us 6 + aten::add 2.19% 31.029us 3.69% 52.390us 8.732us 9.600us 7.75% 9.600us 1.600us 6 + aten::sub 2.50% 35.469us 4.13% 58.580us 9.763us 9.568us 7.73% 9.568us 1.595us 6 + Activity Buffer Request 15.69% 222.765us 15.69% 222.765us 222.765us 1.824us 1.47% 1.824us 1.824us 1 + aten::empty_strided 2.29% 32.500us 2.29% 32.500us 5.417us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 18.50% 262.716us 18.50% 262.716us 43.786us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.70% 66.710us 6.07% 86.108us 3.588us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 1.37% 19.398us 1.37% 19.398us 0.808us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 14.99% 212.875us 14.99% 212.875us 4.435us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.35% 5.010us 0.35% 5.010us 5.010us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.763ms -Self CUDA time total: 123.491us +Self CPU time total: 1.420ms +Self CUDA time total: 123.842us @@ -4539,27 +4547,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S512_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 934.855us 527.63% 934.855us 934.855us 1 - torch_eager 19.51% 283.728us 99.66% 1.450ms 1.450ms 0.000us 0.00% 180.061us 180.061us 1 - aten::mul 10.43% 151.748us 18.10% 263.338us 10.972us 95.007us 53.62% 95.007us 3.959us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 95.007us 53.62% 95.007us 3.959us 24 - aten::copy_ 7.11% 103.461us 44.35% 645.065us 35.837us 57.664us 32.55% 60.544us 3.364us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.608us 22.92% 40.608us 3.384us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 24.510us 13.83% 24.510us 2.042us 12 - aten::clone 1.46% 21.280us 38.39% 558.424us 93.071us 0.000us 0.00% 19.936us 3.323us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.056us 9.63% 17.056us 2.843us 6 - aten::add 2.36% 34.271us 3.99% 58.001us 9.667us 12.287us 6.93% 12.287us 2.048us 6 - aten::sub 2.55% 37.161us 4.24% 61.641us 10.274us 12.223us 6.90% 12.223us 2.037us 6 - Activity Buffer Request 17.53% 255.006us 17.53% 255.006us 255.006us 2.880us 1.63% 2.880us 2.880us 1 - aten::empty_strided 2.02% 29.311us 2.02% 29.311us 4.885us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 15.21% 221.267us 15.21% 221.267us 36.878us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.73% 68.750us 6.01% 87.372us 3.641us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 1.28% 18.622us 1.28% 18.622us 0.776us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 15.48% 225.131us 15.48% 225.131us 4.690us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.34% 4.880us 0.34% 4.880us 4.880us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 910.045us 513.35% 910.045us 910.045us 1 + torch_eager 9.66% 280.213us 99.83% 2.894ms 2.894ms 0.000us 0.00% 180.188us 180.188us 1 + aten::mul 5.18% 150.102us 9.00% 260.863us 10.869us 94.655us 53.39% 94.655us 3.944us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 94.655us 53.39% 94.655us 3.944us 24 + aten::copy_ 3.40% 98.673us 72.45% 2.101ms 116.706us 57.885us 32.65% 60.797us 3.378us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.799us 23.01% 40.799us 3.400us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 24.736us 13.95% 24.736us 2.061us 12 + aten::clone 0.79% 22.860us 70.00% 2.030ms 338.262us 0.000us 0.00% 19.998us 3.333us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.086us 9.64% 17.086us 2.848us 6 + aten::add 1.13% 32.880us 1.89% 54.761us 9.127us 12.416us 7.00% 12.416us 2.069us 6 + aten::sub 1.18% 34.239us 1.98% 57.551us 9.592us 12.320us 6.95% 12.320us 2.053us 6 + Activity Buffer Request 58.76% 1.704ms 58.76% 1.704ms 1.704ms 2.912us 1.64% 2.912us 2.912us 1 + aten::empty_strided 1.11% 32.150us 1.11% 32.150us 5.358us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 8.21% 238.144us 8.21% 238.144us 39.691us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.29% 66.481us 2.94% 85.213us 3.551us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.65% 18.732us 0.65% 18.732us 0.781us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 7.46% 216.224us 7.46% 216.224us 4.505us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.17% 5.070us 0.17% 5.070us 5.070us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.455ms -Self CUDA time total: 177.181us +Self CPU time total: 2.899ms +Self CUDA time total: 177.276us @@ -4569,27 +4577,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S512_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 936.902us 314.34% 936.902us 936.902us 1 - torch_eager 19.95% 279.505us 99.63% 1.396ms 1.396ms 0.000us 0.00% 315.267us 315.267us 1 - aten::mul 10.85% 152.079us 18.94% 265.395us 11.058us 146.176us 49.04% 146.176us 6.091us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 146.176us 49.04% 146.176us 6.091us 24 - aten::copy_ 7.66% 107.385us 42.60% 596.937us 33.163us 110.978us 37.23% 128.194us 7.122us 18 - aten::clone 1.45% 20.319us 36.31% 508.783us 84.797us 0.000us 0.00% 70.625us 11.771us 6 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 57.569us 19.32% 57.569us 4.797us 12 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 53.409us 17.92% 53.409us 8.902us 6 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 40.897us 13.72% 40.897us 3.408us 12 - aten::sub 2.61% 36.531us 4.38% 61.402us 10.234us 20.449us 6.86% 20.449us 3.408us 6 - aten::add 2.39% 33.533us 3.98% 55.753us 9.292us 20.448us 6.86% 20.448us 3.408us 6 - Activity Buffer Request 14.75% 206.705us 14.75% 206.705us 206.705us 17.216us 5.78% 17.216us 17.216us 1 - aten::empty_strided 2.13% 29.842us 2.13% 29.842us 4.974us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 15.44% 216.385us 15.44% 216.385us 36.064us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.91% 68.874us 6.21% 87.042us 3.627us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 1.30% 18.168us 1.30% 18.168us 0.757us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 16.19% 226.869us 16.19% 226.869us 4.726us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.37% 5.161us 0.37% 5.161us 5.161us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 919.612us 310.44% 919.612us 919.612us 1 + torch_eager 10.49% 286.464us 99.82% 2.726ms 2.726ms 0.000us 0.00% 313.057us 313.057us 1 + aten::mul 5.34% 145.716us 9.29% 253.789us 10.575us 145.182us 49.01% 145.182us 6.049us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 145.182us 49.01% 145.182us 6.049us 24 + aten::copy_ 3.69% 100.696us 70.60% 1.928ms 107.115us 109.985us 37.13% 126.817us 7.045us 18 + aten::clone 0.88% 23.951us 68.02% 1.858ms 309.597us 0.000us 0.00% 69.474us 11.579us 6 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 57.343us 19.36% 57.343us 4.779us 12 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 52.642us 17.77% 52.642us 8.774us 6 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 41.058us 13.86% 41.058us 3.421us 12 + aten::sub 1.33% 36.191us 2.18% 59.621us 9.937us 20.609us 6.96% 20.609us 3.435us 6 + aten::add 1.14% 31.230us 1.95% 53.190us 8.865us 20.449us 6.90% 20.449us 3.408us 6 + Activity Buffer Request 56.07% 1.531ms 56.07% 1.531ms 1.531ms 16.832us 5.68% 16.832us 16.832us 1 + aten::empty_strided 1.17% 32.070us 1.17% 32.070us 5.345us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 8.59% 234.696us 8.59% 234.696us 39.116us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.53% 69.062us 3.26% 88.922us 3.705us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.73% 19.860us 0.73% 19.860us 0.827us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 7.86% 214.752us 7.86% 214.752us 4.474us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.18% 4.930us 0.18% 4.930us 4.930us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.401ms -Self CUDA time total: 298.051us +Self CPU time total: 2.731ms +Self CUDA time total: 296.225us @@ -4599,27 +4607,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S2048_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 953.069us 538.57% 953.069us 953.069us 1 - torch_eager 19.36% 280.983us 99.62% 1.446ms 1.446ms 0.000us 0.00% 179.812us 179.812us 1 - aten::mul 10.74% 155.876us 18.65% 270.688us 11.279us 94.916us 53.64% 94.916us 3.955us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 94.916us 53.64% 94.916us 3.955us 24 - aten::copy_ 7.70% 111.823us 43.62% 633.117us 35.173us 57.568us 32.53% 60.416us 3.356us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.544us 22.91% 40.544us 3.379us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 24.480us 13.83% 24.480us 2.040us 12 - aten::clone 1.50% 21.731us 37.58% 545.384us 90.897us 0.000us 0.00% 19.872us 3.312us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.024us 9.62% 17.024us 2.837us 6 - aten::add 2.38% 34.509us 4.05% 58.781us 9.797us 12.256us 6.93% 12.256us 2.043us 6 - aten::sub 2.51% 36.442us 4.13% 59.923us 9.987us 12.224us 6.91% 12.224us 2.037us 6 - Activity Buffer Request 15.40% 223.485us 15.40% 223.485us 223.485us 2.848us 1.61% 2.848us 2.848us 1 - aten::empty_strided 2.13% 30.930us 2.13% 30.930us 5.155us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 15.79% 229.197us 15.79% 229.197us 38.200us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.88% 70.882us 6.18% 89.652us 3.735us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 1.29% 18.770us 1.29% 18.770us 0.782us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 15.93% 231.177us 15.93% 231.177us 4.816us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.38% 5.510us 0.38% 5.510us 5.510us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 889.394us 501.79% 889.394us 889.394us 1 + torch_eager 17.97% 266.975us 99.65% 1.481ms 1.481ms 0.000us 0.00% 180.092us 180.092us 1 + aten::mul 9.80% 145.611us 16.96% 251.937us 10.497us 94.974us 53.58% 94.974us 3.957us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 94.974us 53.58% 94.974us 3.957us 24 + aten::copy_ 6.75% 100.282us 47.98% 712.837us 39.602us 57.694us 32.55% 60.542us 3.363us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.736us 22.98% 40.736us 3.395us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 24.576us 13.87% 24.576us 2.048us 12 + aten::clone 1.38% 20.549us 43.06% 639.725us 106.621us 0.000us 0.00% 19.806us 3.301us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.958us 9.57% 16.958us 2.826us 6 + aten::sub 2.49% 37.040us 4.14% 61.531us 10.255us 12.289us 6.93% 12.289us 2.048us 6 + aten::add 2.11% 31.282us 3.59% 53.402us 8.900us 12.287us 6.93% 12.287us 2.048us 6 + Activity Buffer Request 19.87% 295.257us 19.87% 295.257us 295.257us 2.848us 1.61% 2.848us 2.848us 1 + aten::empty_strided 2.04% 30.372us 2.04% 30.372us 5.062us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 17.34% 257.637us 17.34% 257.637us 42.940us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.31% 64.000us 5.58% 82.951us 3.456us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 1.28% 18.951us 1.28% 18.951us 0.790us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 14.31% 212.598us 14.31% 212.598us 4.429us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.35% 5.130us 0.35% 5.130us 5.130us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.451ms -Self CUDA time total: 176.964us +Self CPU time total: 1.486ms +Self CUDA time total: 177.244us @@ -4629,27 +4637,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S2048_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 992.756us 332.77% 992.756us 992.756us 1 - torch_eager 20.12% 289.006us 99.66% 1.432ms 1.432ms 0.000us 0.00% 316.222us 316.222us 1 - aten::mul 11.31% 162.528us 19.47% 279.759us 11.657us 146.880us 49.23% 146.880us 6.120us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 146.880us 49.23% 146.880us 6.120us 24 - aten::copy_ 7.73% 111.012us 41.48% 595.895us 33.105us 110.942us 37.19% 128.830us 7.157us 18 - aten::clone 1.55% 22.310us 35.21% 505.793us 84.299us 0.000us 0.00% 71.424us 11.904us 6 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 57.406us 19.24% 57.406us 4.784us 12 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 53.536us 17.94% 53.536us 8.923us 6 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 40.512us 13.58% 40.512us 3.376us 12 - aten::add 2.53% 36.289us 4.25% 61.011us 10.169us 20.352us 6.82% 20.352us 3.392us 6 - aten::sub 2.59% 37.162us 4.41% 63.291us 10.549us 20.160us 6.76% 20.160us 3.360us 6 - Activity Buffer Request 13.10% 188.164us 13.10% 188.164us 188.164us 17.888us 6.00% 17.888us 17.888us 1 - aten::empty_strided 2.24% 32.121us 2.24% 32.121us 5.354us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 15.74% 226.067us 15.74% 226.067us 37.678us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.81% 69.111us 6.15% 88.363us 3.682us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 1.34% 19.252us 1.34% 19.252us 0.802us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 16.62% 238.734us 16.62% 238.734us 4.974us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.34% 4.940us 0.34% 4.940us 4.940us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 909.016us 306.24% 909.016us 909.016us 1 + torch_eager 19.05% 269.264us 99.66% 1.409ms 1.409ms 0.000us 0.00% 314.684us 314.684us 1 + aten::mul 10.56% 149.323us 19.02% 268.875us 11.203us 145.440us 49.00% 145.440us 6.060us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 145.440us 49.00% 145.440us 6.060us 24 + aten::copy_ 6.96% 98.305us 44.09% 623.125us 34.618us 110.751us 37.31% 128.606us 7.145us 18 + aten::clone 1.45% 20.520us 38.80% 548.422us 91.404us 0.000us 0.00% 71.453us 11.909us 6 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 57.153us 19.25% 57.153us 4.763us 12 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 53.598us 18.06% 53.598us 8.933us 6 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 40.638us 13.69% 40.638us 3.387us 12 + aten::add 2.27% 32.070us 3.85% 54.390us 9.065us 20.352us 6.86% 20.352us 3.392us 6 + aten::sub 2.35% 33.277us 4.05% 57.282us 9.547us 20.286us 6.83% 20.286us 3.381us 6 + Activity Buffer Request 15.96% 225.655us 15.96% 225.655us 225.655us 17.855us 6.02% 17.855us 17.855us 1 + aten::empty_strided 2.15% 30.350us 2.15% 30.350us 5.058us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 16.79% 237.294us 16.79% 237.294us 39.549us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.69% 66.249us 6.00% 84.797us 3.533us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 1.31% 18.548us 1.31% 18.548us 0.773us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 16.11% 227.748us 16.11% 227.748us 4.745us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.34% 4.840us 0.34% 4.840us 4.840us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.437ms -Self CUDA time total: 298.334us +Self CPU time total: 1.413ms +Self CUDA time total: 296.829us @@ -4659,27 +4667,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S2048_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 957.657us 163.29% 957.657us 957.657us 1 - torch_eager 20.09% 288.813us 99.63% 1.432ms 1.432ms 0.000us 0.00% 610.425us 610.425us 1 - aten::copy_ 7.31% 105.011us 42.63% 612.724us 34.040us 268.572us 45.79% 292.508us 16.250us 18 - aten::mul 10.71% 153.870us 18.84% 270.776us 11.282us 252.607us 43.07% 252.607us 10.525us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 252.607us 43.07% 252.607us 10.525us 24 - aten::clone 1.42% 20.480us 36.58% 525.692us 87.615us 0.000us 0.00% 201.566us 33.594us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 177.630us 30.29% 177.630us 29.605us 6 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 90.942us 15.51% 90.942us 7.578us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 65.310us 11.14% 65.310us 5.443us 12 - aten::sub 2.69% 38.720us 4.45% 63.991us 10.665us 32.991us 5.63% 32.991us 5.499us 6 - aten::add 2.37% 34.041us 3.93% 56.461us 9.410us 32.319us 5.51% 32.319us 5.387us 6 - Activity Buffer Request 15.99% 229.866us 15.99% 229.866us 229.866us 23.936us 4.08% 23.936us 23.936us 1 - aten::empty_strided 2.02% 29.010us 2.02% 29.010us 4.835us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 14.72% 211.585us 14.72% 211.585us 35.264us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.83% 69.478us 6.24% 89.671us 3.736us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 1.40% 20.193us 1.40% 20.193us 0.841us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 16.06% 230.859us 16.06% 230.859us 4.810us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.37% 5.320us 0.37% 5.320us 5.320us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 916.757us 157.09% 916.757us 916.757us 1 + torch_eager 19.46% 274.242us 99.65% 1.404ms 1.404ms 0.000us 0.00% 607.350us 607.350us 1 + aten::copy_ 7.01% 98.793us 43.42% 611.905us 33.995us 268.603us 46.03% 292.379us 16.243us 18 + aten::mul 10.57% 148.926us 18.84% 265.480us 11.062us 249.086us 42.68% 249.086us 10.379us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 249.086us 42.68% 249.086us 10.379us 24 + aten::clone 1.44% 20.340us 38.12% 537.253us 89.542us 0.000us 0.00% 202.173us 33.696us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 178.397us 30.57% 178.397us 29.733us 6 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 90.206us 15.46% 90.206us 7.517us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 65.885us 11.29% 65.885us 5.490us 12 + aten::sub 2.63% 37.022us 4.37% 61.602us 10.267us 33.151us 5.68% 33.151us 5.525us 6 + aten::add 2.33% 32.810us 3.92% 55.180us 9.197us 32.734us 5.61% 32.734us 5.456us 6 + Activity Buffer Request 15.58% 219.605us 15.58% 219.605us 219.605us 23.776us 4.07% 23.776us 23.776us 1 + aten::empty_strided 2.10% 29.631us 2.10% 29.631us 4.938us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 16.49% 232.396us 16.49% 232.396us 38.733us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.73% 66.612us 6.10% 85.953us 3.581us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 1.37% 19.341us 1.37% 19.341us 0.806us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 15.94% 224.615us 15.94% 224.615us 4.679us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.35% 4.910us 0.35% 4.910us 4.910us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.437ms -Self CUDA time total: 586.489us +Self CPU time total: 1.409ms +Self CUDA time total: 583.574us @@ -4689,55 +4697,61 @@ PROFILE TRACE: torch_eager | cuda_B2_S2048_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 9.43% 329.378us 77.87% 2.720ms 2.720ms 0.000us 0.00% 1.842ms 1.842ms 1 - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.815ms 102.19% 1.815ms 1.815ms 1 - aten::copy_ 3.09% 107.951us 52.68% 1.840ms 102.235us 794.051us 44.71% 860.068us 47.782us 18 - aten::mul 4.59% 160.365us 8.02% 279.997us 11.667us 834.368us 46.99% 834.368us 34.765us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 834.368us 46.99% 834.368us 34.765us 24 - aten::clone 0.80% 28.034us 50.14% 1.751ms 291.882us 0.000us 0.00% 627.394us 104.566us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 561.377us 31.61% 561.377us 93.563us 6 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 232.674us 13.10% 232.674us 19.389us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 147.392us 8.30% 147.392us 12.283us 12 - aten::sub 1.14% 39.970us 1.89% 66.170us 11.028us 89.952us 5.07% 89.952us 14.992us 6 - Activity Buffer Request 41.31% 1.443ms 41.31% 1.443ms 1.443ms 66.017us 3.72% 66.017us 66.017us 1 - aten::add 0.95% 33.281us 1.61% 56.271us 9.379us 57.440us 3.23% 57.440us 9.573us 6 - aten::empty_strided 0.85% 29.670us 0.85% 29.670us 4.945us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 6.22% 217.146us 6.22% 217.146us 36.191us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.01% 70.292us 2.58% 90.182us 3.758us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.57% 19.890us 0.57% 19.890us 0.829us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 6.90% 240.975us 6.90% 240.975us 5.020us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 22.13% 773.090us 22.13% 773.090us 773.090us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 12.10% 272.127us 61.47% 1.382ms 1.382ms 0.000us 0.00% 1.837ms 1.837ms 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.810ms 102.21% 1.810ms 1.810ms 1 + aten::copy_ 4.74% 106.692us 27.02% 607.756us 33.764us 794.110us 44.84% 859.966us 47.776us 18 + aten::mul 6.35% 142.895us 11.18% 251.386us 10.474us 829.085us 46.82% 829.085us 34.545us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 829.085us 46.82% 829.085us 34.545us 24 + aten::clone 0.94% 21.099us 23.42% 526.743us 87.790us 0.000us 0.00% 627.678us 104.613us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 561.822us 31.73% 561.822us 93.637us 6 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 232.288us 13.12% 232.288us 19.357us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 147.650us 8.34% 147.650us 12.304us 12 + aten::sub 1.58% 35.541us 2.61% 58.661us 9.777us 89.538us 5.06% 89.538us 14.923us 6 + Activity Buffer Request 9.29% 208.845us 9.29% 208.845us 208.845us 65.856us 3.72% 65.856us 65.856us 1 + aten::add 1.43% 32.251us 2.42% 54.461us 9.077us 58.112us 3.28% 58.112us 9.685us 6 + aten::empty_strided 1.39% 31.342us 1.39% 31.342us 5.224us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 10.27% 230.957us 10.27% 230.957us 38.493us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.99% 67.270us 3.80% 85.550us 3.565us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.81% 18.280us 0.81% 18.280us 0.762us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 9.56% 215.083us 9.56% 215.083us 4.481us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 38.53% 866.589us 38.53% 866.589us 866.589us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 3.493ms -Self CUDA time total: 1.776ms +Self CPU time total: 2.249ms +Self CUDA time total: 1.771ms impl wl p50(ms) ok -torch_eager cuda_B1_S128_H32_D128_R64 0.22 True +torch_eager cuda_B1_S128_H32_D128_R64 0.21 True torch_eager cuda_B1_S128_H32_D64_R32 0.22 True -torch_eager cuda_B1_S128_H8_D128_R64 0.23 True -torch_eager cuda_B1_S128_H8_D64_R32 0.18 True -torch_eager cuda_B1_S2048_H32_D128_R64 0.23 True -torch_eager cuda_B1_S2048_H32_D64_R32 0.22 True -torch_eager cuda_B1_S2048_H8_D128_R64 0.22 True -torch_eager cuda_B1_S2048_H8_D64_R32 0.22 True -torch_eager cuda_B1_S512_H32_D128_R64 0.22 True -torch_eager cuda_B1_S512_H32_D64_R32 0.22 True -torch_eager cuda_B1_S512_H8_D128_R64 0.22 True +torch_eager cuda_B1_S128_H8_D128_R64 0.22 True +torch_eager cuda_B1_S128_H8_D64_R32 0.17 True +torch_eager cuda_B1_S2048_H32_D128_R64 0.22 True +torch_eager cuda_B1_S2048_H32_D64_R32 0.21 True +torch_eager cuda_B1_S2048_H8_D128_R64 0.21 True +torch_eager cuda_B1_S2048_H8_D64_R32 0.21 True +torch_eager cuda_B1_S512_H32_D128_R64 0.21 True +torch_eager cuda_B1_S512_H32_D64_R32 0.21 True +torch_eager cuda_B1_S512_H8_D128_R64 0.21 True torch_eager cuda_B1_S512_H8_D64_R32 0.22 True -torch_eager cuda_B2_S128_H32_D128_R64 0.22 True -torch_eager cuda_B2_S128_H32_D64_R32 0.22 True -torch_eager cuda_B2_S128_H8_D128_R64 0.22 True -torch_eager cuda_B2_S128_H8_D64_R32 0.22 True +torch_eager cuda_B2_S128_H32_D128_R64 0.21 True +torch_eager cuda_B2_S128_H32_D64_R32 0.21 True +torch_eager cuda_B2_S128_H8_D128_R64 0.21 True +torch_eager cuda_B2_S128_H8_D64_R32 0.21 True torch_eager cuda_B2_S2048_H32_D128_R64 0.64 True torch_eager cuda_B2_S2048_H32_D64_R32 0.23 True torch_eager cuda_B2_S2048_H8_D128_R64 0.22 True -torch_eager cuda_B2_S2048_H8_D64_R32 0.22 True -torch_eager cuda_B2_S512_H32_D128_R64 0.22 True -torch_eager cuda_B2_S512_H32_D64_R32 0.22 True +torch_eager cuda_B2_S2048_H8_D64_R32 0.21 True +torch_eager cuda_B2_S512_H32_D128_R64 0.21 True +torch_eager cuda_B2_S512_H32_D64_R32 0.21 True torch_eager cuda_B2_S512_H8_D128_R64 0.21 True -torch_eager cuda_B2_S512_H8_D64_R32 0.22 True +torch_eager cuda_B2_S512_H8_D64_R32 0.21 True+▶ UV Install Logs+ +Artifacts:
rotary.jsonl