diff --git "a/rotary/impls/torch_rotary.html" "b/rotary/impls/torch_rotary.html" --- "a/rotary/impls/torch_rotary.html" +++ "b/rotary/impls/torch_rotary.html" @@ -3904,7 +3904,7 @@ Cell: nv | 0.25s
Fri Dec 19 18:55:27 2025 ++Fri Dec 19 19:54:55 2025 +-----------------------------------------------------------------------------------------+ | NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 | +-----------------------------------------+------------------------+----------------------+ @@ -3913,7 +3913,7 @@ Cell: nv | 0.25s | | | MIG M. | |=========================================+========================+======================| | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 | -| N/A 28C P0 85W / 350W | 0MiB / 46068MiB | 34% Default | +| N/A 36C P0 90W / 350W | 0MiB / 46068MiB | 17% Default | | | | N/A | +-----------------------------------------+------------------------+----------------------+ @@ -3935,9 +3935,9 @@ Cell: nv | 0.25s ▼ code ▼ output - ▶ uv-logs + ▶ uv-logs | -Cell: benchmark | 4.08s +Cell: benchmark | 7.93s | Raw @@ -4016,27 +4016,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S128_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 975.966us 1092.36% 975.966us 975.966us 1 - torch_eager 12.28% 354.583us 99.52% 2.875ms 2.875ms 0.000us 0.00% 90.561us 90.561us 1 - aten::mul 5.61% 162.144us 9.55% 275.765us 11.490us 47.009us 52.62% 47.009us 1.959us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 47.009us 52.62% 47.009us 1.959us 24 - aten::copy_ 3.85% 111.174us 67.17% 1.940ms 107.795us 29.153us 32.63% 30.369us 1.687us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.337us 25.00% 22.337us 1.861us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.183us 14.76% 13.183us 1.099us 12 - aten::clone 1.24% 35.691us 65.68% 1.897ms 316.212us 0.000us 0.00% 8.032us 1.339us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.816us 7.63% 6.816us 1.136us 6 - aten::sub 1.48% 42.821us 2.36% 68.262us 11.377us 6.592us 7.38% 6.592us 1.099us 6 - aten::add 1.19% 34.391us 1.96% 56.751us 9.459us 6.591us 7.38% 6.591us 1.098us 6 - Activity Buffer Request 58.57% 1.692ms 58.57% 1.692ms 1.692ms 1.216us 1.36% 1.216us 1.216us 1 - aten::empty_strided 1.90% 54.851us 1.90% 54.851us 9.142us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 2.58% 74.440us 2.58% 74.440us 12.407us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.39% 69.079us 3.06% 88.511us 3.688us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.67% 19.432us 0.67% 19.432us 0.810us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 7.77% 224.384us 7.77% 224.384us 4.675us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.48% 13.900us 0.48% 13.900us 13.900us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.121ms 1256.65% 1.121ms 1.121ms 1 + torch_eager 12.91% 396.632us 99.49% 3.057ms 3.057ms 0.000us 0.00% 90.428us 90.428us 1 + aten::mul 6.11% 187.745us 10.64% 326.977us 13.624us 47.168us 52.87% 47.168us 1.965us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 47.168us 52.87% 47.168us 1.965us 24 + aten::copy_ 3.73% 114.507us 64.51% 1.982ms 110.114us 28.957us 32.46% 30.173us 1.676us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.335us 25.04% 22.335us 1.861us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.087us 14.67% 13.087us 1.091us 12 + aten::clone 1.26% 38.732us 62.92% 1.933ms 322.183us 0.000us 0.00% 7.838us 1.306us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.622us 7.42% 6.622us 1.104us 6 + aten::sub 1.53% 47.130us 2.58% 79.281us 13.214us 6.559us 7.35% 6.559us 1.093us 6 + aten::add 1.27% 38.901us 2.21% 67.791us 11.299us 6.528us 7.32% 6.528us 1.088us 6 + Activity Buffer Request 55.82% 1.715ms 55.82% 1.715ms 1.715ms 1.216us 1.36% 1.216us 1.216us 1 + aten::empty_strided 2.00% 61.422us 2.00% 61.422us 10.237us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 2.49% 76.434us 2.49% 76.434us 12.739us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.64% 81.233us 3.38% 103.763us 4.323us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.73% 22.530us 0.73% 22.530us 0.939us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 9.00% 276.368us 9.00% 276.368us 5.758us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.51% 15.650us 0.51% 15.650us 15.650us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.889ms -Self CUDA time total: 89.345us +Self CPU time total: 3.072ms +Self CUDA time total: 89.212us @@ -4046,27 +4046,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S128_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 912.155us 1011.53% 912.155us 912.155us 1 - torch_eager 10.76% 301.514us 99.79% 2.795ms 2.795ms 0.000us 0.00% 91.296us 91.296us 1 - aten::mul 5.37% 150.316us 9.21% 258.115us 10.755us 47.452us 52.62% 47.452us 1.977us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 47.452us 52.62% 47.452us 1.977us 24 - aten::copy_ 3.63% 101.610us 71.00% 1.989ms 110.503us 29.347us 32.54% 30.467us 1.693us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.595us 25.06% 22.595us 1.883us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.377us 14.83% 13.377us 1.115us 12 - aten::clone 0.94% 26.442us 68.40% 1.916ms 319.344us 0.000us 0.00% 7.872us 1.312us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.752us 7.49% 6.752us 1.125us 6 - aten::add 1.08% 30.320us 1.84% 51.590us 8.598us 6.689us 7.42% 6.689us 1.115us 6 - aten::sub 1.25% 34.880us 2.05% 57.350us 9.558us 6.688us 7.42% 6.688us 1.115us 6 - Activity Buffer Request 62.97% 1.764ms 62.97% 1.764ms 1.764ms 1.120us 1.24% 1.120us 1.120us 1 - aten::empty_strided 1.15% 32.200us 1.15% 32.200us 5.367us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 2.15% 60.121us 2.15% 60.121us 10.020us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.19% 61.232us 2.82% 79.123us 3.297us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.64% 17.891us 0.64% 17.891us 0.745us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 7.67% 214.771us 7.67% 214.771us 4.474us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.21% 5.950us 0.21% 5.950us 5.950us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.007ms 1113.49% 1.007ms 1.007ms 1 + torch_eager 10.54% 298.489us 99.78% 2.827ms 2.827ms 0.000us 0.00% 91.582us 91.582us 1 + aten::mul 6.23% 176.544us 10.66% 301.947us 12.581us 47.549us 52.58% 47.549us 1.981us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 47.549us 52.58% 47.549us 1.981us 24 + aten::copy_ 3.68% 104.131us 68.91% 1.952ms 108.446us 29.376us 32.49% 30.529us 1.696us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.591us 24.98% 22.591us 1.883us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.504us 14.93% 13.504us 1.125us 12 + aten::clone 0.77% 21.889us 65.76% 1.863ms 310.451us 0.000us 0.00% 7.938us 1.323us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.785us 7.50% 6.785us 1.131us 6 + aten::sub 1.41% 40.059us 2.39% 67.780us 11.297us 6.784us 7.50% 6.784us 1.131us 6 + aten::add 1.28% 36.151us 2.32% 65.853us 10.975us 6.720us 7.43% 6.720us 1.120us 6 + Activity Buffer Request 60.61% 1.717ms 60.61% 1.717ms 1.717ms 1.153us 1.28% 1.153us 1.153us 1 + aten::empty_strided 1.12% 31.742us 1.12% 31.742us 5.290us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 2.04% 57.701us 2.04% 57.701us 9.617us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.45% 69.273us 3.06% 86.813us 3.617us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.62% 17.540us 0.62% 17.540us 0.731us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 9.05% 256.228us 9.05% 256.228us 5.338us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.22% 6.150us 0.22% 6.150us 6.150us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.801ms -Self CUDA time total: 90.176us +Self CPU time total: 2.833ms +Self CUDA time total: 90.429us @@ -4076,27 +4076,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S128_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 913.022us 972.45% 913.022us 913.022us 1 - torch_eager 10.98% 301.066us 99.80% 2.737ms 2.737ms 0.000us 0.00% 95.200us 95.200us 1 - aten::mul 5.60% 153.463us 9.59% 262.853us 10.952us 48.737us 51.91% 48.737us 2.031us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 48.737us 51.91% 48.737us 2.031us 24 - aten::copy_ 3.66% 100.372us 70.16% 1.924ms 106.884us 30.688us 32.69% 31.999us 1.778us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.912us 24.40% 22.912us 1.909us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 14.464us 15.41% 14.464us 1.205us 12 - aten::clone 0.95% 26.091us 67.47% 1.850ms 308.365us 0.000us 0.00% 9.087us 1.514us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.776us 8.28% 7.776us 1.296us 6 - aten::add 1.14% 31.200us 1.93% 53.020us 8.837us 7.264us 7.74% 7.264us 1.211us 6 - aten::sub 1.31% 35.942us 2.13% 58.462us 9.744us 7.200us 7.67% 7.200us 1.200us 6 - Activity Buffer Request 62.12% 1.703ms 62.12% 1.703ms 1.703ms 1.311us 1.40% 1.311us 1.311us 1 - aten::empty_strided 1.13% 31.120us 1.13% 31.120us 5.187us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 2.07% 56.870us 2.07% 56.870us 9.478us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.29% 62.820us 2.92% 80.090us 3.337us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.63% 17.270us 0.63% 17.270us 0.720us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 7.91% 217.003us 7.91% 217.003us 4.521us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.20% 5.600us 0.20% 5.600us 5.600us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.006ms 1072.44% 1.006ms 1.006ms 1 + torch_eager 10.52% 298.395us 99.78% 2.831ms 2.831ms 0.000us 0.00% 95.037us 95.037us 1 + aten::mul 6.11% 173.334us 10.70% 303.588us 12.649us 48.703us 51.91% 48.703us 2.029us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 48.703us 51.91% 48.703us 2.029us 24 + aten::copy_ 3.65% 103.459us 69.19% 1.963ms 109.044us 30.654us 32.67% 31.870us 1.771us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.975us 24.49% 22.975us 1.915us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 14.464us 15.42% 14.464us 1.205us 12 + aten::clone 0.76% 21.692us 65.97% 1.871ms 311.909us 0.000us 0.00% 8.895us 1.482us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.679us 8.18% 7.679us 1.280us 6 + aten::add 1.20% 34.161us 2.12% 60.242us 10.040us 7.233us 7.71% 7.233us 1.206us 6 + aten::sub 1.38% 39.061us 2.38% 67.492us 11.249us 7.231us 7.71% 7.231us 1.205us 6 + Activity Buffer Request 60.90% 1.728ms 60.90% 1.728ms 1.728ms 1.216us 1.30% 1.216us 1.216us 1 + aten::empty_strided 1.08% 30.530us 1.08% 30.530us 5.088us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 2.06% 58.531us 2.06% 58.531us 9.755us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.41% 68.445us 3.03% 85.834us 3.576us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.61% 17.389us 0.61% 17.389us 0.725us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 9.10% 258.010us 9.10% 258.010us 5.375us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.22% 6.240us 0.22% 6.240us 6.240us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.742ms -Self CUDA time total: 93.889us +Self CPU time total: 2.837ms +Self CUDA time total: 93.821us @@ -4106,27 +4106,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S128_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 939.645us 923.40% 939.645us 939.645us 1 - torch_eager 10.25% 301.966us 99.82% 2.940ms 2.940ms 0.000us 0.00% 103.039us 103.039us 1 - aten::mul 5.17% 152.369us 8.89% 261.782us 10.908us 52.831us 51.92% 52.831us 2.201us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 52.831us 51.92% 52.831us 2.201us 24 - aten::copy_ 3.48% 102.582us 71.53% 2.107ms 117.036us 32.481us 31.92% 33.761us 1.876us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.737us 24.31% 24.737us 2.061us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.447us 16.16% 16.447us 1.371us 12 - aten::clone 0.93% 27.431us 68.93% 2.030ms 338.362us 0.000us 0.00% 9.024us 1.504us 6 - aten::sub 1.21% 35.711us 2.01% 59.102us 9.850us 8.319us 8.18% 8.319us 1.387us 6 - aten::add 1.06% 31.350us 1.79% 52.800us 8.800us 8.128us 7.99% 8.128us 1.355us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.744us 7.61% 7.744us 1.291us 6 - Activity Buffer Request 57.63% 1.697ms 57.63% 1.697ms 1.697ms 1.280us 1.26% 1.280us 1.280us 1 - aten::empty_strided 1.03% 30.229us 1.03% 30.229us 5.038us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 8.25% 243.123us 8.25% 243.123us 40.520us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.79% 82.098us 3.39% 99.942us 4.164us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.61% 17.844us 0.61% 17.844us 0.743us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 7.40% 217.957us 7.40% 217.957us 4.541us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.18% 5.350us 0.18% 5.350us 5.350us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.008ms 991.12% 1.008ms 1.008ms 1 + torch_eager 11.39% 291.220us 99.77% 2.552ms 2.552ms 0.000us 0.00% 103.069us 103.069us 1 + aten::mul 6.75% 172.580us 11.78% 301.232us 12.551us 52.797us 51.90% 52.797us 2.200us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 52.797us 51.90% 52.797us 2.200us 24 + aten::copy_ 3.99% 102.084us 65.85% 1.684ms 93.565us 32.544us 31.99% 33.888us 1.883us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.736us 24.32% 24.736us 2.061us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.384us 16.11% 16.384us 1.365us 12 + aten::clone 0.85% 21.760us 62.39% 1.596ms 265.957us 0.000us 0.00% 9.152us 1.525us 6 + aten::sub 1.61% 41.170us 2.75% 70.342us 11.724us 8.256us 8.12% 8.256us 1.376us 6 + aten::add 1.37% 35.121us 2.54% 64.851us 10.809us 8.128us 7.99% 8.128us 1.355us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.808us 7.68% 7.808us 1.301us 6 + Activity Buffer Request 49.27% 1.260ms 49.27% 1.260ms 1.260ms 1.344us 1.32% 1.344us 1.344us 1 + aten::empty_strided 1.23% 31.560us 1.23% 31.560us 5.260us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 9.74% 249.077us 9.74% 249.077us 41.513us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.69% 68.742us 3.39% 86.703us 3.613us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.70% 17.961us 0.70% 17.961us 0.748us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 10.18% 260.466us 10.18% 260.466us 5.426us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.23% 5.870us 0.23% 5.870us 5.870us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.945ms -Self CUDA time total: 101.759us +Self CPU time total: 2.558ms +Self CUDA time total: 101.725us @@ -4136,27 +4136,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S512_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 916.066us 979.04% 916.066us 916.066us 1 - torch_eager 10.22% 297.753us 99.80% 2.907ms 2.907ms 0.000us 0.00% 94.848us 94.848us 1 - aten::mul 5.27% 153.390us 8.99% 261.760us 10.907us 48.609us 51.95% 48.609us 2.025us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 48.609us 51.95% 48.609us 2.025us 24 - aten::copy_ 3.49% 101.793us 71.96% 2.096ms 116.462us 30.720us 32.83% 32.000us 1.778us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.976us 24.56% 22.976us 1.915us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 14.239us 15.22% 14.239us 1.187us 12 - aten::clone 1.04% 30.259us 69.44% 2.023ms 337.157us 0.000us 0.00% 9.024us 1.504us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.744us 8.28% 7.744us 1.291us 6 - aten::sub 1.21% 35.379us 2.01% 58.671us 9.778us 7.135us 7.63% 7.135us 1.189us 6 - aten::add 1.06% 30.800us 1.78% 51.980us 8.663us 7.104us 7.59% 7.104us 1.184us 6 - Activity Buffer Request 59.25% 1.726ms 59.25% 1.726ms 1.726ms 1.280us 1.37% 1.280us 1.280us 1 - aten::empty_strided 1.07% 31.081us 1.07% 31.081us 5.180us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 6.98% 203.214us 6.98% 203.214us 33.869us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.14% 62.478us 2.73% 79.587us 3.316us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.59% 17.109us 0.59% 17.109us 0.713us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 7.49% 218.173us 7.49% 218.173us 4.545us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.20% 5.800us 0.20% 5.800us 5.800us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.014ms 1081.02% 1.014ms 1.014ms 1 + torch_eager 10.25% 325.467us 99.80% 3.170ms 3.170ms 0.000us 0.00% 95.040us 95.040us 1 + aten::mul 5.52% 175.344us 9.66% 306.970us 12.790us 48.800us 52.05% 48.800us 2.033us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 48.800us 52.05% 48.800us 2.033us 24 + aten::copy_ 3.31% 105.233us 71.23% 2.263ms 125.701us 30.784us 32.83% 32.064us 1.781us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 23.041us 24.57% 23.041us 1.920us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 14.176us 15.12% 14.176us 1.181us 12 + aten::clone 0.88% 27.881us 68.64% 2.180ms 363.408us 0.000us 0.00% 9.023us 1.504us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.743us 8.26% 7.743us 1.290us 6 + aten::sub 1.21% 38.440us 2.12% 67.480us 11.247us 7.136us 7.61% 7.136us 1.189us 6 + aten::add 1.04% 32.931us 1.90% 60.251us 10.042us 7.040us 7.51% 7.040us 1.173us 6 + Activity Buffer Request 54.81% 1.741ms 54.81% 1.741ms 1.741ms 1.280us 1.37% 1.280us 1.280us 1 + aten::empty_strided 1.02% 32.521us 1.02% 32.521us 5.420us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 10.79% 342.870us 10.79% 342.870us 57.145us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.15% 68.383us 2.74% 86.883us 3.620us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.58% 18.500us 0.58% 18.500us 0.771us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 8.23% 261.446us 8.23% 261.446us 5.447us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.20% 6.450us 0.20% 6.450us 6.450us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.913ms -Self CUDA time total: 93.568us +Self CPU time total: 3.177ms +Self CUDA time total: 93.760us @@ -4166,27 +4166,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S512_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 889.506us 873.60% 889.506us 889.506us 1 - torch_eager 9.47% 263.734us 99.81% 2.780ms 2.780ms 0.000us 0.00% 103.133us 103.133us 1 - aten::mul 5.38% 149.780us 9.23% 257.043us 10.710us 52.958us 52.01% 52.958us 2.207us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 52.958us 52.01% 52.958us 2.207us 24 - aten::copy_ 3.60% 100.145us 72.47% 2.019ms 112.141us 32.385us 31.81% 33.697us 1.872us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.609us 24.17% 24.609us 2.051us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.478us 16.18% 16.478us 1.373us 12 - aten::clone 0.76% 21.289us 69.67% 1.941ms 323.462us 0.000us 0.00% 9.088us 1.515us 6 - aten::sub 1.26% 35.149us 2.08% 57.971us 9.662us 8.318us 8.17% 8.318us 1.386us 6 - aten::add 1.12% 31.181us 1.89% 52.690us 8.782us 8.160us 8.01% 8.160us 1.360us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.776us 7.64% 7.776us 1.296us 6 - Activity Buffer Request 59.87% 1.668ms 59.87% 1.668ms 1.668ms 1.312us 1.29% 1.312us 1.312us 1 - aten::empty_strided 1.08% 30.102us 1.08% 30.102us 5.017us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 6.81% 189.822us 6.81% 189.822us 31.637us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.23% 62.254us 2.83% 78.841us 3.285us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.60% 16.587us 0.60% 16.587us 0.691us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 7.62% 212.384us 7.62% 212.384us 4.425us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.19% 5.281us 0.19% 5.281us 5.281us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.001ms 983.06% 1.001ms 1.001ms 1 + torch_eager 9.84% 295.255us 99.81% 2.995ms 2.995ms 0.000us 0.00% 103.136us 103.136us 1 + aten::mul 5.72% 171.754us 9.96% 298.939us 12.456us 52.896us 51.95% 52.896us 2.204us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 52.896us 51.95% 52.896us 2.204us 24 + aten::copy_ 3.46% 103.695us 71.00% 2.131ms 118.374us 32.417us 31.84% 33.729us 1.874us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.673us 24.23% 24.673us 2.056us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.511us 16.22% 16.511us 1.376us 12 + aten::clone 0.71% 21.410us 67.93% 2.039ms 339.780us 0.000us 0.00% 9.056us 1.509us 6 + aten::sub 1.35% 40.422us 2.36% 70.682us 11.780us 8.319us 8.17% 8.319us 1.386us 6 + aten::add 1.19% 35.599us 2.15% 64.481us 10.747us 8.192us 8.05% 8.192us 1.365us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.744us 7.61% 7.744us 1.291us 6 + Activity Buffer Request 57.66% 1.731ms 57.66% 1.731ms 1.731ms 1.312us 1.29% 1.312us 1.312us 1 + aten::empty_strided 1.04% 31.291us 1.04% 31.291us 5.215us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 7.41% 222.325us 7.41% 222.325us 37.054us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.17% 65.101us 2.75% 82.681us 3.445us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.59% 17.580us 0.59% 17.580us 0.733us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 8.68% 260.519us 8.68% 260.519us 5.427us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.19% 5.671us 0.19% 5.671us 5.671us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.785ms -Self CUDA time total: 101.821us +Self CPU time total: 3.001ms +Self CUDA time total: 101.824us @@ -4196,27 +4196,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S512_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 917.279us 760.79% 917.279us 917.279us 1 - torch_eager 9.49% 270.296us 99.79% 2.843ms 2.843ms 0.000us 0.00% 122.393us 122.393us 1 - aten::mul 5.35% 152.294us 9.23% 262.963us 10.957us 62.014us 51.43% 62.014us 2.584us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 62.014us 51.43% 62.014us 2.584us 24 - aten::copy_ 3.56% 101.468us 72.26% 2.058ms 114.357us 39.420us 32.69% 41.243us 2.291us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.894us 23.96% 28.894us 2.408us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.136us 15.87% 19.136us 1.595us 12 - aten::clone 0.72% 20.551us 69.70% 1.986ms 330.917us 0.000us 0.00% 12.349us 2.058us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.526us 8.73% 10.526us 1.754us 6 - aten::sub 1.26% 35.841us 2.08% 59.281us 9.880us 9.600us 7.96% 9.600us 1.600us 6 - aten::add 1.13% 32.290us 1.93% 54.990us 9.165us 9.536us 7.91% 9.536us 1.589us 6 - Activity Buffer Request 59.91% 1.707ms 59.91% 1.707ms 1.707ms 1.823us 1.51% 1.823us 1.823us 1 - aten::empty_strided 1.30% 36.920us 1.30% 36.920us 6.153us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 6.60% 187.893us 6.60% 187.893us 31.316us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.17% 61.750us 2.78% 79.243us 3.302us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.61% 17.493us 0.61% 17.493us 0.729us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 7.70% 219.342us 7.70% 219.342us 4.570us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.21% 5.890us 0.21% 5.890us 5.890us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.015ms 840.95% 1.015ms 1.015ms 1 + torch_eager 9.74% 291.664us 99.80% 2.987ms 2.987ms 0.000us 0.00% 122.530us 122.530us 1 + aten::mul 5.84% 174.903us 10.28% 307.588us 12.816us 62.144us 51.48% 62.144us 2.589us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 62.144us 51.48% 62.144us 2.589us 24 + aten::copy_ 3.43% 102.713us 70.67% 2.115ms 117.501us 39.265us 32.53% 41.090us 2.283us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.864us 23.91% 28.864us 2.405us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.296us 15.99% 19.296us 1.608us 12 + aten::clone 0.71% 21.240us 67.46% 2.019ms 336.492us 0.000us 0.00% 12.226us 2.038us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.401us 8.62% 10.401us 1.733us 6 + aten::add 1.16% 34.581us 2.15% 64.442us 10.740us 9.696us 8.03% 9.696us 1.616us 6 + aten::sub 1.36% 40.831us 2.36% 70.613us 11.769us 9.600us 7.95% 9.600us 1.600us 6 + Activity Buffer Request 57.99% 1.736ms 57.99% 1.736ms 1.736ms 1.825us 1.51% 1.825us 1.825us 1 + aten::empty_strided 1.04% 31.082us 1.04% 31.082us 5.180us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 6.60% 197.545us 6.60% 197.545us 32.924us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.24% 66.920us 2.85% 85.310us 3.555us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.61% 18.390us 0.61% 18.390us 0.766us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 9.07% 271.392us 9.07% 271.392us 5.654us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.20% 6.001us 0.20% 6.001us 6.001us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.849ms -Self CUDA time total: 120.570us +Self CPU time total: 2.993ms +Self CUDA time total: 120.705us @@ -4226,27 +4226,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S512_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 924.864us 536.51% 924.864us 924.864us 1 - torch_eager 10.45% 305.132us 99.82% 2.915ms 2.915ms 0.000us 0.00% 175.296us 175.296us 1 - aten::mul 5.14% 150.181us 8.96% 261.653us 10.902us 89.408us 51.87% 89.408us 3.725us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 89.408us 51.87% 89.408us 3.725us 24 - aten::copy_ 3.52% 102.713us 71.68% 2.093ms 116.286us 57.856us 33.56% 60.768us 3.376us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.768us 23.65% 40.768us 3.397us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 25.120us 14.57% 25.120us 2.093us 12 - aten::clone 1.06% 30.960us 69.26% 2.022ms 337.044us 0.000us 0.00% 20.000us 3.333us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.088us 9.91% 17.088us 2.848us 6 - aten::add 1.07% 31.150us 1.81% 52.780us 8.797us 12.672us 7.35% 12.672us 2.112us 6 - aten::sub 1.22% 35.659us 2.01% 58.661us 9.777us 12.448us 7.22% 12.448us 2.075us 6 - Activity Buffer Request 59.67% 1.742ms 59.67% 1.742ms 1.742ms 2.912us 1.69% 2.912us 2.912us 1 - aten::empty_strided 1.10% 32.260us 1.10% 32.260us 5.377us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 6.30% 183.953us 6.30% 183.953us 30.659us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.15% 62.747us 2.74% 80.125us 3.339us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.60% 17.378us 0.60% 17.378us 0.724us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 7.55% 220.376us 7.55% 220.376us 4.591us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.18% 5.231us 0.18% 5.231us 5.231us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.002ms 581.09% 1.002ms 1.002ms 1 + torch_eager 20.60% 295.691us 99.58% 1.430ms 1.430ms 0.000us 0.00% 175.327us 175.327us 1 + aten::mul 11.87% 170.476us 20.75% 297.916us 12.413us 89.503us 51.89% 89.503us 3.729us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 89.503us 51.89% 89.503us 3.729us 24 + aten::copy_ 7.24% 103.879us 39.87% 572.479us 31.804us 57.887us 33.56% 60.735us 3.374us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.832us 23.67% 40.832us 3.403us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 25.089us 14.55% 25.089us 2.091us 12 + aten::clone 1.51% 21.660us 33.62% 482.661us 80.443us 0.000us 0.00% 19.903us 3.317us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.055us 9.89% 17.055us 2.842us 6 + aten::sub 2.80% 40.184us 4.70% 67.454us 11.242us 12.546us 7.27% 12.546us 2.091us 6 + aten::add 2.41% 34.531us 4.25% 60.992us 10.165us 12.543us 7.27% 12.543us 2.090us 6 + Activity Buffer Request 14.04% 201.545us 14.04% 201.545us 201.545us 2.848us 1.65% 2.848us 2.848us 1 + aten::empty_strided 2.07% 29.712us 2.07% 29.712us 4.952us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 13.60% 195.253us 13.60% 195.253us 32.542us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.57% 65.572us 5.83% 83.731us 3.489us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 1.26% 18.159us 1.26% 18.159us 0.757us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 17.62% 252.973us 17.62% 252.973us 5.270us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.42% 6.061us 0.42% 6.061us 6.061us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.920ms -Self CUDA time total: 172.384us +Self CPU time total: 1.436ms +Self CUDA time total: 172.479us @@ -4256,27 +4256,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S2048_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 891.930us 739.34% 891.930us 891.930us 1 - torch_eager 21.05% 265.905us 99.59% 1.258ms 1.258ms 0.000us 0.00% 122.398us 122.398us 1 - aten::mul 11.89% 150.169us 20.37% 257.293us 10.721us 62.046us 51.43% 62.046us 2.585us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 62.046us 51.43% 62.046us 2.585us 24 - aten::copy_ 7.91% 99.966us 39.16% 494.681us 27.482us 39.361us 32.63% 41.121us 2.285us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.961us 24.01% 28.961us 2.413us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.231us 15.94% 19.231us 1.603us 12 - aten::clone 1.55% 19.527us 32.87% 415.246us 69.208us 0.000us 0.00% 12.160us 2.027us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.400us 8.62% 10.400us 1.733us 6 - aten::add 2.51% 31.670us 4.18% 52.860us 8.810us 9.663us 8.01% 9.663us 1.611us 6 - aten::sub 2.77% 34.981us 4.60% 58.113us 9.685us 9.568us 7.93% 9.568us 1.595us 6 - Activity Buffer Request 11.68% 147.573us 11.68% 147.573us 147.573us 1.760us 1.46% 1.760us 1.760us 1 - aten::empty_strided 2.49% 31.403us 2.49% 31.403us 5.234us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 14.63% 184.842us 14.63% 184.842us 30.807us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.89% 61.717us 6.20% 78.378us 3.266us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 1.32% 16.661us 1.32% 16.661us 0.694us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 16.92% 213.746us 16.92% 213.746us 4.453us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.41% 5.161us 0.41% 5.161us 5.161us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 998.580us 827.30% 998.580us 998.580us 1 + torch_eager 20.47% 288.290us 99.58% 1.403ms 1.403ms 0.000us 0.00% 122.464us 122.464us 1 + aten::mul 12.45% 175.343us 21.53% 303.228us 12.634us 62.175us 51.51% 62.175us 2.591us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 62.175us 51.51% 62.175us 2.591us 24 + aten::copy_ 7.38% 103.973us 38.88% 547.625us 30.424us 39.265us 32.53% 41.025us 2.279us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.800us 23.86% 28.800us 2.400us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.264us 15.96% 19.264us 1.605us 12 + aten::clone 1.40% 19.710us 32.30% 454.921us 75.820us 0.000us 0.00% 12.225us 2.038us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.465us 8.67% 10.465us 1.744us 6 + aten::add 2.42% 34.049us 4.36% 61.370us 10.228us 9.695us 8.03% 9.695us 1.616us 6 + aten::sub 2.76% 38.918us 4.74% 66.750us 11.125us 9.569us 7.93% 9.569us 1.595us 6 + Activity Buffer Request 13.00% 183.145us 13.00% 183.145us 183.145us 1.760us 1.46% 1.760us 1.760us 1 + aten::empty_strided 2.13% 30.010us 2.13% 30.010us 5.002us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 13.33% 187.794us 13.33% 187.794us 31.299us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.79% 67.456us 6.09% 85.722us 3.572us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 1.30% 18.266us 1.30% 18.266us 0.761us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 18.16% 255.751us 18.16% 255.751us 5.328us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.42% 5.851us 0.42% 5.851us 5.851us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.263ms -Self CUDA time total: 120.638us +Self CPU time total: 1.409ms +Self CUDA time total: 120.704us @@ -4286,27 +4286,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S2048_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 896.465us 520.02% 896.465us 896.465us 1 - torch_eager 20.64% 267.500us 99.56% 1.290ms 1.290ms 0.000us 0.00% 175.237us 175.237us 1 - aten::mul 11.74% 152.190us 20.16% 261.202us 10.883us 89.569us 51.96% 89.569us 3.732us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 89.569us 51.96% 89.569us 3.732us 24 - aten::copy_ 7.78% 100.776us 40.08% 519.323us 28.851us 57.731us 33.49% 60.579us 3.366us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.803us 23.67% 40.803us 3.400us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 25.089us 14.55% 25.089us 2.091us 12 - aten::clone 1.61% 20.829us 34.03% 440.978us 73.496us 0.000us 0.00% 19.776us 3.296us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.928us 9.82% 16.928us 2.821us 6 - aten::add 2.36% 30.520us 4.00% 51.870us 8.645us 12.608us 7.31% 12.608us 2.101us 6 - aten::sub 2.79% 36.204us 4.56% 59.064us 9.844us 12.481us 7.24% 12.481us 2.080us 6 - Activity Buffer Request 13.56% 175.743us 13.56% 175.743us 175.743us 2.848us 1.65% 2.848us 2.848us 1 - aten::empty_strided 2.40% 31.051us 2.40% 31.051us 5.175us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 13.98% 181.154us 13.98% 181.154us 30.192us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.80% 62.202us 6.12% 79.302us 3.304us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 1.32% 17.100us 1.32% 17.100us 0.713us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 16.58% 214.872us 16.58% 214.872us 4.477us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.44% 5.731us 0.44% 5.731us 5.731us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 990.551us 575.17% 990.551us 990.551us 1 + torch_eager 9.55% 281.107us 99.81% 2.938ms 2.938ms 0.000us 0.00% 175.100us 175.100us 1 + aten::mul 5.81% 170.942us 10.23% 301.167us 12.549us 89.468us 51.95% 89.468us 3.728us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 89.468us 51.95% 89.468us 3.728us 24 + aten::copy_ 3.50% 102.971us 70.92% 2.088ms 115.984us 57.728us 33.52% 60.608us 3.367us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.800us 23.69% 40.800us 3.400us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 25.024us 14.53% 25.024us 2.085us 12 + aten::clone 0.79% 23.370us 68.06% 2.004ms 333.942us 0.000us 0.00% 19.808us 3.301us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.928us 9.83% 16.928us 2.821us 6 + aten::add 1.19% 35.020us 2.08% 61.181us 10.197us 12.513us 7.27% 12.513us 2.085us 6 + aten::sub 1.27% 37.451us 2.29% 67.451us 11.242us 12.511us 7.26% 12.511us 2.085us 6 + Activity Buffer Request 58.47% 1.721ms 58.47% 1.721ms 1.721ms 2.880us 1.67% 2.880us 2.880us 1 + aten::empty_strided 1.04% 30.722us 1.04% 30.722us 5.120us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 6.59% 193.904us 6.59% 193.904us 32.317us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.29% 67.323us 2.91% 85.623us 3.568us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.62% 18.300us 0.62% 18.300us 0.762us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 8.69% 255.869us 8.69% 255.869us 5.331us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.19% 5.631us 0.19% 5.631us 5.631us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.296ms -Self CUDA time total: 172.389us +Self CPU time total: 2.944ms +Self CUDA time total: 172.220us @@ -4316,27 +4316,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S2048_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 907.999us 318.14% 907.999us 907.999us 1 - torch_eager 10.33% 297.182us 99.78% 2.870ms 2.870ms 0.000us 0.00% 303.265us 303.265us 1 - aten::mul 5.13% 147.584us 8.90% 255.976us 10.666us 133.567us 46.80% 133.567us 5.565us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 133.567us 46.80% 133.567us 5.565us 24 - aten::copy_ 3.51% 101.030us 71.83% 2.067ms 114.810us 110.722us 38.79% 128.578us 7.143us 18 - aten::clone 0.94% 26.992us 69.34% 1.995ms 332.479us 0.000us 0.00% 71.233us 11.872us 6 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 57.345us 20.09% 57.345us 4.779us 12 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 53.377us 18.70% 53.377us 8.896us 6 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 41.120us 14.41% 41.120us 3.427us 12 - aten::sub 1.21% 34.680us 2.01% 57.770us 9.628us 20.833us 7.30% 20.833us 3.472us 6 - aten::add 1.09% 31.394us 1.89% 54.514us 9.086us 20.287us 7.11% 20.287us 3.381us 6 - Activity Buffer Request 59.89% 1.723ms 59.89% 1.723ms 1.723ms 17.856us 6.26% 17.856us 17.856us 1 - aten::empty_strided 1.06% 30.410us 1.06% 30.410us 5.068us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 6.35% 182.753us 6.35% 182.753us 30.459us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.22% 63.807us 2.82% 81.011us 3.375us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.60% 17.204us 0.60% 17.204us 0.717us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 7.46% 214.543us 7.46% 214.543us 4.470us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.22% 6.420us 0.22% 6.420us 6.420us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 996.043us 348.79% 996.043us 996.043us 1 + torch_eager 20.87% 290.654us 99.57% 1.387ms 1.387ms 0.000us 0.00% 304.030us 304.030us 1 + aten::mul 12.33% 171.709us 21.56% 300.326us 12.514us 133.152us 46.63% 133.152us 5.548us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 133.152us 46.63% 133.152us 5.548us 24 + aten::copy_ 7.20% 100.319us 38.33% 533.940us 29.663us 111.295us 38.97% 129.758us 7.209us 18 + aten::clone 1.43% 19.920us 31.74% 442.109us 73.685us 0.000us 0.00% 72.510us 12.085us 6 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 57.248us 20.05% 57.248us 4.771us 12 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 54.047us 18.93% 54.047us 9.008us 6 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 41.120us 14.40% 41.120us 3.427us 12 + aten::sub 2.81% 39.111us 4.76% 66.281us 11.047us 20.672us 7.24% 20.672us 3.445us 6 + aten::add 2.49% 34.670us 4.42% 61.560us 10.260us 20.448us 7.16% 20.448us 3.408us 6 + Activity Buffer Request 12.17% 169.484us 12.17% 169.484us 169.484us 18.463us 6.47% 18.463us 18.463us 1 + aten::empty_strided 2.13% 29.640us 2.13% 29.640us 4.940us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 13.75% 191.524us 13.75% 191.524us 31.921us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.84% 67.441us 6.08% 84.703us 3.529us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 1.24% 17.262us 1.24% 17.262us 0.719us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 18.33% 255.290us 18.33% 255.290us 5.319us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.43% 5.940us 0.43% 5.940us 5.940us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.877ms -Self CUDA time total: 285.409us +Self CPU time total: 1.393ms +Self CUDA time total: 285.567us @@ -4346,27 +4346,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S2048_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 936.507us 164.60% 936.507us 936.507us 1 - torch_eager 21.10% 274.795us 99.58% 1.297ms 1.297ms 0.000us 0.00% 592.769us 592.769us 1 - aten::copy_ 7.72% 100.512us 38.34% 499.337us 27.741us 273.025us 47.99% 296.833us 16.491us 18 - aten::mul 11.94% 155.505us 20.72% 269.807us 11.242us 230.561us 40.52% 230.561us 9.607us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 230.561us 40.52% 230.561us 9.607us 24 - aten::clone 1.56% 20.351us 32.41% 422.047us 70.341us 0.000us 0.00% 206.465us 34.411us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 182.657us 32.10% 182.657us 30.443us 6 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 90.368us 15.88% 90.368us 7.531us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 65.375us 11.49% 65.375us 5.448us 12 - aten::sub 2.96% 38.510us 4.81% 62.580us 10.430us 32.991us 5.80% 32.991us 5.498us 6 - aten::add 2.43% 31.581us 4.16% 54.201us 9.034us 32.384us 5.69% 32.384us 5.397us 6 - Activity Buffer Request 11.20% 145.802us 11.20% 145.802us 145.802us 23.808us 4.18% 23.808us 23.808us 1 - aten::empty_strided 2.79% 36.361us 2.79% 36.361us 6.060us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 14.42% 187.792us 14.42% 187.792us 31.299us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.78% 62.259us 6.10% 79.429us 3.310us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 1.32% 17.170us 1.32% 17.170us 0.715us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 17.37% 226.223us 17.37% 226.223us 4.713us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.42% 5.440us 0.42% 5.440us 5.440us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.018ms 178.84% 1.018ms 1.018ms 1 + torch_eager 20.93% 294.497us 99.60% 1.402ms 1.402ms 0.000us 0.00% 592.727us 592.727us 1 + aten::copy_ 7.21% 101.432us 37.46% 527.132us 29.285us 273.662us 48.09% 297.374us 16.521us 18 + aten::mul 12.77% 179.663us 21.96% 309.058us 12.877us 229.816us 40.39% 229.816us 9.576us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 229.816us 40.39% 229.816us 9.576us 24 + aten::clone 1.44% 20.252us 30.89% 434.772us 72.462us 0.000us 0.00% 206.559us 34.426us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 182.847us 32.13% 182.847us 30.474us 6 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 90.815us 15.96% 90.815us 7.568us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 65.537us 11.52% 65.537us 5.461us 12 + aten::sub 2.76% 38.901us 5.06% 71.272us 11.879us 32.865us 5.78% 32.865us 5.477us 6 + aten::add 2.62% 36.830us 4.51% 63.532us 10.589us 32.672us 5.74% 32.672us 5.445us 6 + Activity Buffer Request 11.90% 167.504us 11.90% 167.504us 167.504us 23.712us 4.17% 23.712us 23.712us 1 + aten::empty_strided 2.15% 30.291us 2.15% 30.291us 5.049us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 13.10% 184.385us 13.10% 184.385us 30.731us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.77% 67.152us 6.08% 85.571us 3.565us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 1.31% 18.419us 1.31% 18.419us 0.767us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 18.64% 262.279us 18.64% 262.279us 5.464us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.40% 5.670us 0.40% 5.670us 5.670us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.302ms -Self CUDA time total: 568.961us +Self CPU time total: 1.407ms +Self CUDA time total: 569.015us @@ -4376,27 +4376,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S128_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 912.214us 987.11% 912.214us 912.214us 1 - torch_eager 20.73% 270.361us 99.60% 1.299ms 1.299ms 0.000us 0.00% 93.533us 93.533us 1 - aten::mul 11.80% 153.962us 20.35% 265.434us 11.060us 49.471us 53.53% 49.471us 2.061us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 49.471us 53.53% 49.471us 2.061us 24 - aten::copy_ 7.83% 102.175us 39.79% 518.911us 28.828us 29.343us 31.75% 30.463us 1.692us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.559us 24.41% 22.559us 1.880us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.599us 14.72% 13.599us 1.133us 12 - aten::clone 1.57% 20.491us 33.39% 435.488us 72.581us 0.000us 0.00% 7.904us 1.317us 6 - aten::add 2.51% 32.770us 4.19% 54.680us 9.113us 6.815us 7.37% 6.815us 1.136us 6 - aten::sub 2.78% 36.231us 4.59% 59.802us 9.967us 6.784us 7.34% 6.784us 1.131us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.784us 7.34% 6.784us 1.131us 6 - Activity Buffer Request 12.62% 164.552us 12.62% 164.552us 164.552us 1.120us 1.21% 1.120us 1.120us 1 - aten::empty_strided 2.31% 30.071us 2.31% 30.071us 5.012us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 14.34% 187.054us 14.34% 187.054us 31.176us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.77% 62.210us 6.08% 79.271us 3.303us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 1.31% 17.061us 1.31% 17.061us 0.711us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 17.03% 222.083us 17.03% 222.083us 4.627us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.40% 5.200us 0.40% 5.200us 5.200us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.000ms 1080.76% 1.000ms 1.000ms 1 + torch_eager 9.93% 294.202us 99.82% 2.956ms 2.956ms 0.000us 0.00% 93.659us 93.659us 1 + aten::mul 5.79% 171.584us 10.10% 299.275us 12.470us 49.565us 53.56% 49.565us 2.065us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 49.565us 53.56% 49.565us 2.065us 24 + aten::copy_ 3.59% 106.321us 70.88% 2.099ms 116.627us 29.406us 31.78% 30.526us 1.696us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.623us 24.45% 22.623us 1.885us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.568us 14.66% 13.568us 1.131us 12 + aten::clone 0.71% 20.970us 67.84% 2.009ms 334.883us 0.000us 0.00% 7.903us 1.317us 6 + aten::sub 1.35% 39.881us 2.33% 69.031us 11.505us 6.784us 7.33% 6.784us 1.131us 6 + aten::add 1.15% 34.189us 1.99% 58.861us 9.810us 6.784us 7.33% 6.784us 1.131us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.783us 7.33% 6.783us 1.130us 6 + Activity Buffer Request 58.33% 1.728ms 58.33% 1.728ms 1.728ms 1.120us 1.21% 1.120us 1.120us 1 + aten::empty_strided 1.04% 30.880us 1.04% 30.880us 5.147us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 6.53% 193.506us 6.53% 193.506us 32.251us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.25% 66.634us 2.83% 83.844us 3.493us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.58% 17.210us 0.58% 17.210us 0.717us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 8.56% 253.394us 8.56% 253.394us 5.279us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.18% 5.350us 0.18% 5.350us 5.350us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.304ms -Self CUDA time total: 92.413us +Self CPU time total: 2.962ms +Self CUDA time total: 92.539us @@ -4406,27 +4406,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S128_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 927.225us 964.57% 927.225us 927.225us 1 - torch_eager 10.42% 298.637us 99.83% 2.862ms 2.862ms 0.000us 0.00% 97.440us 97.440us 1 - aten::mul 5.27% 151.202us 9.04% 259.193us 10.800us 51.264us 53.33% 51.264us 2.136us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 51.264us 53.33% 51.264us 2.136us 24 - aten::copy_ 3.63% 103.959us 71.40% 2.047ms 113.712us 30.719us 31.96% 32.031us 1.780us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.878us 23.80% 22.878us 1.907us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 14.145us 14.71% 14.145us 1.179us 12 - aten::clone 0.93% 26.740us 68.76% 1.971ms 328.494us 0.000us 0.00% 9.153us 1.526us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.841us 8.16% 7.841us 1.307us 6 - aten::add 1.34% 38.470us 2.12% 60.910us 10.152us 7.105us 7.39% 7.105us 1.184us 6 - aten::sub 1.24% 35.481us 2.03% 58.281us 9.714us 7.040us 7.32% 7.040us 1.173us 6 - Activity Buffer Request 58.96% 1.690ms 58.96% 1.690ms 1.690ms 1.312us 1.36% 1.312us 1.312us 1 - aten::empty_strided 1.07% 30.811us 1.07% 30.811us 5.135us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 6.60% 189.265us 6.60% 189.265us 31.544us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.21% 63.469us 2.80% 80.301us 3.346us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.59% 16.832us 0.59% 16.832us 0.701us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 7.56% 216.723us 7.56% 216.723us 4.515us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.17% 4.900us 0.17% 4.900us 4.900us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.005ms 1046.18% 1.005ms 1.005ms 1 + torch_eager 20.70% 290.067us 99.57% 1.395ms 1.395ms 0.000us 0.00% 97.374us 97.374us 1 + aten::mul 12.93% 181.162us 22.03% 308.587us 12.858us 51.138us 53.23% 51.138us 2.131us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 51.138us 53.23% 51.138us 2.131us 24 + aten::copy_ 7.20% 100.934us 37.57% 526.405us 29.245us 30.750us 32.01% 32.061us 1.781us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.975us 23.92% 22.975us 1.915us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 14.175us 14.76% 14.175us 1.181us 12 + aten::clone 1.53% 21.448us 31.30% 438.559us 73.093us 0.000us 0.00% 9.086us 1.514us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.775us 8.09% 7.775us 1.296us 6 + aten::sub 2.81% 39.410us 4.84% 67.761us 11.294us 7.103us 7.39% 7.103us 1.184us 6 + aten::add 2.52% 35.299us 4.51% 63.202us 10.534us 7.072us 7.36% 7.072us 1.179us 6 + Activity Buffer Request 11.90% 166.735us 11.90% 166.735us 166.735us 1.311us 1.36% 1.311us 1.311us 1 + aten::empty_strided 2.22% 31.071us 2.22% 31.071us 5.178us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 13.33% 186.813us 13.33% 186.813us 31.136us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.90% 68.622us 6.17% 86.384us 3.599us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 1.27% 17.762us 1.27% 17.762us 0.740us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 18.24% 255.602us 18.24% 255.602us 5.325us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.43% 6.050us 0.43% 6.050us 6.050us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.867ms -Self CUDA time total: 96.128us +Self CPU time total: 1.401ms +Self CUDA time total: 96.063us @@ -4436,27 +4436,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S128_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 904.125us 867.77% 904.125us 904.125us 1 - torch_eager 21.17% 270.270us 99.58% 1.271ms 1.271ms 0.000us 0.00% 105.501us 105.501us 1 - aten::mul 11.71% 149.569us 20.29% 259.044us 10.793us 55.455us 53.23% 55.455us 2.311us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 55.455us 53.23% 55.455us 2.311us 24 - aten::copy_ 7.94% 101.312us 39.18% 500.180us 27.788us 32.416us 31.11% 33.728us 1.874us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.704us 23.71% 24.704us 2.059us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.318us 15.66% 16.318us 1.360us 12 - aten::clone 1.62% 20.642us 32.82% 418.978us 69.830us 0.000us 0.00% 9.024us 1.504us 6 - aten::sub 2.73% 34.811us 4.52% 57.673us 9.612us 8.256us 7.92% 8.256us 1.376us 6 - aten::add 2.55% 32.590us 4.27% 54.470us 9.078us 8.062us 7.74% 8.062us 1.344us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.712us 7.40% 7.712us 1.285us 6 - Activity Buffer Request 11.60% 148.113us 11.60% 148.113us 148.113us 1.312us 1.26% 1.312us 1.312us 1 - aten::empty_strided 2.40% 30.670us 2.40% 30.670us 5.112us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 14.64% 186.881us 14.64% 186.881us 31.147us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.82% 61.602us 6.15% 78.501us 3.271us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 1.32% 16.899us 1.32% 16.899us 0.704us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 17.08% 218.091us 17.08% 218.091us 4.544us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.42% 5.300us 0.42% 5.300us 5.300us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.008ms 966.66% 1.008ms 1.008ms 1 + torch_eager 19.87% 287.726us 99.59% 1.442ms 1.442ms 0.000us 0.00% 105.564us 105.564us 1 + aten::mul 12.48% 180.745us 21.62% 312.998us 13.042us 55.327us 53.07% 55.327us 2.305us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 55.327us 53.07% 55.327us 2.305us 24 + aten::copy_ 7.05% 102.025us 39.57% 572.964us 31.831us 32.416us 31.09% 33.727us 1.874us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.737us 23.73% 24.737us 2.061us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.510us 15.84% 16.510us 1.376us 12 + aten::clone 1.40% 20.271us 33.26% 481.521us 80.254us 0.000us 0.00% 8.990us 1.498us 6 + aten::sub 2.64% 38.220us 4.70% 68.062us 11.344us 8.351us 8.01% 8.351us 1.392us 6 + aten::add 2.49% 36.083us 4.34% 62.773us 10.462us 8.159us 7.83% 8.159us 1.360us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.679us 7.37% 7.679us 1.280us 6 + Activity Buffer Request 13.36% 193.394us 13.36% 193.394us 193.394us 1.311us 1.26% 1.311us 1.311us 1 + aten::empty_strided 2.11% 30.520us 2.11% 30.520us 5.087us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 14.10% 204.105us 14.10% 204.105us 34.017us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.72% 68.322us 5.99% 86.681us 3.612us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 1.27% 18.359us 1.27% 18.359us 0.765us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 18.11% 262.225us 18.11% 262.225us 5.463us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.41% 5.880us 0.41% 5.880us 5.880us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.277ms -Self CUDA time total: 104.189us +Self CPU time total: 1.448ms +Self CUDA time total: 104.253us @@ -4466,27 +4466,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S128_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 918.160us 742.36% 918.160us 918.160us 1 - torch_eager 20.43% 265.704us 99.60% 1.295ms 1.295ms 0.000us 0.00% 125.506us 125.506us 1 - aten::mul 11.98% 155.738us 20.81% 270.642us 11.277us 65.023us 52.57% 65.023us 2.709us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 65.023us 52.57% 65.023us 2.709us 24 - aten::copy_ 7.90% 102.742us 39.27% 510.720us 28.373us 39.331us 31.80% 41.155us 2.286us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.898us 23.36% 28.898us 2.408us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.328us 15.63% 19.328us 1.611us 12 - aten::clone 1.56% 20.250us 32.83% 426.918us 71.153us 0.000us 0.00% 12.257us 2.043us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.433us 8.44% 10.433us 1.739us 6 - aten::add 2.56% 33.260us 4.26% 55.420us 9.237us 9.728us 7.87% 9.728us 1.621us 6 - aten::sub 2.77% 36.070us 4.64% 60.350us 10.058us 9.600us 7.76% 9.600us 1.600us 6 - Activity Buffer Request 12.28% 159.673us 12.28% 159.673us 159.673us 1.824us 1.47% 1.824us 1.824us 1 - aten::empty_strided 2.40% 31.241us 2.40% 31.241us 5.207us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 14.07% 182.993us 14.07% 182.993us 30.499us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.89% 63.631us 6.22% 80.874us 3.370us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 1.33% 17.243us 1.33% 17.243us 0.718us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 17.43% 226.656us 17.43% 226.656us 4.722us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.40% 5.220us 0.40% 5.220us 5.220us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.006ms 813.90% 1.006ms 1.006ms 1 + torch_eager 9.94% 295.168us 99.80% 2.964ms 2.964ms 0.000us 0.00% 125.309us 125.309us 1 + aten::mul 5.76% 171.113us 10.09% 299.797us 12.492us 65.183us 52.76% 65.183us 2.716us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 65.183us 52.76% 65.183us 2.716us 24 + aten::copy_ 3.36% 99.933us 70.61% 2.097ms 116.499us 39.102us 31.65% 40.862us 2.270us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.702us 23.23% 28.702us 2.392us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.264us 15.59% 19.264us 1.605us 12 + aten::clone 0.72% 21.400us 67.60% 2.008ms 334.638us 0.000us 0.00% 12.160us 2.027us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.400us 8.42% 10.400us 1.733us 6 + aten::sub 1.36% 40.342us 2.40% 71.402us 11.900us 9.664us 7.82% 9.664us 1.611us 6 + aten::add 1.19% 35.292us 2.11% 62.642us 10.440us 9.600us 7.77% 9.600us 1.600us 6 + Activity Buffer Request 58.22% 1.729ms 58.22% 1.729ms 1.729ms 1.760us 1.42% 1.760us 1.760us 1 + aten::empty_strided 1.03% 30.470us 1.03% 30.470us 5.078us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 6.55% 194.565us 6.55% 194.565us 32.427us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.28% 67.682us 2.90% 86.242us 3.593us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.62% 18.560us 0.62% 18.560us 0.773us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 8.77% 260.395us 8.77% 260.395us 5.425us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.20% 5.880us 0.20% 5.880us 5.880us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.300ms -Self CUDA time total: 123.682us +Self CPU time total: 2.970ms +Self CUDA time total: 123.549us @@ -4496,27 +4496,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S512_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 920.156us 882.07% 920.156us 920.156us 1 - torch_eager 10.51% 302.454us 99.79% 2.872ms 2.872ms 0.000us 0.00% 105.630us 105.630us 1 - aten::mul 5.35% 153.872us 9.14% 263.076us 10.962us 55.392us 53.10% 55.392us 2.308us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 55.392us 53.10% 55.392us 2.308us 24 - aten::copy_ 3.49% 100.493us 71.36% 2.053ms 114.078us 32.479us 31.13% 33.791us 1.877us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.736us 23.71% 24.736us 2.061us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.447us 15.77% 16.447us 1.371us 12 - aten::clone 1.01% 28.981us 68.90% 1.983ms 330.436us 0.000us 0.00% 9.055us 1.509us 6 - aten::sub 1.22% 35.200us 2.05% 58.980us 9.830us 8.288us 7.94% 8.288us 1.381us 6 - aten::add 1.08% 30.999us 1.81% 52.042us 8.674us 8.159us 7.82% 8.159us 1.360us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.743us 7.42% 7.743us 1.290us 6 - Activity Buffer Request 59.15% 1.702ms 59.15% 1.702ms 1.702ms 1.312us 1.26% 1.312us 1.312us 1 - aten::empty_strided 1.11% 32.080us 1.11% 32.080us 5.347us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 6.49% 186.892us 6.49% 186.892us 31.149us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.21% 63.599us 2.80% 80.681us 3.362us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.59% 17.082us 0.59% 17.082us 0.712us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 7.57% 217.817us 7.57% 217.817us 4.538us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.21% 6.030us 0.21% 6.030us 6.030us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 983.089us 943.25% 983.089us 983.089us 1 + torch_eager 19.98% 288.033us 99.62% 1.436ms 1.436ms 0.000us 0.00% 105.536us 105.536us 1 + aten::mul 11.74% 169.216us 20.40% 294.119us 12.255us 55.424us 53.18% 55.424us 2.309us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 55.424us 53.18% 55.424us 2.309us 24 + aten::copy_ 7.16% 103.226us 41.05% 591.856us 32.881us 32.352us 31.04% 33.664us 1.870us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.640us 23.64% 24.640us 2.053us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.448us 15.78% 16.448us 1.371us 12 + aten::clone 1.52% 21.981us 35.14% 506.672us 84.445us 0.000us 0.00% 9.024us 1.504us 6 + aten::sub 2.73% 39.361us 4.47% 64.512us 10.752us 8.289us 7.95% 8.289us 1.381us 6 + aten::add 2.40% 34.591us 4.34% 62.531us 10.422us 8.159us 7.83% 8.159us 1.360us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.712us 7.40% 7.712us 1.285us 6 + Activity Buffer Request 15.86% 228.735us 15.86% 228.735us 228.735us 1.312us 1.26% 1.312us 1.312us 1 + aten::empty_strided 2.19% 31.580us 2.19% 31.580us 5.263us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 13.29% 191.574us 13.29% 191.574us 31.929us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.47% 64.452us 5.67% 81.763us 3.407us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 1.20% 17.311us 1.20% 17.311us 0.721us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 17.08% 246.315us 17.08% 246.315us 5.132us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.38% 5.431us 0.38% 5.431us 5.431us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.878ms -Self CUDA time total: 104.318us +Self CPU time total: 1.442ms +Self CUDA time total: 104.224us @@ -4526,27 +4526,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S512_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 907.897us 732.75% 907.897us 907.897us 1 - torch_eager 10.14% 290.438us 99.82% 2.860ms 2.860ms 0.000us 0.00% 125.694us 125.694us 1 - aten::mul 5.20% 149.121us 8.91% 255.361us 10.640us 65.405us 52.79% 65.405us 2.725us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 65.405us 52.79% 65.405us 2.725us 24 - aten::copy_ 3.52% 100.835us 71.92% 2.061ms 114.473us 39.359us 31.77% 41.151us 2.286us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.927us 23.35% 28.927us 2.411us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.138us 15.45% 19.138us 1.595us 12 - aten::clone 0.91% 26.167us 69.31% 1.986ms 330.944us 0.000us 0.00% 12.224us 2.037us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.432us 8.42% 10.432us 1.739us 6 - aten::sub 1.49% 42.714us 2.30% 65.852us 10.975us 9.601us 7.75% 9.601us 1.600us 6 - aten::add 1.10% 31.550us 1.85% 52.920us 8.820us 9.537us 7.70% 9.537us 1.589us 6 - Activity Buffer Request 59.69% 1.710ms 59.69% 1.710ms 1.710ms 1.792us 1.45% 1.792us 1.792us 1 - aten::empty_strided 1.05% 30.111us 1.05% 30.111us 5.018us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 6.49% 186.023us 6.49% 186.023us 31.004us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.15% 61.602us 2.74% 78.501us 3.271us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.59% 16.899us 0.59% 16.899us 0.704us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 7.48% 214.199us 7.48% 214.199us 4.462us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.18% 5.240us 0.18% 5.240us 5.240us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.004ms 809.35% 1.004ms 1.004ms 1 + torch_eager 20.77% 293.259us 99.59% 1.406ms 1.406ms 0.000us 0.00% 125.785us 125.785us 1 + aten::mul 11.90% 168.115us 21.00% 296.496us 12.354us 65.470us 52.80% 65.470us 2.728us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 65.470us 52.80% 65.470us 2.728us 24 + aten::copy_ 7.29% 102.893us 38.63% 545.534us 30.307us 39.326us 31.72% 41.117us 2.284us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.831us 23.25% 28.831us 2.403us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.198us 15.48% 19.198us 1.600us 12 + aten::clone 1.46% 20.620us 31.97% 451.491us 75.249us 0.000us 0.00% 12.286us 2.048us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.495us 8.46% 10.495us 1.749us 6 + aten::add 2.69% 38.013us 4.51% 63.752us 10.625us 9.599us 7.74% 9.599us 1.600us 6 + aten::sub 3.27% 46.223us 5.18% 73.182us 12.197us 9.599us 7.74% 9.599us 1.600us 6 + Activity Buffer Request 12.70% 179.315us 12.70% 179.315us 179.315us 1.791us 1.44% 1.791us 1.791us 1 + aten::empty_strided 2.12% 29.941us 2.12% 29.941us 4.990us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 13.35% 188.535us 13.35% 188.535us 31.423us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.72% 66.700us 5.92% 83.651us 3.485us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 1.20% 16.951us 1.20% 16.951us 0.706us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 18.12% 255.870us 18.12% 255.870us 5.331us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.41% 5.720us 0.41% 5.720us 5.720us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.865ms -Self CUDA time total: 123.902us +Self CPU time total: 1.412ms +Self CUDA time total: 123.994us @@ -4556,27 +4556,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S512_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 893.688us 504.20% 893.688us 893.688us 1 - torch_eager 20.45% 267.171us 99.57% 1.301ms 1.301ms 0.000us 0.00% 180.096us 180.096us 1 - aten::mul 11.55% 150.960us 19.81% 258.802us 10.783us 94.720us 53.44% 94.720us 3.947us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 94.720us 53.44% 94.720us 3.947us 24 - aten::copy_ 7.76% 101.418us 41.00% 535.738us 29.763us 57.761us 32.59% 60.609us 3.367us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.769us 23.00% 40.769us 3.397us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 24.767us 13.97% 24.767us 2.064us 12 - aten::clone 1.49% 19.482us 34.75% 454.110us 75.685us 0.000us 0.00% 19.840us 3.307us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.992us 9.59% 16.992us 2.832us 6 - aten::add 2.31% 30.129us 3.96% 51.730us 8.622us 12.479us 7.04% 12.479us 2.080us 6 - aten::sub 2.69% 35.182us 4.51% 58.883us 9.814us 12.288us 6.93% 12.288us 2.048us 6 - Activity Buffer Request 14.52% 189.694us 14.52% 189.694us 189.694us 2.848us 1.61% 2.848us 2.848us 1 - aten::empty_strided 2.22% 29.022us 2.22% 29.022us 4.837us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 14.04% 183.444us 14.04% 183.444us 30.574us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.81% 62.873us 6.14% 80.203us 3.342us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 1.33% 17.330us 1.33% 17.330us 0.722us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 16.40% 214.326us 16.40% 214.326us 4.465us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.43% 5.660us 0.43% 5.660us 5.660us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.022ms 574.36% 1.022ms 1.022ms 1 + torch_eager 9.88% 300.789us 99.79% 3.040ms 3.040ms 0.000us 0.00% 180.741us 180.741us 1 + aten::mul 5.87% 178.699us 10.24% 311.746us 12.989us 95.331us 53.59% 95.331us 3.972us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 95.331us 53.59% 95.331us 3.972us 24 + aten::copy_ 3.42% 104.263us 70.83% 2.158ms 119.861us 57.731us 32.45% 60.579us 3.366us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.803us 22.94% 40.803us 3.400us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 24.831us 13.96% 24.831us 2.069us 12 + aten::clone 0.72% 21.809us 67.85% 2.067ms 344.453us 0.000us 0.00% 19.776us 3.296us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.928us 9.52% 16.928us 2.821us 6 + aten::add 1.17% 35.582us 2.06% 62.693us 10.449us 12.480us 7.02% 12.480us 2.080us 6 + aten::sub 1.34% 40.961us 2.23% 67.853us 11.309us 12.351us 6.94% 12.351us 2.059us 6 + Activity Buffer Request 58.86% 1.793ms 58.86% 1.793ms 1.793ms 2.848us 1.60% 2.848us 2.848us 1 + aten::empty_strided 1.00% 30.461us 1.00% 30.461us 5.077us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 6.14% 187.135us 6.14% 187.135us 31.189us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.22% 67.750us 2.85% 86.690us 3.612us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.62% 18.940us 0.62% 18.940us 0.789us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 8.55% 260.511us 8.55% 260.511us 5.427us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.21% 6.321us 0.21% 6.321us 6.321us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.307ms -Self CUDA time total: 177.248us +Self CPU time total: 3.046ms +Self CUDA time total: 177.893us @@ -4586,27 +4586,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S512_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 902.974us 302.94% 902.974us 902.974us 1 - torch_eager 20.55% 268.018us 99.60% 1.299ms 1.299ms 0.000us 0.00% 315.867us 315.867us 1 - aten::mul 11.45% 149.281us 19.89% 259.364us 10.807us 145.821us 48.92% 145.821us 6.076us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 145.821us 48.92% 145.821us 6.076us 24 - aten::copy_ 7.72% 100.690us 40.33% 526.007us 29.223us 111.006us 37.24% 128.798us 7.155us 18 - aten::clone 1.57% 20.500us 34.18% 445.716us 74.286us 0.000us 0.00% 71.584us 11.931us 6 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 57.214us 19.19% 57.214us 4.768us 12 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 53.792us 18.05% 53.792us 8.965us 6 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 41.248us 13.84% 41.248us 3.437us 12 - aten::sub 2.77% 36.180us 4.59% 59.800us 9.967us 20.672us 6.94% 20.672us 3.445us 6 - aten::add 2.40% 31.251us 4.19% 54.701us 9.117us 20.576us 6.90% 20.576us 3.429us 6 - Activity Buffer Request 13.76% 179.443us 13.76% 179.443us 179.443us 17.792us 5.97% 17.792us 17.792us 1 - aten::empty_strided 2.33% 30.440us 2.33% 30.440us 5.073us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 14.06% 183.364us 14.06% 183.364us 30.561us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.83% 63.022us 6.14% 80.061us 3.336us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 1.31% 17.039us 1.31% 17.039us 0.710us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 16.84% 219.663us 16.84% 219.663us 4.576us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.40% 5.210us 0.40% 5.210us 5.210us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.032ms 346.83% 1.032ms 1.032ms 1 + torch_eager 10.94% 330.454us 99.81% 3.014ms 3.014ms 0.000us 0.00% 315.361us 315.361us 1 + aten::mul 5.75% 173.643us 10.12% 305.657us 12.736us 145.696us 48.95% 145.696us 6.071us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 145.696us 48.95% 145.696us 6.071us 24 + aten::copy_ 3.49% 105.289us 69.61% 2.102ms 116.764us 110.818us 37.23% 128.546us 7.141us 18 + aten::clone 0.91% 27.529us 66.74% 2.015ms 335.845us 0.000us 0.00% 71.233us 11.872us 6 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 57.313us 19.26% 57.313us 4.776us 12 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 53.505us 17.98% 53.505us 8.918us 6 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 41.119us 13.82% 41.119us 3.427us 12 + aten::sub 1.33% 40.040us 2.29% 68.991us 11.499us 20.607us 6.92% 20.607us 3.434us 6 + aten::add 1.18% 35.771us 2.04% 61.541us 10.257us 20.512us 6.89% 20.512us 3.419us 6 + Activity Buffer Request 56.83% 1.716ms 56.83% 1.716ms 1.716ms 17.728us 5.96% 17.728us 17.728us 1 + aten::empty_strided 1.04% 31.352us 1.04% 31.352us 5.225us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 6.87% 207.436us 6.87% 207.436us 34.573us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.27% 68.592us 2.86% 86.271us 3.595us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.59% 17.679us 0.59% 17.679us 0.737us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 8.61% 259.847us 8.61% 259.847us 5.413us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.19% 5.690us 0.19% 5.690us 5.690us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.304ms -Self CUDA time total: 298.075us +Self CPU time total: 3.019ms +Self CUDA time total: 297.633us @@ -4616,27 +4616,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S2048_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 892.343us 502.65% 892.343us 892.343us 1 - torch_eager 20.29% 263.739us 99.57% 1.294ms 1.294ms 0.000us 0.00% 180.376us 180.376us 1 - aten::mul 11.80% 153.414us 20.02% 260.283us 10.845us 95.163us 53.60% 95.163us 3.965us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 95.163us 53.60% 95.163us 3.965us 24 - aten::copy_ 7.77% 100.952us 40.82% 530.557us 29.475us 57.598us 32.44% 60.446us 3.358us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.672us 22.91% 40.672us 3.389us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 24.767us 13.95% 24.767us 2.064us 12 - aten::clone 1.59% 20.651us 34.74% 451.538us 75.256us 0.000us 0.00% 19.774us 3.296us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.926us 9.53% 16.926us 2.821us 6 - aten::add 2.40% 31.260us 4.02% 52.270us 8.712us 12.447us 7.01% 12.447us 2.074us 6 - aten::sub 2.80% 36.342us 4.59% 59.652us 9.942us 12.320us 6.94% 12.320us 2.053us 6 - Activity Buffer Request 14.29% 185.813us 14.29% 185.813us 185.813us 2.848us 1.60% 2.848us 2.848us 1 - aten::empty_strided 2.21% 28.780us 2.21% 28.780us 4.797us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 14.07% 182.883us 14.07% 182.883us 30.480us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.73% 61.469us 6.03% 78.389us 3.266us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 1.30% 16.920us 1.30% 16.920us 0.705us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 16.32% 212.098us 16.32% 212.098us 4.419us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.43% 5.551us 0.43% 5.551us 5.551us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 997.224us 560.81% 997.224us 997.224us 1 + torch_eager 20.41% 294.666us 99.60% 1.438ms 1.438ms 0.000us 0.00% 180.699us 180.699us 1 + aten::mul 11.81% 170.426us 20.82% 300.510us 12.521us 95.263us 53.57% 95.263us 3.969us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 95.263us 53.57% 95.263us 3.969us 24 + aten::copy_ 6.98% 100.734us 40.22% 580.546us 32.253us 57.725us 32.46% 60.604us 3.367us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.895us 23.00% 40.895us 3.408us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 24.832us 13.96% 24.832us 2.069us 12 + aten::clone 1.48% 21.402us 34.03% 491.202us 81.867us 0.000us 0.00% 19.709us 3.285us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.830us 9.46% 16.830us 2.805us 6 + aten::sub 2.72% 39.260us 4.81% 69.410us 11.568us 12.416us 6.98% 12.416us 2.069us 6 + aten::add 2.46% 35.491us 4.17% 60.132us 10.022us 12.416us 6.98% 12.416us 2.069us 6 + Activity Buffer Request 14.76% 213.066us 14.76% 213.066us 213.066us 2.879us 1.62% 2.879us 2.879us 1 + aten::empty_strided 2.03% 29.329us 2.03% 29.329us 4.888us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 13.55% 195.534us 13.55% 195.534us 32.589us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.52% 65.271us 5.66% 81.651us 3.402us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 1.13% 16.380us 1.13% 16.380us 0.683us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 17.74% 256.087us 17.74% 256.087us 5.335us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.40% 5.801us 0.40% 5.801us 5.801us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.300ms -Self CUDA time total: 177.528us +Self CPU time total: 1.443ms +Self CUDA time total: 177.820us @@ -4646,27 +4646,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S2048_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 904.750us 303.27% 904.750us 904.750us 1 - torch_eager 21.10% 268.921us 99.56% 1.269ms 1.269ms 0.000us 0.00% 316.413us 316.413us 1 - aten::mul 11.89% 151.482us 20.35% 259.306us 10.804us 145.920us 48.91% 145.920us 6.080us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 145.920us 48.91% 145.920us 6.080us 24 - aten::copy_ 8.00% 101.944us 39.08% 497.980us 27.666us 111.613us 37.41% 129.693us 7.205us 18 - aten::clone 1.62% 20.659us 32.72% 416.907us 69.484us 0.000us 0.00% 72.512us 12.085us 6 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 57.181us 19.17% 57.181us 4.765us 12 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 54.432us 18.25% 54.432us 9.072us 6 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 40.800us 13.68% 40.800us 3.400us 12 - aten::add 2.47% 31.422us 4.17% 53.191us 8.865us 20.512us 6.88% 20.512us 3.419us 6 - aten::sub 2.72% 34.608us 4.46% 56.810us 9.468us 20.288us 6.80% 20.288us 3.381us 6 - Activity Buffer Request 10.32% 131.451us 10.32% 131.451us 131.451us 18.080us 6.06% 18.080us 18.080us 1 - aten::empty_strided 2.44% 31.142us 2.44% 31.142us 5.190us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 15.74% 200.533us 15.74% 200.533us 33.422us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.99% 63.539us 6.33% 80.621us 3.359us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 1.34% 17.082us 1.34% 17.082us 0.712us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 16.94% 215.847us 16.94% 215.847us 4.497us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.44% 5.651us 0.44% 5.651us 5.651us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.006ms 337.57% 1.006ms 1.006ms 1 + torch_eager 19.59% 286.301us 99.64% 1.456ms 1.456ms 0.000us 0.00% 316.065us 316.065us 1 + aten::mul 11.95% 174.684us 20.79% 303.846us 12.660us 145.439us 48.82% 145.439us 6.060us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 145.439us 48.82% 145.439us 6.060us 24 + aten::copy_ 7.17% 104.791us 40.98% 598.785us 33.266us 111.649us 37.48% 129.826us 7.213us 18 + aten::clone 1.35% 19.700us 34.46% 503.492us 83.915us 0.000us 0.00% 72.450us 12.075us 6 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 57.376us 19.26% 57.376us 4.781us 12 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 54.273us 18.22% 54.273us 9.045us 6 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 40.800us 13.70% 40.800us 3.400us 12 + aten::sub 2.64% 38.590us 4.63% 67.691us 11.282us 20.448us 6.86% 20.448us 3.408us 6 + aten::add 2.53% 37.009us 4.42% 64.522us 10.754us 20.352us 6.83% 20.352us 3.392us 6 + Activity Buffer Request 15.77% 230.486us 15.77% 230.486us 230.486us 18.177us 6.10% 18.177us 18.177us 1 + aten::empty_strided 2.02% 29.450us 2.02% 29.450us 4.908us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 12.98% 189.676us 12.98% 189.676us 31.613us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.60% 67.290us 5.86% 85.691us 3.570us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 1.26% 18.401us 1.26% 18.401us 0.767us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 17.77% 259.608us 17.77% 259.608us 5.409us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.36% 5.300us 0.36% 5.300us 5.300us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.274ms -Self CUDA time total: 298.333us +Self CPU time total: 1.461ms +Self CUDA time total: 297.888us @@ -4676,27 +4676,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S2048_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 937.429us 158.23% 937.429us 937.429us 1 - torch_eager 20.59% 270.182us 99.56% 1.306ms 1.306ms 0.000us 0.00% 616.217us 616.217us 1 - aten::copy_ 7.81% 102.469us 38.72% 508.048us 28.225us 278.876us 47.07% 302.652us 16.814us 18 - aten::mul 12.17% 159.666us 21.13% 277.305us 11.554us 247.965us 41.85% 247.965us 10.332us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 247.965us 41.85% 247.965us 10.332us 24 - aten::clone 1.57% 20.662us 32.21% 422.588us 70.431us 0.000us 0.00% 211.645us 35.274us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 187.869us 31.71% 187.869us 31.312us 6 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 91.007us 15.36% 91.007us 7.584us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 65.600us 11.07% 65.600us 5.467us 12 - aten::sub 2.87% 37.612us 4.70% 61.651us 10.275us 32.896us 5.55% 32.896us 5.483us 6 - aten::add 2.50% 32.870us 4.35% 57.120us 9.520us 32.704us 5.52% 32.704us 5.451us 6 - Activity Buffer Request 12.01% 157.623us 12.01% 157.623us 157.623us 23.776us 4.01% 23.776us 23.776us 1 - aten::empty_strided 2.36% 30.970us 2.36% 30.970us 5.162us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 13.83% 181.533us 13.83% 181.533us 30.256us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.81% 63.053us 6.13% 80.473us 3.353us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 1.33% 17.420us 1.33% 17.420us 0.726us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 17.71% 232.351us 17.71% 232.351us 4.841us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.44% 5.770us 0.44% 5.770us 5.770us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.028ms 173.98% 1.028ms 1.028ms 1 + torch_eager 20.54% 307.941us 99.60% 1.493ms 1.493ms 0.000us 0.00% 614.720us 614.720us 1 + aten::copy_ 6.96% 104.343us 39.80% 596.653us 33.147us 277.825us 47.01% 301.537us 16.752us 18 + aten::mul 11.79% 176.823us 20.94% 313.949us 13.081us 247.103us 41.81% 247.103us 10.296us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 247.103us 41.81% 247.103us 10.296us 24 + aten::clone 1.37% 20.599us 33.38% 500.402us 83.400us 0.000us 0.00% 210.816us 35.136us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 187.104us 31.66% 187.104us 31.184us 6 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 90.721us 15.35% 90.721us 7.560us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 66.080us 11.18% 66.080us 5.507us 12 + aten::add 2.50% 37.532us 4.42% 66.263us 11.044us 33.056us 5.59% 33.056us 5.509us 6 + aten::sub 2.78% 41.622us 4.80% 72.031us 12.005us 33.024us 5.59% 33.024us 5.504us 6 + Activity Buffer Request 15.35% 230.085us 15.35% 230.085us 230.085us 23.712us 4.01% 23.712us 23.712us 1 + aten::empty_strided 1.95% 29.191us 1.95% 29.191us 4.865us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 12.52% 187.674us 12.52% 187.674us 31.279us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.56% 68.431us 5.78% 86.660us 3.611us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 1.22% 18.229us 1.22% 18.229us 0.760us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 18.06% 270.817us 18.06% 270.817us 5.642us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.40% 6.020us 0.40% 6.020us 6.020us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.312ms -Self CUDA time total: 592.441us +Self CPU time total: 1.499ms +Self CUDA time total: 591.008us @@ -4706,55 +4706,61 @@ PROFILE TRACE: torch_eager | cuda_B2_S2048_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 7.94% 296.330us 77.11% 2.877ms 2.877ms 0.000us 0.00% 1.855ms 1.855ms 1 - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.826ms 102.11% 1.826ms 1.826ms 1 - aten::copy_ 2.78% 103.676us 55.32% 2.064ms 114.670us 804.351us 44.97% 870.879us 48.382us 18 - aten::mul 4.03% 150.284us 7.00% 261.125us 10.880us 838.429us 46.87% 838.429us 34.935us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 838.429us 46.87% 838.429us 34.935us 24 - aten::clone 0.73% 27.418us 53.07% 1.980ms 330.025us 0.000us 0.00% 619.584us 103.264us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 553.056us 30.92% 553.056us 92.176us 6 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 251.295us 14.05% 251.295us 20.941us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 145.983us 8.16% 145.983us 12.165us 12 - aten::sub 1.02% 38.061us 1.68% 62.715us 10.453us 87.583us 4.90% 87.583us 14.597us 6 - Activity Buffer Request 44.74% 1.669ms 44.74% 1.669ms 1.669ms 66.528us 3.72% 66.528us 66.528us 1 - aten::add 0.88% 32.700us 1.50% 55.970us 9.328us 58.400us 3.26% 58.400us 9.733us 6 - aten::empty_strided 0.81% 30.382us 0.81% 30.382us 5.064us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 5.84% 217.963us 5.84% 217.963us 36.327us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.66% 61.939us 2.12% 79.030us 3.293us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.46% 17.091us 0.46% 17.091us 0.712us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 6.21% 231.875us 6.21% 231.875us 4.831us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 22.89% 854.084us 22.89% 854.084us 854.084us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 13.10% 290.757us 64.42% 1.430ms 1.430ms 0.000us 0.00% 1.858ms 1.858ms 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.829ms 102.08% 1.829ms 1.829ms 1 + aten::copy_ 4.50% 99.996us 26.00% 577.207us 32.067us 806.683us 45.02% 872.922us 48.496us 18 + aten::mul 7.46% 165.620us 13.35% 296.315us 12.346us 837.369us 46.73% 837.369us 34.890us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 837.369us 46.73% 837.369us 34.890us 24 + aten::clone 0.93% 20.729us 21.91% 486.420us 81.070us 0.000us 0.00% 620.507us 103.418us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 554.268us 30.93% 554.268us 92.378us 6 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 252.415us 14.09% 252.415us 21.035us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 147.935us 8.26% 147.935us 12.328us 12 + aten::sub 1.77% 39.361us 2.98% 66.203us 11.034us 89.536us 5.00% 89.536us 14.923us 6 + Activity Buffer Request 8.80% 195.395us 8.80% 195.395us 195.395us 66.239us 3.70% 66.239us 66.239us 1 + aten::add 1.55% 34.513us 2.81% 62.423us 10.404us 58.399us 3.26% 58.399us 9.733us 6 + aten::empty_strided 1.34% 29.799us 1.34% 29.799us 4.967us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 9.39% 208.344us 9.39% 208.344us 34.724us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.12% 69.193us 3.90% 86.553us 3.606us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.78% 17.360us 0.78% 17.360us 0.723us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 11.66% 258.919us 11.66% 258.919us 5.394us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 35.58% 789.949us 35.58% 789.949us 789.949us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 3.731ms -Self CUDA time total: 1.789ms +Self CPU time total: 2.220ms +Self CUDA time total: 1.792ms impl wl p50(ms) ok -torch_eager cuda_B1_S128_H32_D128_R64 0.22 True -torch_eager cuda_B1_S128_H32_D64_R32 0.22 True -torch_eager cuda_B1_S128_H8_D128_R64 0.22 True -torch_eager cuda_B1_S128_H8_D64_R32 0.18 True -torch_eager cuda_B1_S2048_H32_D128_R64 0.22 True -torch_eager cuda_B1_S2048_H32_D64_R32 0.21 True -torch_eager cuda_B1_S2048_H8_D128_R64 0.21 True -torch_eager cuda_B1_S2048_H8_D64_R32 0.21 True -torch_eager cuda_B1_S512_H32_D128_R64 0.21 True -torch_eager cuda_B1_S512_H32_D64_R32 0.21 True -torch_eager cuda_B1_S512_H8_D128_R64 0.21 True -torch_eager cuda_B1_S512_H8_D64_R32 0.22 True -torch_eager cuda_B2_S128_H32_D128_R64 0.22 True -torch_eager cuda_B2_S128_H32_D64_R32 0.22 True -torch_eager cuda_B2_S128_H8_D128_R64 0.22 True -torch_eager cuda_B2_S128_H8_D64_R32 0.21 True +torch_eager cuda_B1_S128_H32_D128_R64 0.27 True +torch_eager cuda_B1_S128_H32_D64_R32 0.25 True +torch_eager cuda_B1_S128_H8_D128_R64 0.25 True +torch_eager cuda_B1_S128_H8_D64_R32 0.19 True +torch_eager cuda_B1_S2048_H32_D128_R64 0.25 True +torch_eager cuda_B1_S2048_H32_D64_R32 0.25 True +torch_eager cuda_B1_S2048_H8_D128_R64 0.24 True +torch_eager cuda_B1_S2048_H8_D64_R32 0.25 True +torch_eager cuda_B1_S512_H32_D128_R64 0.24 True +torch_eager cuda_B1_S512_H32_D64_R32 0.24 True +torch_eager cuda_B1_S512_H8_D128_R64 0.24 True +torch_eager cuda_B1_S512_H8_D64_R32 0.25 True +torch_eager cuda_B2_S128_H32_D128_R64 0.24 True +torch_eager cuda_B2_S128_H32_D64_R32 0.25 True +torch_eager cuda_B2_S128_H8_D128_R64 0.25 True +torch_eager cuda_B2_S128_H8_D64_R32 0.24 True torch_eager cuda_B2_S2048_H32_D128_R64 0.65 True -torch_eager cuda_B2_S2048_H32_D64_R32 0.23 True -torch_eager cuda_B2_S2048_H8_D128_R64 0.21 True -torch_eager cuda_B2_S2048_H8_D64_R32 0.22 True -torch_eager cuda_B2_S512_H32_D128_R64 0.21 True -torch_eager cuda_B2_S512_H32_D64_R32 0.22 True -torch_eager cuda_B2_S512_H8_D128_R64 0.22 True -torch_eager cuda_B2_S512_H8_D64_R32 0.22 True +torch_eager cuda_B2_S2048_H32_D64_R32 0.25 True +torch_eager cuda_B2_S2048_H8_D128_R64 0.25 True +torch_eager cuda_B2_S2048_H8_D64_R32 0.24 True +torch_eager cuda_B2_S512_H32_D128_R64 0.25 True +torch_eager cuda_B2_S512_H32_D64_R32 0.24 True +torch_eager cuda_B2_S512_H8_D128_R64 0.24 True +torch_eager cuda_B2_S512_H8_D64_R32 0.24 True+▶ UV Install Logs+ +Artifacts:
rotary.jsonl