diff --git "a/rotary/impls/torch_rotary.html" "b/rotary/impls/torch_rotary.html" --- "a/rotary/impls/torch_rotary.html" +++ "b/rotary/impls/torch_rotary.html" @@ -3874,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
Mon Nov 10 21:57:39 2025 +-Fri Dec 19 18:55:27 2025 +-----------------------------------------------------------------------------------------+ -| NVIDIA-SMI 580.95.05 Driver Version: 580.95.05 CUDA Version: 13.0 | +| NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 | +-----------------------------------------+------------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. | |=========================================+========================+======================| | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 | -| N/A 26C P0 88W / 350W | 0MiB / 46068MiB | 22% Default | +| N/A 28C P0 85W / 350W | 0MiB / 46068MiB | 34% Default | | | | N/A | +-----------------------------------------+------------------------+----------------------+ @@ -3935,9 +3935,9 @@ Cell: nv | 0.22s ▼ code ▼ output - ▶ uv-logs + ▶ uv-logs | -Cell: benchmark | 38.43s +Cell: benchmark | 4.08s | Raw @@ -4016,27 +4016,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S128_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.315ms 1474.39% 1.315ms 1.315ms 1 - torch_eager 7.00% 401.548us 82.40% 4.729ms 4.729ms 0.000us 0.00% 90.432us 90.432us 1 - aten::mul 3.25% 186.430us 5.35% 307.044us 12.793us 46.943us 52.62% 46.943us 1.956us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 46.943us 52.62% 46.943us 1.956us 24 - aten::copy_ 2.48% 142.261us 48.48% 2.782ms 154.576us 29.122us 32.64% 30.338us 1.685us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.433us 25.14% 22.433us 1.869us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.151us 14.74% 13.151us 1.096us 12 - aten::clone 0.88% 50.441us 59.65% 3.423ms 570.575us 0.000us 0.00% 7.905us 1.318us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.689us 7.50% 6.689us 1.115us 6 - aten::sub 0.82% 47.350us 1.28% 73.411us 12.235us 6.591us 7.39% 6.591us 1.098us 6 - aten::add 0.64% 36.811us 1.04% 59.601us 9.934us 6.560us 7.35% 6.560us 1.093us 6 - Activity Buffer Request 39.92% 2.291ms 39.92% 2.291ms 2.291ms 1.216us 1.36% 1.216us 1.216us 1 - aten::empty_strided 16.52% 948.386us 16.52% 948.386us 158.064us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 1.38% 78.980us 1.38% 78.980us 13.163us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.46% 83.925us 1.86% 106.703us 4.446us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.40% 22.778us 0.40% 22.778us 0.949us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 7.66% 439.430us 7.66% 439.430us 9.155us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 17.60% 1.010ms 17.60% 1.010ms 1.010ms 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 975.966us 1092.36% 975.966us 975.966us 1 + torch_eager 12.28% 354.583us 99.52% 2.875ms 2.875ms 0.000us 0.00% 90.561us 90.561us 1 + aten::mul 5.61% 162.144us 9.55% 275.765us 11.490us 47.009us 52.62% 47.009us 1.959us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 47.009us 52.62% 47.009us 1.959us 24 + aten::copy_ 3.85% 111.174us 67.17% 1.940ms 107.795us 29.153us 32.63% 30.369us 1.687us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.337us 25.00% 22.337us 1.861us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.183us 14.76% 13.183us 1.099us 12 + aten::clone 1.24% 35.691us 65.68% 1.897ms 316.212us 0.000us 0.00% 8.032us 1.339us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.816us 7.63% 6.816us 1.136us 6 + aten::sub 1.48% 42.821us 2.36% 68.262us 11.377us 6.592us 7.38% 6.592us 1.099us 6 + aten::add 1.19% 34.391us 1.96% 56.751us 9.459us 6.591us 7.38% 6.591us 1.098us 6 + Activity Buffer Request 58.57% 1.692ms 58.57% 1.692ms 1.692ms 1.216us 1.36% 1.216us 1.216us 1 + aten::empty_strided 1.90% 54.851us 1.90% 54.851us 9.142us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 2.58% 74.440us 2.58% 74.440us 12.407us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.39% 69.079us 3.06% 88.511us 3.688us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.67% 19.432us 0.67% 19.432us 0.810us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 7.77% 224.384us 7.77% 224.384us 4.675us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.48% 13.900us 0.48% 13.900us 13.900us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 5.740ms -Self CUDA time total: 89.216us +Self CPU time total: 2.889ms +Self CUDA time total: 89.345us @@ -4046,27 +4046,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S128_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 967.576us 1072.55% 967.576us 967.576us 1 - torch_eager 10.80% 301.919us 99.80% 2.790ms 2.790ms 0.000us 0.00% 91.365us 91.365us 1 - aten::mul 5.82% 162.824us 9.87% 275.997us 11.500us 47.523us 52.68% 47.523us 1.980us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 47.523us 52.68% 47.523us 1.980us 24 - aten::copy_ 4.18% 116.751us 70.01% 1.957ms 108.723us 29.282us 32.46% 30.434us 1.691us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.497us 24.94% 22.497us 1.875us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.408us 14.86% 13.408us 1.117us 12 - aten::clone 0.79% 22.172us 66.92% 1.871ms 311.782us 0.000us 0.00% 7.937us 1.323us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.785us 7.52% 6.785us 1.131us 6 - aten::add 1.23% 34.361us 2.02% 56.562us 9.427us 6.720us 7.45% 6.720us 1.120us 6 - aten::sub 1.36% 38.010us 2.19% 61.310us 10.218us 6.688us 7.41% 6.688us 1.115us 6 - Activity Buffer Request 61.66% 1.724ms 61.66% 1.724ms 1.724ms 1.152us 1.28% 1.152us 1.152us 1 - aten::empty_strided 1.16% 32.541us 1.16% 32.541us 5.424us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 2.01% 56.260us 2.01% 56.260us 9.377us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.34% 65.363us 2.94% 82.214us 3.426us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.60% 16.851us 0.60% 16.851us 0.702us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 7.84% 219.114us 7.84% 219.114us 4.565us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.20% 5.580us 0.20% 5.580us 5.580us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 912.155us 1011.53% 912.155us 912.155us 1 + torch_eager 10.76% 301.514us 99.79% 2.795ms 2.795ms 0.000us 0.00% 91.296us 91.296us 1 + aten::mul 5.37% 150.316us 9.21% 258.115us 10.755us 47.452us 52.62% 47.452us 1.977us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 47.452us 52.62% 47.452us 1.977us 24 + aten::copy_ 3.63% 101.610us 71.00% 1.989ms 110.503us 29.347us 32.54% 30.467us 1.693us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.595us 25.06% 22.595us 1.883us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.377us 14.83% 13.377us 1.115us 12 + aten::clone 0.94% 26.442us 68.40% 1.916ms 319.344us 0.000us 0.00% 7.872us 1.312us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.752us 7.49% 6.752us 1.125us 6 + aten::add 1.08% 30.320us 1.84% 51.590us 8.598us 6.689us 7.42% 6.689us 1.115us 6 + aten::sub 1.25% 34.880us 2.05% 57.350us 9.558us 6.688us 7.42% 6.688us 1.115us 6 + Activity Buffer Request 62.97% 1.764ms 62.97% 1.764ms 1.764ms 1.120us 1.24% 1.120us 1.120us 1 + aten::empty_strided 1.15% 32.200us 1.15% 32.200us 5.367us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 2.15% 60.121us 2.15% 60.121us 10.020us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.19% 61.232us 2.82% 79.123us 3.297us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.64% 17.891us 0.64% 17.891us 0.745us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 7.67% 214.771us 7.67% 214.771us 4.474us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.21% 5.950us 0.21% 5.950us 5.950us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.795ms -Self CUDA time total: 90.213us +Self CPU time total: 2.801ms +Self CUDA time total: 90.176us @@ -4076,27 +4076,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S128_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 927.639us 987.31% 927.639us 927.639us 1 - torch_eager 10.07% 282.335us 99.80% 2.798ms 2.798ms 0.000us 0.00% 95.268us 95.268us 1 - aten::mul 5.75% 161.290us 9.68% 271.373us 11.307us 48.769us 51.91% 48.769us 2.032us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 48.769us 51.91% 48.769us 2.032us 24 - aten::copy_ 3.66% 102.626us 71.21% 1.996ms 110.912us 30.720us 32.70% 32.032us 1.780us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.912us 24.39% 22.912us 1.909us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 14.467us 15.40% 14.467us 1.206us 12 - aten::clone 0.79% 22.060us 68.41% 1.918ms 319.628us 0.000us 0.00% 9.120us 1.520us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.808us 8.31% 7.808us 1.301us 6 - aten::sub 1.36% 38.040us 2.18% 61.002us 10.167us 7.265us 7.73% 7.265us 1.211us 6 - aten::add 1.15% 32.220us 1.90% 53.280us 8.880us 7.202us 7.67% 7.202us 1.200us 6 - Activity Buffer Request 63.51% 1.780ms 63.51% 1.780ms 1.780ms 1.312us 1.40% 1.312us 1.312us 1 - aten::empty_strided 1.12% 31.490us 1.12% 31.490us 5.248us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 1.87% 52.452us 1.87% 52.452us 8.742us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.25% 63.104us 2.86% 80.042us 3.335us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.60% 16.938us 0.60% 16.938us 0.706us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 7.67% 215.090us 7.67% 215.090us 4.481us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.20% 5.470us 0.20% 5.470us 5.470us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 913.022us 972.45% 913.022us 913.022us 1 + torch_eager 10.98% 301.066us 99.80% 2.737ms 2.737ms 0.000us 0.00% 95.200us 95.200us 1 + aten::mul 5.60% 153.463us 9.59% 262.853us 10.952us 48.737us 51.91% 48.737us 2.031us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 48.737us 51.91% 48.737us 2.031us 24 + aten::copy_ 3.66% 100.372us 70.16% 1.924ms 106.884us 30.688us 32.69% 31.999us 1.778us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.912us 24.40% 22.912us 1.909us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 14.464us 15.41% 14.464us 1.205us 12 + aten::clone 0.95% 26.091us 67.47% 1.850ms 308.365us 0.000us 0.00% 9.087us 1.514us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.776us 8.28% 7.776us 1.296us 6 + aten::add 1.14% 31.200us 1.93% 53.020us 8.837us 7.264us 7.74% 7.264us 1.211us 6 + aten::sub 1.31% 35.942us 2.13% 58.462us 9.744us 7.200us 7.67% 7.200us 1.200us 6 + Activity Buffer Request 62.12% 1.703ms 62.12% 1.703ms 1.703ms 1.311us 1.40% 1.311us 1.311us 1 + aten::empty_strided 1.13% 31.120us 1.13% 31.120us 5.187us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 2.07% 56.870us 2.07% 56.870us 9.478us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.29% 62.820us 2.92% 80.090us 3.337us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.63% 17.270us 0.63% 17.270us 0.720us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 7.91% 217.003us 7.91% 217.003us 4.521us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.20% 5.600us 0.20% 5.600us 5.600us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.803ms -Self CUDA time total: 93.956us +Self CPU time total: 2.742ms +Self CUDA time total: 93.889us @@ -4106,27 +4106,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S128_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 918.847us 904.69% 918.847us 918.847us 1 - torch_eager 11.08% 278.185us 99.79% 2.506ms 2.506ms 0.000us 0.00% 102.877us 102.877us 1 - aten::mul 6.15% 154.372us 10.54% 264.762us 11.032us 52.638us 51.83% 52.638us 2.193us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 52.638us 51.83% 52.638us 2.193us 24 - aten::copy_ 4.16% 104.580us 68.26% 1.714ms 95.219us 32.416us 31.92% 33.728us 1.874us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.641us 24.26% 24.641us 2.053us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.511us 16.26% 16.511us 1.376us 12 - aten::clone 0.84% 21.090us 65.15% 1.636ms 272.671us 0.000us 0.00% 9.087us 1.514us 6 - aten::sub 1.51% 38.031us 2.44% 61.190us 10.198us 8.288us 8.16% 8.288us 1.381us 6 - aten::add 1.29% 32.470us 2.19% 54.880us 9.147us 8.223us 8.10% 8.223us 1.371us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.775us 7.66% 7.775us 1.296us 6 - Activity Buffer Request 52.27% 1.312ms 52.27% 1.312ms 1.312ms 1.312us 1.29% 1.312us 1.312us 1 - aten::empty_strided 1.29% 32.302us 1.29% 32.302us 5.384us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 9.44% 236.943us 9.44% 236.943us 39.491us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.53% 63.496us 3.16% 79.393us 3.308us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.63% 15.897us 0.63% 15.897us 0.662us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 8.60% 215.892us 8.60% 215.892us 4.498us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.21% 5.340us 0.21% 5.340us 5.340us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 939.645us 923.40% 939.645us 939.645us 1 + torch_eager 10.25% 301.966us 99.82% 2.940ms 2.940ms 0.000us 0.00% 103.039us 103.039us 1 + aten::mul 5.17% 152.369us 8.89% 261.782us 10.908us 52.831us 51.92% 52.831us 2.201us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 52.831us 51.92% 52.831us 2.201us 24 + aten::copy_ 3.48% 102.582us 71.53% 2.107ms 117.036us 32.481us 31.92% 33.761us 1.876us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.737us 24.31% 24.737us 2.061us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.447us 16.16% 16.447us 1.371us 12 + aten::clone 0.93% 27.431us 68.93% 2.030ms 338.362us 0.000us 0.00% 9.024us 1.504us 6 + aten::sub 1.21% 35.711us 2.01% 59.102us 9.850us 8.319us 8.18% 8.319us 1.387us 6 + aten::add 1.06% 31.350us 1.79% 52.800us 8.800us 8.128us 7.99% 8.128us 1.355us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.744us 7.61% 7.744us 1.291us 6 + Activity Buffer Request 57.63% 1.697ms 57.63% 1.697ms 1.697ms 1.280us 1.26% 1.280us 1.280us 1 + aten::empty_strided 1.03% 30.229us 1.03% 30.229us 5.038us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 8.25% 243.123us 8.25% 243.123us 40.520us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.79% 82.098us 3.39% 99.942us 4.164us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.61% 17.844us 0.61% 17.844us 0.743us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 7.40% 217.957us 7.40% 217.957us 4.541us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.18% 5.350us 0.18% 5.350us 5.350us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.511ms -Self CUDA time total: 101.565us +Self CPU time total: 2.945ms +Self CUDA time total: 101.759us @@ -4136,27 +4136,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S512_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 991.709us 1060.94% 991.709us 991.709us 1 - torch_eager 10.56% 336.649us 99.82% 3.183ms 3.183ms 0.000us 0.00% 94.755us 94.755us 1 - aten::mul 5.20% 165.794us 8.73% 278.295us 11.596us 48.674us 52.07% 48.674us 2.028us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 48.674us 52.07% 48.674us 2.028us 24 - aten::copy_ 3.76% 119.863us 72.07% 2.298ms 127.674us 30.622us 32.76% 31.902us 1.772us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.878us 24.47% 22.878us 1.907us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 14.179us 15.17% 14.179us 1.182us 12 - aten::clone 0.88% 28.161us 69.55% 2.218ms 369.616us 0.000us 0.00% 9.024us 1.504us 6 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 916.066us 979.04% 916.066us 916.066us 1 + torch_eager 10.22% 297.753us 99.80% 2.907ms 2.907ms 0.000us 0.00% 94.848us 94.848us 1 + aten::mul 5.27% 153.390us 8.99% 261.760us 10.907us 48.609us 51.95% 48.609us 2.025us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 48.609us 51.95% 48.609us 2.025us 24 + aten::copy_ 3.49% 101.793us 71.96% 2.096ms 116.462us 30.720us 32.83% 32.000us 1.778us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.976us 24.56% 22.976us 1.915us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 14.239us 15.22% 14.239us 1.187us 12 + aten::clone 1.04% 30.259us 69.44% 2.023ms 337.157us 0.000us 0.00% 9.024us 1.504us 6 Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.744us 8.28% 7.744us 1.291us 6 - aten::sub 1.28% 40.920us 2.05% 65.511us 10.918us 7.138us 7.64% 7.138us 1.190us 6 - aten::add 1.05% 33.330us 1.81% 57.620us 9.603us 7.041us 7.53% 7.041us 1.173us 6 - Activity Buffer Request 55.60% 1.773ms 55.60% 1.773ms 1.773ms 1.280us 1.37% 1.280us 1.280us 1 - aten::empty_strided 1.06% 33.640us 1.06% 33.640us 5.607us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 10.74% 342.585us 10.74% 342.585us 57.097us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.14% 68.349us 2.66% 84.959us 3.540us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.52% 16.610us 0.52% 16.610us 0.692us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 7.03% 224.072us 7.03% 224.072us 4.668us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.18% 5.590us 0.18% 5.590us 5.590us 0.000us 0.00% 0.000us 0.000us 1 + aten::sub 1.21% 35.379us 2.01% 58.671us 9.778us 7.135us 7.63% 7.135us 1.189us 6 + aten::add 1.06% 30.800us 1.78% 51.980us 8.663us 7.104us 7.59% 7.104us 1.184us 6 + Activity Buffer Request 59.25% 1.726ms 59.25% 1.726ms 1.726ms 1.280us 1.37% 1.280us 1.280us 1 + aten::empty_strided 1.07% 31.081us 1.07% 31.081us 5.180us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 6.98% 203.214us 6.98% 203.214us 33.869us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.14% 62.478us 2.73% 79.587us 3.316us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.59% 17.109us 0.59% 17.109us 0.713us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 7.49% 218.173us 7.49% 218.173us 4.545us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.20% 5.800us 0.20% 5.800us 5.800us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 3.189ms -Self CUDA time total: 93.475us +Self CPU time total: 2.913ms +Self CUDA time total: 93.568us @@ -4166,27 +4166,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S512_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 941.177us 926.36% 941.177us 941.177us 1 - torch_eager 9.56% 295.804us 99.83% 3.088ms 3.088ms 0.000us 0.00% 102.911us 102.911us 1 - aten::mul 5.03% 155.643us 8.60% 265.986us 11.083us 52.802us 51.97% 52.802us 2.200us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 52.802us 51.97% 52.802us 2.200us 24 - aten::copy_ 3.66% 113.330us 73.34% 2.269ms 126.052us 32.447us 31.94% 33.759us 1.876us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.703us 24.31% 24.703us 2.059us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.350us 16.09% 16.350us 1.363us 12 - aten::clone 0.71% 21.820us 70.53% 2.182ms 363.694us 0.000us 0.00% 9.056us 1.509us 6 - aten::sub 1.30% 40.120us 2.07% 63.950us 10.658us 8.223us 8.09% 8.223us 1.370us 6 - aten::add 1.17% 36.201us 1.90% 58.931us 9.822us 8.127us 8.00% 8.127us 1.355us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.744us 7.62% 7.744us 1.291us 6 - Activity Buffer Request 57.23% 1.771ms 57.23% 1.771ms 1.771ms 1.312us 1.29% 1.312us 1.312us 1 - aten::empty_strided 0.98% 30.371us 0.98% 30.371us 5.062us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 10.40% 321.885us 10.40% 321.885us 53.647us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.12% 65.592us 2.67% 82.622us 3.443us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.55% 17.030us 0.55% 17.030us 0.710us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 7.11% 219.985us 7.11% 219.985us 4.583us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.17% 5.340us 0.17% 5.340us 5.340us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 889.506us 873.60% 889.506us 889.506us 1 + torch_eager 9.47% 263.734us 99.81% 2.780ms 2.780ms 0.000us 0.00% 103.133us 103.133us 1 + aten::mul 5.38% 149.780us 9.23% 257.043us 10.710us 52.958us 52.01% 52.958us 2.207us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 52.958us 52.01% 52.958us 2.207us 24 + aten::copy_ 3.60% 100.145us 72.47% 2.019ms 112.141us 32.385us 31.81% 33.697us 1.872us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.609us 24.17% 24.609us 2.051us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.478us 16.18% 16.478us 1.373us 12 + aten::clone 0.76% 21.289us 69.67% 1.941ms 323.462us 0.000us 0.00% 9.088us 1.515us 6 + aten::sub 1.26% 35.149us 2.08% 57.971us 9.662us 8.318us 8.17% 8.318us 1.386us 6 + aten::add 1.12% 31.181us 1.89% 52.690us 8.782us 8.160us 8.01% 8.160us 1.360us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.776us 7.64% 7.776us 1.296us 6 + Activity Buffer Request 59.87% 1.668ms 59.87% 1.668ms 1.668ms 1.312us 1.29% 1.312us 1.312us 1 + aten::empty_strided 1.08% 30.102us 1.08% 30.102us 5.017us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 6.81% 189.822us 6.81% 189.822us 31.637us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.23% 62.254us 2.83% 78.841us 3.285us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.60% 16.587us 0.60% 16.587us 0.691us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 7.62% 212.384us 7.62% 212.384us 4.425us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.19% 5.281us 0.19% 5.281us 5.281us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 3.094ms -Self CUDA time total: 101.599us +Self CPU time total: 2.785ms +Self CUDA time total: 101.821us @@ -4196,27 +4196,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S512_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 943.963us 782.64% 943.963us 943.963us 1 - torch_eager 9.85% 301.136us 99.82% 3.051ms 3.051ms 0.000us 0.00% 122.468us 122.468us 1 - aten::mul 5.14% 157.189us 8.67% 264.988us 11.041us 61.985us 51.39% 61.985us 2.583us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 61.985us 51.39% 61.985us 2.583us 24 - aten::copy_ 3.53% 107.981us 72.58% 2.218ms 123.247us 39.362us 32.64% 41.218us 2.290us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.802us 23.88% 28.802us 2.400us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.265us 15.97% 19.265us 1.605us 12 - aten::clone 0.97% 29.629us 70.14% 2.144ms 357.356us 0.000us 0.00% 12.416us 2.069us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.560us 8.76% 10.560us 1.760us 6 - aten::add 1.14% 34.930us 1.90% 58.161us 9.693us 9.633us 7.99% 9.633us 1.606us 6 - aten::sub 1.25% 38.210us 2.05% 62.510us 10.418us 9.632us 7.99% 9.632us 1.605us 6 - Activity Buffer Request 57.00% 1.742ms 57.00% 1.742ms 1.742ms 1.856us 1.54% 1.856us 1.856us 1 - aten::empty_strided 1.01% 31.021us 1.01% 31.021us 5.170us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 10.03% 306.454us 10.03% 306.454us 51.076us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.23% 68.242us 2.79% 85.430us 3.560us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.56% 17.188us 0.56% 17.188us 0.716us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 7.10% 217.131us 7.10% 217.131us 4.524us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.18% 5.390us 0.18% 5.390us 5.390us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 917.279us 760.79% 917.279us 917.279us 1 + torch_eager 9.49% 270.296us 99.79% 2.843ms 2.843ms 0.000us 0.00% 122.393us 122.393us 1 + aten::mul 5.35% 152.294us 9.23% 262.963us 10.957us 62.014us 51.43% 62.014us 2.584us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 62.014us 51.43% 62.014us 2.584us 24 + aten::copy_ 3.56% 101.468us 72.26% 2.058ms 114.357us 39.420us 32.69% 41.243us 2.291us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.894us 23.96% 28.894us 2.408us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.136us 15.87% 19.136us 1.595us 12 + aten::clone 0.72% 20.551us 69.70% 1.986ms 330.917us 0.000us 0.00% 12.349us 2.058us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.526us 8.73% 10.526us 1.754us 6 + aten::sub 1.26% 35.841us 2.08% 59.281us 9.880us 9.600us 7.96% 9.600us 1.600us 6 + aten::add 1.13% 32.290us 1.93% 54.990us 9.165us 9.536us 7.91% 9.536us 1.589us 6 + Activity Buffer Request 59.91% 1.707ms 59.91% 1.707ms 1.707ms 1.823us 1.51% 1.823us 1.823us 1 + aten::empty_strided 1.30% 36.920us 1.30% 36.920us 6.153us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 6.60% 187.893us 6.60% 187.893us 31.316us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.17% 61.750us 2.78% 79.243us 3.302us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.61% 17.493us 0.61% 17.493us 0.729us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 7.70% 219.342us 7.70% 219.342us 4.570us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.21% 5.890us 0.21% 5.890us 5.890us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 3.057ms -Self CUDA time total: 120.612us +Self CPU time total: 2.849ms +Self CUDA time total: 120.570us @@ -4226,27 +4226,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S512_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 928.245us 538.18% 928.245us 928.245us 1 - torch_eager 19.14% 292.425us 99.66% 1.523ms 1.523ms 0.000us 0.00% 175.325us 175.325us 1 - aten::mul 10.16% 155.270us 17.20% 262.742us 10.948us 89.630us 51.97% 89.630us 3.735us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 89.630us 51.97% 89.630us 3.735us 24 - aten::copy_ 6.82% 104.170us 46.76% 714.441us 39.691us 57.920us 33.58% 60.768us 3.376us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.768us 23.64% 40.768us 3.397us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 24.927us 14.45% 24.927us 2.077us 12 - aten::clone 1.34% 20.471us 41.24% 630.180us 105.030us 0.000us 0.00% 20.000us 3.333us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.152us 9.94% 17.152us 2.859us 6 - aten::sub 2.56% 39.072us 4.07% 62.112us 10.352us 12.480us 7.24% 12.480us 2.080us 6 - aten::add 2.20% 33.610us 3.65% 55.810us 9.302us 12.447us 7.22% 12.447us 2.075us 6 - Activity Buffer Request 16.69% 254.944us 16.69% 254.944us 254.944us 2.848us 1.65% 2.848us 2.848us 1 - aten::empty_strided 2.04% 31.181us 2.04% 31.181us 5.197us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 19.06% 291.294us 19.06% 291.294us 48.549us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.37% 66.700us 5.47% 83.522us 3.480us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 1.10% 16.822us 1.10% 16.822us 0.701us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 14.19% 216.745us 14.19% 216.745us 4.516us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.34% 5.240us 0.34% 5.240us 5.240us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 924.864us 536.51% 924.864us 924.864us 1 + torch_eager 10.45% 305.132us 99.82% 2.915ms 2.915ms 0.000us 0.00% 175.296us 175.296us 1 + aten::mul 5.14% 150.181us 8.96% 261.653us 10.902us 89.408us 51.87% 89.408us 3.725us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 89.408us 51.87% 89.408us 3.725us 24 + aten::copy_ 3.52% 102.713us 71.68% 2.093ms 116.286us 57.856us 33.56% 60.768us 3.376us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.768us 23.65% 40.768us 3.397us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 25.120us 14.57% 25.120us 2.093us 12 + aten::clone 1.06% 30.960us 69.26% 2.022ms 337.044us 0.000us 0.00% 20.000us 3.333us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.088us 9.91% 17.088us 2.848us 6 + aten::add 1.07% 31.150us 1.81% 52.780us 8.797us 12.672us 7.35% 12.672us 2.112us 6 + aten::sub 1.22% 35.659us 2.01% 58.661us 9.777us 12.448us 7.22% 12.448us 2.075us 6 + Activity Buffer Request 59.67% 1.742ms 59.67% 1.742ms 1.742ms 2.912us 1.69% 2.912us 2.912us 1 + aten::empty_strided 1.10% 32.260us 1.10% 32.260us 5.377us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 6.30% 183.953us 6.30% 183.953us 30.659us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.15% 62.747us 2.74% 80.125us 3.339us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.60% 17.378us 0.60% 17.378us 0.724us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 7.55% 220.376us 7.55% 220.376us 4.591us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.18% 5.231us 0.18% 5.231us 5.231us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.528ms -Self CUDA time total: 172.477us +Self CPU time total: 2.920ms +Self CUDA time total: 172.384us @@ -4256,27 +4256,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S2048_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 923.899us 767.46% 923.899us 923.899us 1 - torch_eager 19.14% 287.798us 99.65% 1.499ms 1.499ms 0.000us 0.00% 122.144us 122.144us 1 - aten::mul 10.49% 157.698us 17.70% 266.255us 11.094us 61.982us 51.49% 61.982us 2.583us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 61.982us 51.49% 61.982us 2.583us 24 - aten::copy_ 6.99% 105.118us 46.36% 697.187us 38.733us 39.264us 32.62% 41.024us 2.279us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.832us 23.95% 28.832us 2.403us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.138us 15.90% 19.138us 1.595us 12 - aten::clone 1.32% 19.822us 40.79% 613.519us 102.253us 0.000us 0.00% 12.192us 2.032us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.432us 8.67% 10.432us 1.739us 6 - aten::sub 2.51% 37.801us 4.08% 61.341us 10.224us 9.570us 7.95% 9.570us 1.595us 6 - aten::add 2.16% 32.471us 3.63% 54.661us 9.110us 9.568us 7.95% 9.568us 1.595us 6 - Activity Buffer Request 16.71% 251.314us 16.71% 251.314us 251.314us 1.760us 1.46% 1.760us 1.760us 1 - aten::empty_strided 2.00% 30.060us 2.00% 30.060us 5.010us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 18.58% 279.394us 18.58% 279.394us 46.566us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.31% 64.750us 5.43% 81.609us 3.400us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 1.12% 16.859us 1.12% 16.859us 0.702us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 14.34% 215.648us 14.34% 215.648us 4.493us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.35% 5.220us 0.35% 5.220us 5.220us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 891.930us 739.34% 891.930us 891.930us 1 + torch_eager 21.05% 265.905us 99.59% 1.258ms 1.258ms 0.000us 0.00% 122.398us 122.398us 1 + aten::mul 11.89% 150.169us 20.37% 257.293us 10.721us 62.046us 51.43% 62.046us 2.585us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 62.046us 51.43% 62.046us 2.585us 24 + aten::copy_ 7.91% 99.966us 39.16% 494.681us 27.482us 39.361us 32.63% 41.121us 2.285us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.961us 24.01% 28.961us 2.413us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.231us 15.94% 19.231us 1.603us 12 + aten::clone 1.55% 19.527us 32.87% 415.246us 69.208us 0.000us 0.00% 12.160us 2.027us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.400us 8.62% 10.400us 1.733us 6 + aten::add 2.51% 31.670us 4.18% 52.860us 8.810us 9.663us 8.01% 9.663us 1.611us 6 + aten::sub 2.77% 34.981us 4.60% 58.113us 9.685us 9.568us 7.93% 9.568us 1.595us 6 + Activity Buffer Request 11.68% 147.573us 11.68% 147.573us 147.573us 1.760us 1.46% 1.760us 1.760us 1 + aten::empty_strided 2.49% 31.403us 2.49% 31.403us 5.234us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 14.63% 184.842us 14.63% 184.842us 30.807us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.89% 61.717us 6.20% 78.378us 3.266us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 1.32% 16.661us 1.32% 16.661us 0.694us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 16.92% 213.746us 16.92% 213.746us 4.453us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.41% 5.161us 0.41% 5.161us 5.161us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.504ms -Self CUDA time total: 120.384us +Self CPU time total: 1.263ms +Self CUDA time total: 120.638us @@ -4286,27 +4286,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S2048_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 943.259us 547.68% 943.259us 943.259us 1 - torch_eager 9.82% 293.988us 99.82% 2.988ms 2.988ms 0.000us 0.00% 175.075us 175.075us 1 - aten::mul 5.17% 154.631us 8.81% 263.742us 10.989us 89.536us 51.99% 89.536us 3.731us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 89.536us 51.99% 89.536us 3.731us 24 - aten::copy_ 3.66% 109.570us 72.53% 2.171ms 120.590us 57.795us 33.56% 60.643us 3.369us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.835us 23.71% 40.835us 3.403us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 24.896us 14.46% 24.896us 2.075us 12 - aten::clone 0.74% 22.030us 69.74% 2.087ms 347.874us 0.000us 0.00% 19.808us 3.301us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.960us 9.85% 16.960us 2.827us 6 - aten::add 1.10% 32.890us 1.87% 55.840us 9.307us 12.481us 7.25% 12.481us 2.080us 6 - aten::sub 1.28% 38.273us 2.11% 63.142us 10.524us 12.415us 7.21% 12.415us 2.069us 6 - Activity Buffer Request 58.02% 1.736ms 58.02% 1.736ms 1.736ms 2.848us 1.65% 2.848us 2.848us 1 - aten::empty_strided 1.00% 30.050us 1.00% 30.050us 5.008us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 8.83% 264.325us 8.83% 264.325us 44.054us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.36% 70.650us 2.95% 88.161us 3.673us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.59% 17.511us 0.59% 17.511us 0.730us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 7.26% 217.282us 7.26% 217.282us 4.527us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.18% 5.289us 0.18% 5.289us 5.289us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 896.465us 520.02% 896.465us 896.465us 1 + torch_eager 20.64% 267.500us 99.56% 1.290ms 1.290ms 0.000us 0.00% 175.237us 175.237us 1 + aten::mul 11.74% 152.190us 20.16% 261.202us 10.883us 89.569us 51.96% 89.569us 3.732us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 89.569us 51.96% 89.569us 3.732us 24 + aten::copy_ 7.78% 100.776us 40.08% 519.323us 28.851us 57.731us 33.49% 60.579us 3.366us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.803us 23.67% 40.803us 3.400us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 25.089us 14.55% 25.089us 2.091us 12 + aten::clone 1.61% 20.829us 34.03% 440.978us 73.496us 0.000us 0.00% 19.776us 3.296us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.928us 9.82% 16.928us 2.821us 6 + aten::add 2.36% 30.520us 4.00% 51.870us 8.645us 12.608us 7.31% 12.608us 2.101us 6 + aten::sub 2.79% 36.204us 4.56% 59.064us 9.844us 12.481us 7.24% 12.481us 2.080us 6 + Activity Buffer Request 13.56% 175.743us 13.56% 175.743us 175.743us 2.848us 1.65% 2.848us 2.848us 1 + aten::empty_strided 2.40% 31.051us 2.40% 31.051us 5.175us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 13.98% 181.154us 13.98% 181.154us 30.192us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.80% 62.202us 6.12% 79.302us 3.304us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 1.32% 17.100us 1.32% 17.100us 0.713us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 16.58% 214.872us 16.58% 214.872us 4.477us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.44% 5.731us 0.44% 5.731us 5.731us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.993ms -Self CUDA time total: 172.227us +Self CPU time total: 1.296ms +Self CUDA time total: 172.389us @@ -4316,27 +4316,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S2048_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 922.006us 322.11% 922.006us 922.006us 1 - torch_eager 19.42% 278.764us 99.64% 1.431ms 1.431ms 0.000us 0.00% 304.543us 304.543us 1 - aten::mul 10.68% 153.400us 18.09% 259.803us 10.825us 134.112us 46.85% 134.112us 5.588us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 134.112us 46.85% 134.112us 5.588us 24 - aten::copy_ 7.65% 109.831us 44.83% 643.670us 35.759us 111.232us 38.86% 129.536us 7.196us 18 - aten::clone 1.43% 20.539us 38.82% 557.349us 92.892us 0.000us 0.00% 72.160us 12.027us 6 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 57.376us 20.04% 57.376us 4.781us 12 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 53.856us 18.82% 53.856us 8.976us 6 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 40.895us 14.29% 40.895us 3.408us 12 - aten::sub 2.68% 38.501us 4.30% 61.692us 10.282us 20.543us 7.18% 20.543us 3.424us 6 - aten::add 2.29% 32.829us 3.81% 54.730us 9.122us 20.352us 7.11% 20.352us 3.392us 6 - Activity Buffer Request 16.08% 230.904us 16.08% 230.904us 230.904us 18.304us 6.39% 18.304us 18.304us 1 - aten::empty_strided 2.06% 29.601us 2.06% 29.601us 4.933us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 16.83% 241.674us 16.83% 241.674us 40.279us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.51% 64.754us 5.69% 81.743us 3.406us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 1.18% 16.989us 1.18% 16.989us 0.708us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 14.82% 212.756us 14.82% 212.756us 4.432us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.36% 5.240us 0.36% 5.240us 5.240us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 907.999us 318.14% 907.999us 907.999us 1 + torch_eager 10.33% 297.182us 99.78% 2.870ms 2.870ms 0.000us 0.00% 303.265us 303.265us 1 + aten::mul 5.13% 147.584us 8.90% 255.976us 10.666us 133.567us 46.80% 133.567us 5.565us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 133.567us 46.80% 133.567us 5.565us 24 + aten::copy_ 3.51% 101.030us 71.83% 2.067ms 114.810us 110.722us 38.79% 128.578us 7.143us 18 + aten::clone 0.94% 26.992us 69.34% 1.995ms 332.479us 0.000us 0.00% 71.233us 11.872us 6 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 57.345us 20.09% 57.345us 4.779us 12 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 53.377us 18.70% 53.377us 8.896us 6 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 41.120us 14.41% 41.120us 3.427us 12 + aten::sub 1.21% 34.680us 2.01% 57.770us 9.628us 20.833us 7.30% 20.833us 3.472us 6 + aten::add 1.09% 31.394us 1.89% 54.514us 9.086us 20.287us 7.11% 20.287us 3.381us 6 + Activity Buffer Request 59.89% 1.723ms 59.89% 1.723ms 1.723ms 17.856us 6.26% 17.856us 17.856us 1 + aten::empty_strided 1.06% 30.410us 1.06% 30.410us 5.068us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 6.35% 182.753us 6.35% 182.753us 30.459us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.22% 63.807us 2.82% 81.011us 3.375us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.60% 17.204us 0.60% 17.204us 0.717us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 7.46% 214.543us 7.46% 214.543us 4.470us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.22% 6.420us 0.22% 6.420us 6.420us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.436ms -Self CUDA time total: 286.239us +Self CPU time total: 2.877ms +Self CUDA time total: 285.409us @@ -4346,27 +4346,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S2048_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 970.352us 169.72% 970.352us 970.352us 1 - torch_eager 19.50% 289.365us 99.64% 1.478ms 1.478ms 0.000us 0.00% 595.480us 595.480us 1 - aten::copy_ 7.05% 104.551us 43.31% 642.598us 35.700us 273.596us 47.85% 297.340us 16.519us 18 - aten::mul 11.63% 172.532us 19.46% 288.666us 12.028us 232.863us 40.73% 232.863us 9.703us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 232.863us 40.73% 232.863us 9.703us 24 - aten::clone 1.45% 21.521us 37.67% 558.878us 93.146us 0.000us 0.00% 205.949us 34.325us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 182.205us 31.87% 182.205us 30.367us 6 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 91.391us 15.98% 91.391us 7.616us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 65.277us 11.42% 65.277us 5.440us 12 - aten::sub 2.70% 40.111us 4.36% 64.701us 10.784us 32.768us 5.73% 32.768us 5.461us 6 - aten::add 2.31% 34.320us 3.88% 57.510us 9.585us 32.509us 5.69% 32.509us 5.418us 6 - Activity Buffer Request 17.48% 259.324us 17.48% 259.324us 259.324us 23.744us 4.15% 23.744us 23.744us 1 - aten::empty_strided 2.00% 29.720us 2.00% 29.720us 4.953us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 14.68% 217.742us 14.68% 217.742us 36.290us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.50% 66.694us 5.68% 84.252us 3.511us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 1.18% 17.558us 1.18% 17.558us 0.732us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 15.16% 224.895us 15.16% 224.895us 4.685us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.36% 5.340us 0.36% 5.340us 5.340us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 936.507us 164.60% 936.507us 936.507us 1 + torch_eager 21.10% 274.795us 99.58% 1.297ms 1.297ms 0.000us 0.00% 592.769us 592.769us 1 + aten::copy_ 7.72% 100.512us 38.34% 499.337us 27.741us 273.025us 47.99% 296.833us 16.491us 18 + aten::mul 11.94% 155.505us 20.72% 269.807us 11.242us 230.561us 40.52% 230.561us 9.607us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 230.561us 40.52% 230.561us 9.607us 24 + aten::clone 1.56% 20.351us 32.41% 422.047us 70.341us 0.000us 0.00% 206.465us 34.411us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 182.657us 32.10% 182.657us 30.443us 6 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 90.368us 15.88% 90.368us 7.531us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 65.375us 11.49% 65.375us 5.448us 12 + aten::sub 2.96% 38.510us 4.81% 62.580us 10.430us 32.991us 5.80% 32.991us 5.498us 6 + aten::add 2.43% 31.581us 4.16% 54.201us 9.034us 32.384us 5.69% 32.384us 5.397us 6 + Activity Buffer Request 11.20% 145.802us 11.20% 145.802us 145.802us 23.808us 4.18% 23.808us 23.808us 1 + aten::empty_strided 2.79% 36.361us 2.79% 36.361us 6.060us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 14.42% 187.792us 14.42% 187.792us 31.299us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.78% 62.259us 6.10% 79.429us 3.310us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 1.32% 17.170us 1.32% 17.170us 0.715us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 17.37% 226.223us 17.37% 226.223us 4.713us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.42% 5.440us 0.42% 5.440us 5.440us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.484ms -Self CUDA time total: 571.736us +Self CPU time total: 1.302ms +Self CUDA time total: 568.961us @@ -4376,27 +4376,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S128_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 936.155us 1011.59% 936.155us 936.155us 1 - torch_eager 9.66% 281.404us 99.82% 2.908ms 2.908ms 0.000us 0.00% 93.663us 93.663us 1 - aten::mul 5.48% 159.764us 9.36% 272.564us 11.357us 49.568us 53.56% 49.568us 2.065us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 49.568us 53.56% 49.568us 2.065us 24 - aten::copy_ 3.70% 107.711us 72.25% 2.105ms 116.944us 29.407us 31.78% 30.527us 1.696us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.591us 24.41% 22.591us 1.883us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.568us 14.66% 13.568us 1.131us 12 - aten::clone 0.74% 21.551us 69.34% 2.020ms 336.695us 0.000us 0.00% 7.936us 1.323us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.816us 7.37% 6.816us 1.136us 6 - aten::sub 1.31% 38.128us 2.13% 61.912us 10.319us 6.815us 7.36% 6.815us 1.136us 6 - aten::add 1.08% 31.450us 1.84% 53.600us 8.933us 6.753us 7.30% 6.753us 1.126us 6 - Activity Buffer Request 59.75% 1.741ms 59.75% 1.741ms 1.741ms 1.120us 1.21% 1.120us 1.120us 1 - aten::empty_strided 1.04% 30.170us 1.04% 30.170us 5.028us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 6.73% 196.044us 6.73% 196.044us 32.674us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.24% 65.300us 2.82% 82.022us 3.418us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.57% 16.722us 0.57% 16.722us 0.697us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 7.53% 219.305us 7.53% 219.305us 4.569us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.18% 5.160us 0.18% 5.160us 5.160us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 912.214us 987.11% 912.214us 912.214us 1 + torch_eager 20.73% 270.361us 99.60% 1.299ms 1.299ms 0.000us 0.00% 93.533us 93.533us 1 + aten::mul 11.80% 153.962us 20.35% 265.434us 11.060us 49.471us 53.53% 49.471us 2.061us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 49.471us 53.53% 49.471us 2.061us 24 + aten::copy_ 7.83% 102.175us 39.79% 518.911us 28.828us 29.343us 31.75% 30.463us 1.692us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.559us 24.41% 22.559us 1.880us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.599us 14.72% 13.599us 1.133us 12 + aten::clone 1.57% 20.491us 33.39% 435.488us 72.581us 0.000us 0.00% 7.904us 1.317us 6 + aten::add 2.51% 32.770us 4.19% 54.680us 9.113us 6.815us 7.37% 6.815us 1.136us 6 + aten::sub 2.78% 36.231us 4.59% 59.802us 9.967us 6.784us 7.34% 6.784us 1.131us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.784us 7.34% 6.784us 1.131us 6 + Activity Buffer Request 12.62% 164.552us 12.62% 164.552us 164.552us 1.120us 1.21% 1.120us 1.120us 1 + aten::empty_strided 2.31% 30.071us 2.31% 30.071us 5.012us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 14.34% 187.054us 14.34% 187.054us 31.176us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.77% 62.210us 6.08% 79.271us 3.303us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 1.31% 17.061us 1.31% 17.061us 0.711us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 17.03% 222.083us 17.03% 222.083us 4.627us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.40% 5.200us 0.40% 5.200us 5.200us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.913ms -Self CUDA time total: 92.543us +Self CPU time total: 1.304ms +Self CUDA time total: 92.413us @@ -4406,27 +4406,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S128_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 918.262us 956.86% 918.262us 918.262us 1 - torch_eager 20.02% 274.163us 99.62% 1.364ms 1.364ms 0.000us 0.00% 97.279us 97.279us 1 - aten::mul 11.52% 157.766us 19.39% 265.646us 11.069us 51.167us 53.32% 51.167us 2.132us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 51.167us 53.32% 51.167us 2.132us 24 - aten::copy_ 7.76% 106.268us 42.02% 575.576us 31.976us 30.720us 32.01% 32.033us 1.780us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.912us 23.88% 22.912us 1.909us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 14.079us 14.67% 14.079us 1.173us 12 - aten::clone 1.48% 20.322us 36.02% 493.298us 82.216us 0.000us 0.00% 9.121us 1.520us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.808us 8.14% 7.808us 1.301us 6 - aten::sub 2.81% 38.541us 4.49% 61.481us 10.247us 7.072us 7.37% 7.072us 1.179us 6 - aten::add 2.42% 33.131us 4.04% 55.302us 9.217us 7.007us 7.30% 7.007us 1.168us 6 - Activity Buffer Request 16.17% 221.544us 16.17% 221.544us 221.544us 1.313us 1.37% 1.313us 1.313us 1 - aten::empty_strided 2.33% 31.950us 2.33% 31.950us 5.325us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 13.69% 187.513us 13.69% 187.513us 31.252us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.61% 63.101us 5.84% 79.961us 3.332us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 1.23% 16.860us 1.23% 16.860us 0.702us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 15.57% 213.242us 15.57% 213.242us 4.443us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.38% 5.270us 0.38% 5.270us 5.270us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 927.225us 964.57% 927.225us 927.225us 1 + torch_eager 10.42% 298.637us 99.83% 2.862ms 2.862ms 0.000us 0.00% 97.440us 97.440us 1 + aten::mul 5.27% 151.202us 9.04% 259.193us 10.800us 51.264us 53.33% 51.264us 2.136us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 51.264us 53.33% 51.264us 2.136us 24 + aten::copy_ 3.63% 103.959us 71.40% 2.047ms 113.712us 30.719us 31.96% 32.031us 1.780us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.878us 23.80% 22.878us 1.907us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 14.145us 14.71% 14.145us 1.179us 12 + aten::clone 0.93% 26.740us 68.76% 1.971ms 328.494us 0.000us 0.00% 9.153us 1.526us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.841us 8.16% 7.841us 1.307us 6 + aten::add 1.34% 38.470us 2.12% 60.910us 10.152us 7.105us 7.39% 7.105us 1.184us 6 + aten::sub 1.24% 35.481us 2.03% 58.281us 9.714us 7.040us 7.32% 7.040us 1.173us 6 + Activity Buffer Request 58.96% 1.690ms 58.96% 1.690ms 1.690ms 1.312us 1.36% 1.312us 1.312us 1 + aten::empty_strided 1.07% 30.811us 1.07% 30.811us 5.135us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 6.60% 189.265us 6.60% 189.265us 31.544us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.21% 63.469us 2.80% 80.301us 3.346us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.59% 16.832us 0.59% 16.832us 0.701us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 7.56% 216.723us 7.56% 216.723us 4.515us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.17% 4.900us 0.17% 4.900us 4.900us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.370ms -Self CUDA time total: 95.966us +Self CPU time total: 2.867ms +Self CUDA time total: 96.128us @@ -4436,27 +4436,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S128_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 929.528us 892.96% 929.528us 929.528us 1 - torch_eager 20.25% 278.528us 99.63% 1.370ms 1.370ms 0.000us 0.00% 105.439us 105.439us 1 - aten::mul 11.59% 159.422us 19.60% 269.583us 11.233us 55.326us 53.15% 55.326us 2.305us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 55.326us 53.15% 55.326us 2.305us 24 - aten::copy_ 7.64% 105.130us 41.59% 572.021us 31.779us 32.351us 31.08% 33.695us 1.872us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.607us 23.64% 24.607us 2.051us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.418us 15.77% 16.418us 1.368us 12 - aten::clone 1.49% 20.431us 35.49% 488.057us 81.343us 0.000us 0.00% 9.088us 1.515us 6 - aten::sub 2.60% 35.723us 4.36% 59.953us 9.992us 8.258us 7.93% 8.258us 1.376us 6 - aten::add 2.46% 33.770us 4.07% 55.940us 9.323us 8.160us 7.84% 8.160us 1.360us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.744us 7.44% 7.744us 1.291us 6 - Activity Buffer Request 16.10% 221.454us 16.10% 221.454us 221.454us 1.344us 1.29% 1.344us 1.344us 1 - aten::empty_strided 2.25% 30.990us 2.25% 30.990us 5.165us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 13.30% 182.863us 13.30% 182.863us 30.477us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.81% 66.212us 6.02% 82.825us 3.451us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 1.21% 16.613us 1.21% 16.613us 0.692us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 15.93% 219.135us 15.93% 219.135us 4.565us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.37% 5.090us 0.37% 5.090us 5.090us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 904.125us 867.77% 904.125us 904.125us 1 + torch_eager 21.17% 270.270us 99.58% 1.271ms 1.271ms 0.000us 0.00% 105.501us 105.501us 1 + aten::mul 11.71% 149.569us 20.29% 259.044us 10.793us 55.455us 53.23% 55.455us 2.311us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 55.455us 53.23% 55.455us 2.311us 24 + aten::copy_ 7.94% 101.312us 39.18% 500.180us 27.788us 32.416us 31.11% 33.728us 1.874us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.704us 23.71% 24.704us 2.059us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.318us 15.66% 16.318us 1.360us 12 + aten::clone 1.62% 20.642us 32.82% 418.978us 69.830us 0.000us 0.00% 9.024us 1.504us 6 + aten::sub 2.73% 34.811us 4.52% 57.673us 9.612us 8.256us 7.92% 8.256us 1.376us 6 + aten::add 2.55% 32.590us 4.27% 54.470us 9.078us 8.062us 7.74% 8.062us 1.344us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.712us 7.40% 7.712us 1.285us 6 + Activity Buffer Request 11.60% 148.113us 11.60% 148.113us 148.113us 1.312us 1.26% 1.312us 1.312us 1 + aten::empty_strided 2.40% 30.670us 2.40% 30.670us 5.112us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 14.64% 186.881us 14.64% 186.881us 31.147us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.82% 61.602us 6.15% 78.501us 3.271us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 1.32% 16.899us 1.32% 16.899us 0.704us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 17.08% 218.091us 17.08% 218.091us 4.544us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.42% 5.300us 0.42% 5.300us 5.300us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.375ms -Self CUDA time total: 104.095us +Self CPU time total: 1.277ms +Self CUDA time total: 104.189us @@ -4466,27 +4466,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S128_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 943.134us 762.57% 943.134us 943.134us 1 - torch_eager 9.91% 288.756us 99.81% 2.907ms 2.907ms 0.000us 0.00% 125.503us 125.503us 1 - aten::mul 5.47% 159.428us 9.14% 266.247us 11.094us 65.088us 52.63% 65.088us 2.712us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 65.088us 52.63% 65.088us 2.712us 24 - aten::copy_ 3.82% 111.411us 72.08% 2.100ms 116.650us 39.391us 31.85% 41.215us 2.290us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.799us 23.29% 28.799us 2.400us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.200us 15.52% 19.200us 1.600us 12 - aten::clone 0.71% 20.821us 69.14% 2.014ms 335.649us 0.000us 0.00% 12.416us 2.069us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.592us 8.56% 10.592us 1.765us 6 - aten::sub 1.35% 39.440us 2.20% 63.980us 10.663us 9.632us 7.79% 9.632us 1.605us 6 - aten::add 1.16% 33.802us 1.92% 55.961us 9.327us 9.568us 7.74% 9.568us 1.595us 6 - Activity Buffer Request 59.81% 1.742ms 59.81% 1.742ms 1.742ms 1.824us 1.47% 1.824us 1.824us 1 - aten::empty_strided 1.06% 30.871us 1.06% 30.871us 5.145us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 6.32% 184.202us 6.32% 184.202us 30.700us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.20% 64.120us 2.78% 80.888us 3.370us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.58% 16.768us 0.58% 16.768us 0.699us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 7.39% 215.298us 7.39% 215.298us 4.485us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.19% 5.660us 0.19% 5.660us 5.660us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 918.160us 742.36% 918.160us 918.160us 1 + torch_eager 20.43% 265.704us 99.60% 1.295ms 1.295ms 0.000us 0.00% 125.506us 125.506us 1 + aten::mul 11.98% 155.738us 20.81% 270.642us 11.277us 65.023us 52.57% 65.023us 2.709us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 65.023us 52.57% 65.023us 2.709us 24 + aten::copy_ 7.90% 102.742us 39.27% 510.720us 28.373us 39.331us 31.80% 41.155us 2.286us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.898us 23.36% 28.898us 2.408us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.328us 15.63% 19.328us 1.611us 12 + aten::clone 1.56% 20.250us 32.83% 426.918us 71.153us 0.000us 0.00% 12.257us 2.043us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.433us 8.44% 10.433us 1.739us 6 + aten::add 2.56% 33.260us 4.26% 55.420us 9.237us 9.728us 7.87% 9.728us 1.621us 6 + aten::sub 2.77% 36.070us 4.64% 60.350us 10.058us 9.600us 7.76% 9.600us 1.600us 6 + Activity Buffer Request 12.28% 159.673us 12.28% 159.673us 159.673us 1.824us 1.47% 1.824us 1.824us 1 + aten::empty_strided 2.40% 31.241us 2.40% 31.241us 5.207us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 14.07% 182.993us 14.07% 182.993us 30.499us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.89% 63.631us 6.22% 80.874us 3.370us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 1.33% 17.243us 1.33% 17.243us 0.718us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 17.43% 226.656us 17.43% 226.656us 4.722us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.40% 5.220us 0.40% 5.220us 5.220us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.913ms -Self CUDA time total: 123.679us +Self CPU time total: 1.300ms +Self CUDA time total: 123.682us @@ -4496,27 +4496,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S512_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 926.451us 888.37% 926.451us 926.451us 1 - torch_eager 20.56% 277.090us 99.61% 1.342ms 1.342ms 0.000us 0.00% 105.599us 105.599us 1 - aten::mul 11.75% 158.363us 19.88% 267.883us 11.162us 55.423us 53.14% 55.423us 2.309us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 55.423us 53.14% 55.423us 2.309us 24 - aten::copy_ 7.94% 107.035us 40.62% 547.383us 30.410us 32.352us 31.02% 33.664us 1.870us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.640us 23.63% 24.640us 2.053us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.512us 15.83% 16.512us 1.376us 12 - aten::clone 1.47% 19.840us 34.29% 462.099us 77.016us 0.000us 0.00% 9.024us 1.504us 6 - aten::sub 2.93% 39.461us 4.68% 63.054us 10.509us 8.287us 7.95% 8.287us 1.381us 6 - aten::add 2.50% 33.680us 4.16% 56.100us 9.350us 8.225us 7.89% 8.225us 1.371us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.712us 7.39% 7.712us 1.285us 6 - Activity Buffer Request 14.74% 198.654us 14.74% 198.654us 198.654us 1.312us 1.26% 1.312us 1.312us 1 - aten::empty_strided 2.26% 30.481us 2.26% 30.481us 5.080us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 13.39% 180.523us 13.39% 180.523us 30.087us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.73% 63.708us 5.98% 80.630us 3.360us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 1.26% 16.922us 1.26% 16.922us 0.705us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 16.08% 216.704us 16.08% 216.704us 4.515us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.39% 5.231us 0.39% 5.231us 5.231us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 920.156us 882.07% 920.156us 920.156us 1 + torch_eager 10.51% 302.454us 99.79% 2.872ms 2.872ms 0.000us 0.00% 105.630us 105.630us 1 + aten::mul 5.35% 153.872us 9.14% 263.076us 10.962us 55.392us 53.10% 55.392us 2.308us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 55.392us 53.10% 55.392us 2.308us 24 + aten::copy_ 3.49% 100.493us 71.36% 2.053ms 114.078us 32.479us 31.13% 33.791us 1.877us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.736us 23.71% 24.736us 2.061us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.447us 15.77% 16.447us 1.371us 12 + aten::clone 1.01% 28.981us 68.90% 1.983ms 330.436us 0.000us 0.00% 9.055us 1.509us 6 + aten::sub 1.22% 35.200us 2.05% 58.980us 9.830us 8.288us 7.94% 8.288us 1.381us 6 + aten::add 1.08% 30.999us 1.81% 52.042us 8.674us 8.159us 7.82% 8.159us 1.360us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.743us 7.42% 7.743us 1.290us 6 + Activity Buffer Request 59.15% 1.702ms 59.15% 1.702ms 1.702ms 1.312us 1.26% 1.312us 1.312us 1 + aten::empty_strided 1.11% 32.080us 1.11% 32.080us 5.347us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 6.49% 186.892us 6.49% 186.892us 31.149us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.21% 63.599us 2.80% 80.681us 3.362us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.59% 17.082us 0.59% 17.082us 0.712us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 7.57% 217.817us 7.57% 217.817us 4.538us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.21% 6.030us 0.21% 6.030us 6.030us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.348ms -Self CUDA time total: 104.287us +Self CPU time total: 2.878ms +Self CUDA time total: 104.318us @@ -4526,27 +4526,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S512_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 931.662us 754.64% 931.662us 931.662us 1 - torch_eager 20.88% 278.302us 99.60% 1.328ms 1.328ms 0.000us 0.00% 125.281us 125.281us 1 - aten::mul 11.71% 156.112us 20.55% 273.936us 11.414us 65.153us 52.77% 65.153us 2.715us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 65.153us 52.77% 65.153us 2.715us 24 - aten::copy_ 7.95% 105.951us 39.52% 526.779us 29.265us 39.169us 31.73% 40.993us 2.277us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.737us 23.28% 28.737us 2.395us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.135us 15.50% 19.135us 1.595us 12 - aten::clone 1.44% 19.200us 33.27% 443.406us 73.901us 0.000us 0.00% 12.256us 2.043us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.432us 8.45% 10.432us 1.739us 6 - aten::sub 2.81% 37.440us 4.58% 61.110us 10.185us 9.632us 7.80% 9.632us 1.605us 6 - aten::add 2.52% 33.611us 4.17% 55.611us 9.268us 9.503us 7.70% 9.503us 1.584us 6 - Activity Buffer Request 13.21% 176.083us 13.21% 176.083us 176.083us 1.824us 1.48% 1.824us 1.824us 1 - aten::empty_strided 2.29% 30.570us 2.29% 30.570us 5.095us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 13.82% 184.192us 13.82% 184.192us 30.699us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.90% 65.274us 6.16% 82.123us 3.422us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 1.26% 16.849us 1.26% 16.849us 0.702us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 16.81% 224.047us 16.81% 224.047us 4.668us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.40% 5.310us 0.40% 5.310us 5.310us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 907.897us 732.75% 907.897us 907.897us 1 + torch_eager 10.14% 290.438us 99.82% 2.860ms 2.860ms 0.000us 0.00% 125.694us 125.694us 1 + aten::mul 5.20% 149.121us 8.91% 255.361us 10.640us 65.405us 52.79% 65.405us 2.725us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 65.405us 52.79% 65.405us 2.725us 24 + aten::copy_ 3.52% 100.835us 71.92% 2.061ms 114.473us 39.359us 31.77% 41.151us 2.286us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.927us 23.35% 28.927us 2.411us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.138us 15.45% 19.138us 1.595us 12 + aten::clone 0.91% 26.167us 69.31% 1.986ms 330.944us 0.000us 0.00% 12.224us 2.037us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.432us 8.42% 10.432us 1.739us 6 + aten::sub 1.49% 42.714us 2.30% 65.852us 10.975us 9.601us 7.75% 9.601us 1.600us 6 + aten::add 1.10% 31.550us 1.85% 52.920us 8.820us 9.537us 7.70% 9.537us 1.589us 6 + Activity Buffer Request 59.69% 1.710ms 59.69% 1.710ms 1.710ms 1.792us 1.45% 1.792us 1.792us 1 + aten::empty_strided 1.05% 30.111us 1.05% 30.111us 5.018us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 6.49% 186.023us 6.49% 186.023us 31.004us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.15% 61.602us 2.74% 78.501us 3.271us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.59% 16.899us 0.59% 16.899us 0.704us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 7.48% 214.199us 7.48% 214.199us 4.462us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.18% 5.240us 0.18% 5.240us 5.240us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.333ms -Self CUDA time total: 123.457us +Self CPU time total: 2.865ms +Self CUDA time total: 123.902us @@ -4556,27 +4556,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S512_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 944.092us 532.26% 944.092us 944.092us 1 - torch_eager 9.66% 282.874us 99.81% 2.921ms 2.921ms 0.000us 0.00% 180.253us 180.253us 1 - aten::mul 5.51% 161.402us 9.28% 271.603us 11.317us 95.040us 53.58% 95.040us 3.960us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 95.040us 53.58% 95.040us 3.960us 24 - aten::copy_ 3.62% 106.065us 72.07% 2.109ms 117.193us 57.663us 32.51% 60.543us 3.364us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.703us 22.95% 40.703us 3.392us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 24.670us 13.91% 24.670us 2.056us 12 - aten::clone 0.77% 22.428us 69.22% 2.026ms 337.680us 0.000us 0.00% 19.840us 3.307us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.960us 9.56% 16.960us 2.827us 6 - aten::add 1.16% 34.010us 1.95% 57.150us 9.525us 12.383us 6.98% 12.383us 2.064us 6 - aten::sub 1.32% 38.563us 2.15% 62.972us 10.495us 12.287us 6.93% 12.287us 2.048us 6 - Activity Buffer Request 59.97% 1.755ms 59.97% 1.755ms 1.755ms 2.880us 1.62% 2.880us 2.880us 1 - aten::empty_strided 1.05% 30.691us 1.05% 30.691us 5.115us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 6.31% 184.633us 6.31% 184.633us 30.772us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.32% 67.977us 2.88% 84.170us 3.507us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.55% 16.193us 0.55% 16.193us 0.675us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 7.56% 221.262us 7.56% 221.262us 4.610us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.19% 5.669us 0.19% 5.669us 5.669us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 893.688us 504.20% 893.688us 893.688us 1 + torch_eager 20.45% 267.171us 99.57% 1.301ms 1.301ms 0.000us 0.00% 180.096us 180.096us 1 + aten::mul 11.55% 150.960us 19.81% 258.802us 10.783us 94.720us 53.44% 94.720us 3.947us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 94.720us 53.44% 94.720us 3.947us 24 + aten::copy_ 7.76% 101.418us 41.00% 535.738us 29.763us 57.761us 32.59% 60.609us 3.367us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.769us 23.00% 40.769us 3.397us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 24.767us 13.97% 24.767us 2.064us 12 + aten::clone 1.49% 19.482us 34.75% 454.110us 75.685us 0.000us 0.00% 19.840us 3.307us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.992us 9.59% 16.992us 2.832us 6 + aten::add 2.31% 30.129us 3.96% 51.730us 8.622us 12.479us 7.04% 12.479us 2.080us 6 + aten::sub 2.69% 35.182us 4.51% 58.883us 9.814us 12.288us 6.93% 12.288us 2.048us 6 + Activity Buffer Request 14.52% 189.694us 14.52% 189.694us 189.694us 2.848us 1.61% 2.848us 2.848us 1 + aten::empty_strided 2.22% 29.022us 2.22% 29.022us 4.837us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 14.04% 183.444us 14.04% 183.444us 30.574us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.81% 62.873us 6.14% 80.203us 3.342us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 1.33% 17.330us 1.33% 17.330us 0.722us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 16.40% 214.326us 16.40% 214.326us 4.465us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.43% 5.660us 0.43% 5.660us 5.660us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.927ms -Self CUDA time total: 177.373us +Self CPU time total: 1.307ms +Self CUDA time total: 177.248us @@ -4586,27 +4586,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S512_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 956.029us 320.35% 956.029us 956.029us 1 - torch_eager 10.28% 306.488us 99.82% 2.977ms 2.977ms 0.000us 0.00% 316.194us 316.194us 1 - aten::mul 5.10% 152.001us 8.95% 266.845us 11.119us 146.560us 49.11% 146.560us 6.107us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 146.560us 49.11% 146.560us 6.107us 24 - aten::copy_ 3.72% 110.901us 71.64% 2.137ms 118.718us 110.754us 37.11% 128.514us 7.140us 18 - aten::clone 0.97% 28.901us 68.99% 2.058ms 342.957us 0.000us 0.00% 70.944us 11.824us 6 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 57.570us 19.29% 57.570us 4.797us 12 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 53.184us 17.82% 53.184us 8.864us 6 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 41.120us 13.78% 41.120us 3.427us 12 - aten::add 1.16% 34.740us 1.93% 57.500us 9.583us 20.641us 6.92% 20.641us 3.440us 6 - aten::sub 1.34% 39.998us 2.18% 65.101us 10.850us 20.479us 6.86% 20.479us 3.413us 6 - Activity Buffer Request 59.58% 1.777ms 59.58% 1.777ms 1.777ms 17.760us 5.95% 17.760us 17.760us 1 - aten::empty_strided 1.05% 31.260us 1.05% 31.260us 5.210us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 6.26% 186.663us 6.26% 186.663us 31.111us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.24% 66.809us 2.82% 84.238us 3.510us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.58% 17.429us 0.58% 17.429us 0.726us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 7.54% 224.919us 7.54% 224.919us 4.686us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.18% 5.469us 0.18% 5.469us 5.469us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 902.974us 302.94% 902.974us 902.974us 1 + torch_eager 20.55% 268.018us 99.60% 1.299ms 1.299ms 0.000us 0.00% 315.867us 315.867us 1 + aten::mul 11.45% 149.281us 19.89% 259.364us 10.807us 145.821us 48.92% 145.821us 6.076us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 145.821us 48.92% 145.821us 6.076us 24 + aten::copy_ 7.72% 100.690us 40.33% 526.007us 29.223us 111.006us 37.24% 128.798us 7.155us 18 + aten::clone 1.57% 20.500us 34.18% 445.716us 74.286us 0.000us 0.00% 71.584us 11.931us 6 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 57.214us 19.19% 57.214us 4.768us 12 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 53.792us 18.05% 53.792us 8.965us 6 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 41.248us 13.84% 41.248us 3.437us 12 + aten::sub 2.77% 36.180us 4.59% 59.800us 9.967us 20.672us 6.94% 20.672us 3.445us 6 + aten::add 2.40% 31.251us 4.19% 54.701us 9.117us 20.576us 6.90% 20.576us 3.429us 6 + Activity Buffer Request 13.76% 179.443us 13.76% 179.443us 179.443us 17.792us 5.97% 17.792us 17.792us 1 + aten::empty_strided 2.33% 30.440us 2.33% 30.440us 5.073us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 14.06% 183.364us 14.06% 183.364us 30.561us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.83% 63.022us 6.14% 80.061us 3.336us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 1.31% 17.039us 1.31% 17.039us 0.710us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 16.84% 219.663us 16.84% 219.663us 4.576us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.40% 5.210us 0.40% 5.210us 5.210us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.983ms -Self CUDA time total: 298.434us +Self CPU time total: 1.304ms +Self CUDA time total: 298.075us @@ -4616,27 +4616,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S2048_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 916.392us 515.61% 916.392us 916.392us 1 - torch_eager 19.58% 274.201us 99.60% 1.394ms 1.394ms 0.000us 0.00% 180.610us 180.610us 1 - aten::mul 11.24% 157.371us 18.87% 264.183us 11.008us 95.074us 53.49% 95.074us 3.961us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 95.074us 53.49% 95.074us 3.961us 24 - aten::copy_ 7.77% 108.775us 43.49% 608.863us 33.826us 57.825us 32.54% 60.705us 3.373us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.897us 23.01% 40.897us 3.408us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 24.831us 13.97% 24.831us 2.069us 12 - aten::clone 1.40% 19.580us 37.38% 523.368us 87.228us 0.000us 0.00% 19.808us 3.301us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.928us 9.52% 16.928us 2.821us 6 - aten::add 2.38% 33.360us 4.00% 56.040us 9.340us 12.416us 6.99% 12.416us 2.069us 6 - aten::sub 2.76% 38.582us 4.39% 61.472us 10.245us 12.415us 6.99% 12.415us 2.069us 6 - Activity Buffer Request 18.14% 253.955us 18.14% 253.955us 253.955us 2.880us 1.62% 2.880us 2.880us 1 - aten::empty_strided 2.13% 29.860us 2.13% 29.860us 4.977us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 13.38% 187.273us 13.38% 187.273us 31.212us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.53% 63.391us 5.73% 80.293us 3.346us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 1.21% 16.902us 1.21% 16.902us 0.704us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 15.09% 211.242us 15.09% 211.242us 4.401us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.40% 5.600us 0.40% 5.600us 5.600us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 892.343us 502.65% 892.343us 892.343us 1 + torch_eager 20.29% 263.739us 99.57% 1.294ms 1.294ms 0.000us 0.00% 180.376us 180.376us 1 + aten::mul 11.80% 153.414us 20.02% 260.283us 10.845us 95.163us 53.60% 95.163us 3.965us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 95.163us 53.60% 95.163us 3.965us 24 + aten::copy_ 7.77% 100.952us 40.82% 530.557us 29.475us 57.598us 32.44% 60.446us 3.358us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.672us 22.91% 40.672us 3.389us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 24.767us 13.95% 24.767us 2.064us 12 + aten::clone 1.59% 20.651us 34.74% 451.538us 75.256us 0.000us 0.00% 19.774us 3.296us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.926us 9.53% 16.926us 2.821us 6 + aten::add 2.40% 31.260us 4.02% 52.270us 8.712us 12.447us 7.01% 12.447us 2.074us 6 + aten::sub 2.80% 36.342us 4.59% 59.652us 9.942us 12.320us 6.94% 12.320us 2.053us 6 + Activity Buffer Request 14.29% 185.813us 14.29% 185.813us 185.813us 2.848us 1.60% 2.848us 2.848us 1 + aten::empty_strided 2.21% 28.780us 2.21% 28.780us 4.797us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 14.07% 182.883us 14.07% 182.883us 30.480us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.73% 61.469us 6.03% 78.389us 3.266us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 1.30% 16.920us 1.30% 16.920us 0.705us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 16.32% 212.098us 16.32% 212.098us 4.419us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.43% 5.551us 0.43% 5.551us 5.551us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.400ms -Self CUDA time total: 177.730us +Self CPU time total: 1.300ms +Self CUDA time total: 177.528us @@ -4646,27 +4646,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S2048_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 934.618us 312.71% 934.618us 934.618us 1 - torch_eager 20.60% 280.895us 99.62% 1.358ms 1.358ms 0.000us 0.00% 316.921us 316.921us 1 - aten::mul 11.57% 157.759us 19.61% 267.373us 11.141us 146.460us 49.00% 146.460us 6.102us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 146.460us 49.00% 146.460us 6.102us 24 - aten::copy_ 8.07% 110.072us 41.19% 561.700us 31.206us 111.966us 37.46% 130.013us 7.223us 18 - aten::clone 1.51% 20.600us 34.77% 474.096us 79.016us 0.000us 0.00% 72.670us 12.112us 6 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 57.343us 19.19% 57.343us 4.779us 12 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 54.623us 18.28% 54.623us 9.104us 6 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 40.448us 13.53% 40.448us 3.371us 12 - aten::add 2.59% 35.260us 4.22% 57.590us 9.598us 20.288us 6.79% 20.288us 3.381us 6 - aten::sub 2.60% 35.410us 4.30% 58.621us 9.770us 20.160us 6.75% 20.160us 3.360us 6 - Activity Buffer Request 14.73% 200.853us 14.73% 200.853us 200.853us 18.047us 6.04% 18.047us 18.047us 1 - aten::empty_strided 2.18% 29.660us 2.18% 29.660us 4.943us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 13.85% 188.823us 13.85% 188.823us 31.471us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.75% 64.754us 6.01% 81.922us 3.413us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 1.26% 17.168us 1.26% 17.168us 0.715us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 15.92% 217.107us 15.92% 217.107us 4.523us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.38% 5.180us 0.38% 5.180us 5.180us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 904.750us 303.27% 904.750us 904.750us 1 + torch_eager 21.10% 268.921us 99.56% 1.269ms 1.269ms 0.000us 0.00% 316.413us 316.413us 1 + aten::mul 11.89% 151.482us 20.35% 259.306us 10.804us 145.920us 48.91% 145.920us 6.080us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 145.920us 48.91% 145.920us 6.080us 24 + aten::copy_ 8.00% 101.944us 39.08% 497.980us 27.666us 111.613us 37.41% 129.693us 7.205us 18 + aten::clone 1.62% 20.659us 32.72% 416.907us 69.484us 0.000us 0.00% 72.512us 12.085us 6 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 57.181us 19.17% 57.181us 4.765us 12 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 54.432us 18.25% 54.432us 9.072us 6 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 40.800us 13.68% 40.800us 3.400us 12 + aten::add 2.47% 31.422us 4.17% 53.191us 8.865us 20.512us 6.88% 20.512us 3.419us 6 + aten::sub 2.72% 34.608us 4.46% 56.810us 9.468us 20.288us 6.80% 20.288us 3.381us 6 + Activity Buffer Request 10.32% 131.451us 10.32% 131.451us 131.451us 18.080us 6.06% 18.080us 18.080us 1 + aten::empty_strided 2.44% 31.142us 2.44% 31.142us 5.190us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 15.74% 200.533us 15.74% 200.533us 33.422us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.99% 63.539us 6.33% 80.621us 3.359us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 1.34% 17.082us 1.34% 17.082us 0.712us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 16.94% 215.847us 16.94% 215.847us 4.497us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.44% 5.651us 0.44% 5.651us 5.651us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.364ms -Self CUDA time total: 298.874us +Self CPU time total: 1.274ms +Self CUDA time total: 298.333us @@ -4676,27 +4676,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S2048_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 956.919us 161.50% 956.919us 956.919us 1 - torch_eager 21.30% 289.504us 99.57% 1.353ms 1.353ms 0.000us 0.00% 616.281us 616.281us 1 - aten::copy_ 7.84% 106.532us 38.89% 528.548us 29.364us 278.013us 46.92% 301.788us 16.766us 18 - aten::mul 11.95% 162.407us 20.79% 282.469us 11.770us 248.703us 41.97% 248.703us 10.363us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 248.703us 41.97% 248.703us 10.363us 24 - aten::clone 1.53% 20.799us 32.73% 444.735us 74.123us 0.000us 0.00% 210.204us 35.034us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 186.429us 31.46% 186.429us 31.072us 6 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 91.584us 15.46% 91.584us 7.632us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 65.790us 11.10% 65.790us 5.483us 12 - aten::add 2.44% 33.161us 4.08% 55.501us 9.250us 32.927us 5.56% 32.927us 5.488us 6 - aten::sub 2.95% 40.030us 4.74% 64.440us 10.740us 32.863us 5.55% 32.863us 5.477us 6 - Activity Buffer Request 13.07% 177.663us 13.07% 177.663us 177.663us 23.775us 4.01% 23.775us 23.775us 1 - aten::empty_strided 2.15% 29.270us 2.15% 29.270us 4.878us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 13.63% 185.172us 13.63% 185.172us 30.862us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.83% 65.662us 6.08% 82.660us 3.444us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 1.25% 16.998us 1.25% 16.998us 0.708us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 16.63% 225.993us 16.63% 225.993us 4.708us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.43% 5.780us 0.43% 5.780us 5.780us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 937.429us 158.23% 937.429us 937.429us 1 + torch_eager 20.59% 270.182us 99.56% 1.306ms 1.306ms 0.000us 0.00% 616.217us 616.217us 1 + aten::copy_ 7.81% 102.469us 38.72% 508.048us 28.225us 278.876us 47.07% 302.652us 16.814us 18 + aten::mul 12.17% 159.666us 21.13% 277.305us 11.554us 247.965us 41.85% 247.965us 10.332us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 247.965us 41.85% 247.965us 10.332us 24 + aten::clone 1.57% 20.662us 32.21% 422.588us 70.431us 0.000us 0.00% 211.645us 35.274us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 187.869us 31.71% 187.869us 31.312us 6 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 91.007us 15.36% 91.007us 7.584us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 65.600us 11.07% 65.600us 5.467us 12 + aten::sub 2.87% 37.612us 4.70% 61.651us 10.275us 32.896us 5.55% 32.896us 5.483us 6 + aten::add 2.50% 32.870us 4.35% 57.120us 9.520us 32.704us 5.52% 32.704us 5.451us 6 + Activity Buffer Request 12.01% 157.623us 12.01% 157.623us 157.623us 23.776us 4.01% 23.776us 23.776us 1 + aten::empty_strided 2.36% 30.970us 2.36% 30.970us 5.162us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 13.83% 181.533us 13.83% 181.533us 30.256us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.81% 63.053us 6.13% 80.473us 3.353us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 1.33% 17.420us 1.33% 17.420us 0.726us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 17.71% 232.351us 17.71% 232.351us 4.841us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.44% 5.770us 0.44% 5.770us 5.770us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.359ms -Self CUDA time total: 592.506us +Self CPU time total: 1.312ms +Self CUDA time total: 592.441us @@ -4706,107 +4706,55 @@ PROFILE TRACE: torch_eager | cuda_B2_S2048_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 12.69% 276.287us 61.52% 1.340ms 1.340ms 0.000us 0.00% 1.863ms 1.863ms 1 - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.835ms 102.22% 1.835ms 1.835ms 1 - aten::copy_ 5.01% 109.060us 24.98% 544.137us 30.230us 806.007us 44.89% 873.590us 48.533us 18 - aten::mul 7.11% 154.844us 12.06% 262.604us 10.942us 842.615us 46.93% 842.615us 35.109us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 842.615us 46.93% 842.615us 35.109us 24 - aten::clone 1.01% 22.000us 21.12% 459.916us 76.653us 0.000us 0.00% 622.361us 103.727us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 554.778us 30.90% 554.778us 92.463us 6 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 251.229us 13.99% 251.229us 20.936us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 146.939us 8.18% 146.939us 12.245us 12 - aten::sub 1.90% 41.421us 3.00% 65.411us 10.902us 88.573us 4.93% 88.573us 14.762us 6 - Activity Buffer Request 8.49% 184.983us 8.49% 184.983us 184.983us 67.583us 3.76% 67.583us 67.583us 1 - aten::add 1.54% 33.561us 2.59% 56.461us 9.410us 58.366us 3.25% 58.366us 9.728us 6 - aten::empty_strided 1.42% 30.960us 1.42% 30.960us 5.160us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 8.70% 189.543us 8.70% 189.543us 31.591us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.99% 65.113us 3.77% 82.061us 3.419us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.78% 16.948us 0.78% 16.948us 0.706us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 9.88% 215.201us 9.88% 215.201us 4.483us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 38.48% 838.063us 38.48% 838.063us 838.063us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 7.94% 296.330us 77.11% 2.877ms 2.877ms 0.000us 0.00% 1.855ms 1.855ms 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.826ms 102.11% 1.826ms 1.826ms 1 + aten::copy_ 2.78% 103.676us 55.32% 2.064ms 114.670us 804.351us 44.97% 870.879us 48.382us 18 + aten::mul 4.03% 150.284us 7.00% 261.125us 10.880us 838.429us 46.87% 838.429us 34.935us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 838.429us 46.87% 838.429us 34.935us 24 + aten::clone 0.73% 27.418us 53.07% 1.980ms 330.025us 0.000us 0.00% 619.584us 103.264us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 553.056us 30.92% 553.056us 92.176us 6 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 251.295us 14.05% 251.295us 20.941us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 145.983us 8.16% 145.983us 12.165us 12 + aten::sub 1.02% 38.061us 1.68% 62.715us 10.453us 87.583us 4.90% 87.583us 14.597us 6 + Activity Buffer Request 44.74% 1.669ms 44.74% 1.669ms 1.669ms 66.528us 3.72% 66.528us 66.528us 1 + aten::add 0.88% 32.700us 1.50% 55.970us 9.328us 58.400us 3.26% 58.400us 9.733us 6 + aten::empty_strided 0.81% 30.382us 0.81% 30.382us 5.064us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 5.84% 217.963us 5.84% 217.963us 36.327us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.66% 61.939us 2.12% 79.030us 3.293us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.46% 17.091us 0.46% 17.091us 0.712us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 6.21% 231.875us 6.21% 231.875us 4.831us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 22.89% 854.084us 22.89% 854.084us 854.084us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.178ms -Self CUDA time total: 1.796ms +Self CPU time total: 3.731ms +Self CUDA time total: 1.789ms impl wl p50(ms) ok torch_eager cuda_B1_S128_H32_D128_R64 0.22 True -torch_eager cuda_B1_S128_H32_D64_R32 0.23 True -torch_eager cuda_B1_S128_H8_D128_R64 0.23 True +torch_eager cuda_B1_S128_H32_D64_R32 0.22 True +torch_eager cuda_B1_S128_H8_D128_R64 0.22 True torch_eager cuda_B1_S128_H8_D64_R32 0.18 True -torch_eager cuda_B1_S2048_H32_D128_R64 0.23 True -torch_eager cuda_B1_S2048_H32_D64_R32 0.22 True -torch_eager cuda_B1_S2048_H8_D128_R64 0.22 True -torch_eager cuda_B1_S2048_H8_D64_R32 0.22 True -torch_eager cuda_B1_S512_H32_D128_R64 0.22 True -torch_eager cuda_B1_S512_H32_D64_R32 0.22 True -torch_eager cuda_B1_S512_H8_D128_R64 0.23 True -torch_eager cuda_B1_S512_H8_D64_R32 0.23 True +torch_eager cuda_B1_S2048_H32_D128_R64 0.22 True +torch_eager cuda_B1_S2048_H32_D64_R32 0.21 True +torch_eager cuda_B1_S2048_H8_D128_R64 0.21 True +torch_eager cuda_B1_S2048_H8_D64_R32 0.21 True +torch_eager cuda_B1_S512_H32_D128_R64 0.21 True +torch_eager cuda_B1_S512_H32_D64_R32 0.21 True +torch_eager cuda_B1_S512_H8_D128_R64 0.21 True +torch_eager cuda_B1_S512_H8_D64_R32 0.22 True torch_eager cuda_B2_S128_H32_D128_R64 0.22 True torch_eager cuda_B2_S128_H32_D64_R32 0.22 True torch_eager cuda_B2_S128_H8_D128_R64 0.22 True -torch_eager cuda_B2_S128_H8_D64_R32 0.22 True +torch_eager cuda_B2_S128_H8_D64_R32 0.21 True torch_eager cuda_B2_S2048_H32_D128_R64 0.65 True torch_eager cuda_B2_S2048_H32_D64_R32 0.23 True -torch_eager cuda_B2_S2048_H8_D128_R64 0.22 True +torch_eager cuda_B2_S2048_H8_D128_R64 0.21 True torch_eager cuda_B2_S2048_H8_D64_R32 0.22 True -torch_eager cuda_B2_S512_H32_D128_R64 0.22 True -torch_eager cuda_B2_S512_H32_D64_R32 0.23 True +torch_eager cuda_B2_S512_H32_D128_R64 0.21 True +torch_eager cuda_B2_S512_H32_D64_R32 0.22 True torch_eager cuda_B2_S512_H8_D128_R64 0.22 True -torch_eager cuda_B2_S512_H8_D64_R32 0.23 True +torch_eager cuda_B2_S512_H8_D64_R32 0.22 True-▶ UV Install Logs- -Artifacts:
rotary.jsonl