Running activation benchmark on cuda with 9 workloads.
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_T128_D768
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 208.254us 1623.18% 208.254us 208.254us 1
+ torch_eager 11.63% 222.938us 99.53% 1.908ms 1.908ms 0.000us 0.00% 15.165us 15.165us 1
+ aten::silu 3.35% 64.173us 81.27% 1.558ms 519.434us 6.558us 51.11% 8.893us 2.964us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.558us 51.11% 6.558us 2.186us 3
+ aten::mul 2.01% 38.591us 3.22% 61.711us 20.570us 6.272us 48.89% 6.272us 2.091us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.272us 48.89% 6.272us 2.091us 3
+ Activity Buffer Request 75.51% 1.448ms 75.51% 1.448ms 1.448ms 2.335us 18.20% 2.335us 2.335us 1
+ aten::slice 2.75% 52.771us 3.41% 65.422us 10.904us 0.000us 0.00% 0.000us 0.000us 6
+ aten::as_strided 0.66% 12.651us 0.66% 12.651us 2.108us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 3.62% 69.391us 3.62% 69.391us 11.565us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.47% 9.050us 0.47% 9.050us 9.050us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.917ms
+Self CUDA time total: 12.830us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_T128_D1024
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 151.071us 1219.79% 151.071us 151.071us 1
+ torch_eager 7.39% 126.424us 99.65% 1.704ms 1.704ms 0.000us 0.00% 14.561us 14.561us 1
+ aten::silu 2.37% 40.550us 87.76% 1.501ms 500.240us 6.400us 51.68% 8.576us 2.859us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.400us 51.68% 6.400us 2.133us 3
+ aten::mul 1.49% 25.470us 2.58% 44.190us 14.730us 5.985us 48.32% 5.985us 1.995us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.985us 48.32% 5.985us 1.995us 3
+ Activity Buffer Request 83.86% 1.434ms 83.86% 1.434ms 1.434ms 2.176us 17.57% 2.176us 2.176us 1
+ aten::slice 1.55% 26.493us 1.91% 32.623us 5.437us 0.000us 0.00% 0.000us 0.000us 6
+ aten::as_strided 0.36% 6.130us 0.36% 6.130us 1.022us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 2.63% 44.922us 2.63% 44.922us 7.487us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.35% 5.980us 0.35% 5.980us 5.980us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.710ms
+Self CUDA time total: 12.385us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_T128_D2048
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 154.943us 1178.09% 154.943us 154.943us 1
+ torch_eager 7.25% 123.104us 99.64% 1.692ms 1.692ms 0.000us 0.00% 15.424us 15.424us 1
+ aten::silu 2.33% 39.532us 87.79% 1.491ms 496.854us 6.784us 51.58% 9.056us 3.019us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.784us 51.58% 6.784us 2.261us 3
+ aten::mul 1.58% 26.910us 2.71% 46.021us 15.340us 6.368us 48.42% 6.368us 2.123us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.368us 48.42% 6.368us 2.123us 3
+ Activity Buffer Request 83.90% 1.424ms 83.90% 1.424ms 1.424ms 2.272us 17.27% 2.272us 2.272us 1
+ aten::slice 1.53% 26.021us 1.89% 32.121us 5.353us 0.000us 0.00% 0.000us 0.000us 6
+ aten::as_strided 0.36% 6.100us 0.36% 6.100us 1.017us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 2.69% 45.642us 2.69% 45.642us 7.607us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.36% 6.080us 0.36% 6.080us 6.080us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.698ms
+Self CUDA time total: 13.152us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_T256_D768
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 180.768us 1415.79% 180.768us 180.768us 1
+ torch_eager 7.93% 123.526us 99.68% 1.554ms 1.554ms 0.000us 0.00% 14.976us 14.976us 1
+ aten::silu 3.24% 50.441us 85.53% 1.333ms 444.348us 6.592us 51.63% 8.800us 2.933us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.592us 51.63% 6.592us 2.197us 3
+ aten::mul 1.75% 27.260us 4.09% 63.791us 21.264us 6.176us 48.37% 6.176us 2.059us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.176us 48.37% 6.176us 2.059us 3
+ Activity Buffer Request 67.46% 1.051ms 67.46% 1.051ms 1.051ms 2.208us 17.29% 2.208us 2.208us 1
+ aten::slice 1.70% 26.549us 2.13% 33.261us 5.543us 0.000us 0.00% 0.000us 0.000us 6
+ aten::as_strided 0.43% 6.712us 0.43% 6.712us 1.119us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 17.18% 267.779us 17.18% 267.779us 44.630us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.32% 4.940us 0.32% 4.940us 4.940us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.559ms
+Self CUDA time total: 12.768us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_T256_D1024
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 150.816us 1138.41% 150.816us 150.816us 1
+ torch_eager 6.24% 117.054us 99.74% 1.872ms 1.872ms 0.000us 0.00% 15.520us 15.520us 1
+ aten::silu 2.12% 39.802us 89.47% 1.679ms 559.729us 6.784us 51.21% 9.056us 3.019us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.784us 51.21% 6.784us 2.261us 3
+ aten::mul 1.34% 25.111us 2.35% 44.062us 14.687us 6.464us 48.79% 6.464us 2.155us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.464us 48.79% 6.464us 2.155us 3
+ Activity Buffer Request 75.90% 1.425ms 75.90% 1.425ms 1.425ms 2.272us 17.15% 2.272us 2.272us 1
+ aten::slice 1.36% 25.472us 1.68% 31.591us 5.265us 0.000us 0.00% 0.000us 0.000us 6
+ aten::as_strided 0.33% 6.119us 0.33% 6.119us 1.020us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 12.46% 233.778us 12.46% 233.778us 38.963us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.26% 4.950us 0.26% 4.950us 4.950us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.877ms
+Self CUDA time total: 13.248us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_T256_D2048
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 143.615us 923.45% 143.615us 143.615us 1
+ torch_eager 17.00% 110.812us 99.16% 646.262us 646.262us 0.000us 0.00% 18.240us 18.240us 1
+ aten::silu 6.35% 41.393us 70.99% 462.667us 154.222us 7.936us 51.03% 10.624us 3.541us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.936us 51.03% 7.936us 2.645us 3
+ aten::mul 3.56% 23.221us 6.51% 42.412us 14.137us 7.616us 48.97% 7.616us 2.539us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.616us 48.97% 7.616us 2.539us 3
+ Activity Buffer Request 32.67% 212.907us 32.67% 212.907us 212.907us 2.688us 17.28% 2.688us 2.688us 1
+ aten::slice 3.77% 24.551us 4.66% 30.371us 5.062us 0.000us 0.00% 0.000us 0.000us 6
+ aten::as_strided 0.89% 5.820us 0.89% 5.820us 0.970us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 34.91% 227.558us 34.91% 227.558us 37.926us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.84% 5.490us 0.84% 5.490us 5.490us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 651.752us
+Self CUDA time total: 15.552us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_T512_D768
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 155.197us 1080.16% 155.197us 155.197us 1
+ torch_eager 6.30% 118.195us 99.70% 1.872ms 1.872ms 0.000us 0.00% 16.864us 16.864us 1
+ aten::silu 2.16% 40.640us 89.31% 1.677ms 558.889us 7.360us 51.22% 9.856us 3.285us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.360us 51.22% 7.360us 2.453us 3
+ aten::mul 1.39% 26.190us 2.47% 46.331us 15.444us 7.008us 48.78% 7.008us 2.336us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.008us 48.78% 7.008us 2.336us 3
+ Activity Buffer Request 76.28% 1.432ms 76.28% 1.432ms 1.432ms 2.496us 17.37% 2.496us 2.496us 1
+ aten::slice 1.31% 24.671us 1.64% 30.721us 5.120us 0.000us 0.00% 0.000us 0.000us 6
+ aten::as_strided 0.32% 6.050us 0.32% 6.050us 1.008us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 11.93% 224.049us 11.93% 224.049us 37.341us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.30% 5.540us 0.30% 5.540us 5.540us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.877ms
+Self CUDA time total: 14.368us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_T512_D1024
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 144.252us 927.61% 144.252us 144.252us 1
+ torch_eager 18.42% 116.554us 99.16% 627.471us 627.471us 0.000us 0.00% 18.239us 18.239us 1
+ aten::silu 6.52% 41.251us 69.31% 438.595us 146.198us 7.968us 51.24% 10.656us 3.552us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.968us 51.24% 7.968us 2.656us 3
+ aten::mul 3.66% 23.182us 6.58% 41.632us 13.877us 7.583us 48.76% 7.583us 2.528us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.583us 48.76% 7.583us 2.528us 3
+ Activity Buffer Request 30.96% 195.937us 30.96% 195.937us 195.937us 2.688us 17.29% 2.688us 2.688us 1
+ aten::slice 3.89% 24.640us 4.85% 30.690us 5.115us 0.000us 0.00% 0.000us 0.000us 6
+ aten::as_strided 0.96% 6.050us 0.96% 6.050us 1.008us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 34.74% 219.857us 34.74% 219.857us 36.643us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.84% 5.310us 0.84% 5.310us 5.310us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 632.781us
+Self CUDA time total: 15.551us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_T512_D2048
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 150.463us 665.09% 150.463us 150.463us 1
+ torch_eager 5.93% 109.544us 99.69% 1.842ms 1.842ms 0.000us 0.00% 26.527us 26.527us 1
+ aten::silu 2.24% 41.413us 89.69% 1.657ms 552.422us 11.584us 51.20% 15.488us 5.163us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 11.584us 51.20% 11.584us 3.861us 3
+ aten::mul 1.32% 24.310us 2.35% 43.432us 14.477us 11.039us 48.80% 11.039us 3.680us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 11.039us 48.80% 11.039us 3.680us 3
+ Activity Buffer Request 76.49% 1.413ms 76.49% 1.413ms 1.413ms 3.904us 17.26% 3.904us 3.904us 1
+ aten::slice 1.39% 25.640us 1.72% 31.740us 5.290us 0.000us 0.00% 0.000us 0.000us 6
+ aten::as_strided 0.33% 6.100us 0.33% 6.100us 1.017us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 12.00% 221.728us 12.00% 221.728us 36.955us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.31% 5.690us 0.31% 5.690us 5.690us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.848ms
+Self CUDA time total: 22.623us
+
+
+impl wl p50(ms) ok
+torch_eager cuda_T128_D1024 0.05 True
+torch_eager cuda_T128_D2048 0.05 True
+torch_eager cuda_T128_D768 0.04 True
+torch_eager cuda_T256_D1024 0.05 True
+torch_eager cuda_T256_D2048 0.05 True
+torch_eager cuda_T256_D768 0.05 True
+torch_eager cuda_T512_D1024 0.05 True
+torch_eager cuda_T512_D2048 0.05 True
+torch_eager cuda_T512_D768 0.05 True
+