Running activation benchmark on cuda with 9 workloads.
======================================================================
PROFILE TRACE: torch_eager | cuda_T128_D768
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 187.393us 1471.25% 187.393us 187.393us 1
torch_eager 9.10% 197.603us 99.31% 2.157ms 2.157ms 0.000us 0.00% 15.073us 15.073us 1
aten::silu 2.86% 62.203us 85.13% 1.849ms 616.358us 6.561us 51.51% 8.897us 2.966us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.561us 51.51% 6.561us 2.187us 3
aten::mul 1.58% 34.212us 2.55% 55.432us 18.477us 6.176us 48.49% 6.176us 2.059us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.176us 48.49% 6.176us 2.059us 3
Activity Buffer Request 80.18% 1.741ms 80.18% 1.741ms 1.741ms 2.336us 18.34% 2.336us 2.336us 1
aten::slice 2.02% 43.964us 2.53% 55.013us 9.169us 0.000us 0.00% 0.000us 0.000us 6
aten::as_strided 0.51% 11.049us 0.51% 11.049us 1.842us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 3.07% 66.630us 3.07% 66.630us 11.105us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 0.69% 14.920us 0.69% 14.920us 14.920us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.172ms
Self CUDA time total: 12.737us
======================================================================
PROFILE TRACE: torch_eager | cuda_T128_D1024
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 159.648us 1289.36% 159.648us 159.648us 1
torch_eager 6.57% 137.523us 99.70% 2.087ms 2.087ms 0.000us 0.00% 14.526us 14.526us 1
aten::silu 2.02% 42.391us 89.22% 1.868ms 622.711us 6.399us 51.68% 8.543us 2.848us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.399us 51.68% 6.399us 2.133us 3
aten::mul 1.43% 29.882us 2.35% 49.282us 16.427us 5.983us 48.32% 5.983us 1.994us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.983us 48.32% 5.983us 1.994us 3
Activity Buffer Request 85.88% 1.798ms 85.88% 1.798ms 1.798ms 2.144us 17.32% 2.144us 2.144us 1
aten::slice 1.30% 27.292us 1.55% 32.512us 5.419us 0.000us 0.00% 0.000us 0.000us 6
aten::as_strided 0.25% 5.220us 0.25% 5.220us 0.870us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 2.25% 47.061us 2.25% 47.061us 7.843us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 0.30% 6.330us 0.30% 6.330us 6.330us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.094ms
Self CUDA time total: 12.382us
======================================================================
PROFILE TRACE: torch_eager | cuda_T128_D2048
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 155.680us 1172.29% 155.680us 155.680us 1
torch_eager 6.64% 129.562us 99.68% 1.946ms 1.946ms 0.000us 0.00% 15.584us 15.584us 1
aten::silu 2.16% 42.182us 89.10% 1.739ms 579.704us 6.848us 51.57% 9.152us 3.051us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.848us 51.57% 6.848us 2.283us 3
aten::mul 1.46% 28.592us 2.38% 46.553us 15.518us 6.432us 48.43% 6.432us 2.144us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.432us 48.43% 6.432us 2.144us 3
Activity Buffer Request 85.60% 1.671ms 85.60% 1.671ms 1.671ms 2.304us 17.35% 2.304us 2.304us 1
aten::slice 1.31% 25.640us 1.56% 30.540us 5.090us 0.000us 0.00% 0.000us 0.000us 6
aten::as_strided 0.25% 4.900us 0.25% 4.900us 0.817us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 2.26% 44.052us 2.26% 44.052us 7.342us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 0.32% 6.150us 0.32% 6.150us 6.150us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.952ms
Self CUDA time total: 13.280us
======================================================================
PROFILE TRACE: torch_eager | cuda_T256_D768
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 160.289us 1264.91% 160.289us 160.289us 1
torch_eager 6.06% 136.754us 99.75% 2.252ms 2.252ms 0.000us 0.00% 14.880us 14.880us 1
aten::silu 1.87% 42.159us 90.17% 2.036ms 678.503us 6.560us 51.77% 8.768us 2.923us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.560us 51.77% 6.560us 2.187us 3
aten::mul 1.25% 28.231us 2.20% 49.632us 16.544us 6.112us 48.23% 6.112us 2.037us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.112us 48.23% 6.112us 2.037us 3
Activity Buffer Request 79.28% 1.790ms 79.28% 1.790ms 1.790ms 2.208us 17.42% 2.208us 2.208us 1
aten::slice 1.09% 24.671us 1.32% 29.801us 4.967us 0.000us 0.00% 0.000us 0.000us 6
aten::as_strided 0.23% 5.130us 0.23% 5.130us 0.855us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 9.98% 225.208us 9.98% 225.208us 37.535us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 0.25% 5.621us 0.25% 5.621us 5.621us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.257ms
Self CUDA time total: 12.672us
======================================================================
PROFILE TRACE: torch_eager | cuda_T256_D1024
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 159.295us 1196.72% 159.295us 159.295us 1
torch_eager 6.43% 135.135us 99.75% 2.096ms 2.096ms 0.000us 0.00% 15.615us 15.615us 1
aten::silu 2.00% 41.931us 89.60% 1.883ms 627.518us 6.815us 51.20% 9.119us 3.040us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.815us 51.20% 6.815us 2.272us 3
aten::mul 1.42% 29.749us 2.27% 47.691us 15.897us 6.496us 48.80% 6.496us 2.165us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.496us 48.80% 6.496us 2.165us 3
Activity Buffer Request 79.61% 1.673ms 79.61% 1.673ms 1.673ms 2.304us 17.31% 2.304us 2.304us 1
aten::slice 1.22% 25.650us 1.46% 30.630us 5.105us 0.000us 0.00% 0.000us 0.000us 6
aten::as_strided 0.24% 4.980us 0.24% 4.980us 0.830us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 8.84% 185.847us 8.84% 185.847us 30.974us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 0.25% 5.161us 0.25% 5.161us 5.161us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.101ms
Self CUDA time total: 13.311us
======================================================================
PROFILE TRACE: torch_eager | cuda_T256_D2048
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 158.432us 1014.55% 158.432us 158.432us 1
torch_eager 6.38% 140.261us 99.75% 2.192ms 2.192ms 0.000us 0.00% 18.304us 18.304us 1
aten::silu 1.93% 42.492us 89.81% 1.973ms 657.799us 8.000us 51.23% 10.688us 3.563us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 8.000us 51.23% 8.000us 2.667us 3
aten::mul 1.27% 27.872us 2.10% 46.122us 15.374us 7.616us 48.77% 7.616us 2.539us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.616us 48.77% 7.616us 2.539us 3
Activity Buffer Request 80.61% 1.771ms 80.61% 1.771ms 1.771ms 2.688us 17.21% 2.688us 2.688us 1
aten::slice 1.22% 26.832us 1.46% 31.992us 5.332us 0.000us 0.00% 0.000us 0.000us 6
aten::as_strided 0.23% 5.160us 0.23% 5.160us 0.860us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 8.09% 177.845us 8.09% 177.845us 29.641us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 0.25% 5.530us 0.25% 5.530us 5.530us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.197ms
Self CUDA time total: 15.616us
======================================================================
PROFILE TRACE: torch_eager | cuda_T512_D768
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 158.083us 1097.65% 158.083us 158.083us 1
torch_eager 6.35% 128.334us 99.75% 2.015ms 2.015ms 0.000us 0.00% 16.898us 16.898us 1
aten::silu 2.10% 42.419us 89.46% 1.807ms 602.407us 7.394us 51.34% 9.890us 3.297us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.394us 51.34% 7.394us 2.465us 3
aten::mul 1.39% 28.141us 2.40% 48.382us 16.127us 7.008us 48.66% 7.008us 2.336us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.008us 48.66% 7.008us 2.336us 3
Activity Buffer Request 79.49% 1.606ms 79.49% 1.606ms 1.606ms 2.496us 17.33% 2.496us 2.496us 1
aten::slice 1.27% 25.691us 1.54% 31.081us 5.180us 0.000us 0.00% 0.000us 0.000us 6
aten::as_strided 0.27% 5.390us 0.27% 5.390us 0.898us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 8.88% 179.306us 8.88% 179.306us 29.884us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 0.25% 5.100us 0.25% 5.100us 5.100us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.020ms
Self CUDA time total: 14.402us
======================================================================
PROFILE TRACE: torch_eager | cuda_T512_D1024
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 158.658us 1020.18% 158.658us 158.658us 1
torch_eager 5.52% 111.823us 99.73% 2.019ms 2.019ms 0.000us 0.00% 18.240us 18.240us 1
aten::silu 2.12% 42.830us 90.25% 1.827ms 609.110us 7.936us 51.03% 10.624us 3.541us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.936us 51.03% 7.936us 2.645us 3
aten::mul 1.37% 27.772us 2.44% 49.332us 16.444us 7.616us 48.97% 7.616us 2.539us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.616us 48.97% 7.616us 2.539us 3
Activity Buffer Request 80.18% 1.623ms 80.18% 1.623ms 1.623ms 2.688us 17.28% 2.688us 2.688us 1
aten::slice 1.25% 25.302us 1.51% 30.641us 5.107us 0.000us 0.00% 0.000us 0.000us 6
aten::as_strided 0.26% 5.339us 0.26% 5.339us 0.890us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 9.02% 182.624us 9.02% 182.624us 30.437us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 0.27% 5.520us 0.27% 5.520us 5.520us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.025ms
Self CUDA time total: 15.552us
======================================================================
PROFILE TRACE: torch_eager | cuda_T512_D2048
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 164.002us 726.96% 164.002us 164.002us 1
torch_eager 5.39% 111.814us 99.74% 2.071ms 2.071ms 0.000us 0.00% 26.464us 26.464us 1
aten::silu 2.07% 43.010us 90.61% 1.881ms 627.114us 11.616us 51.49% 15.520us 5.173us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 11.616us 51.49% 11.616us 3.872us 3
aten::mul 1.37% 28.451us 2.32% 48.232us 16.077us 10.944us 48.51% 10.944us 3.648us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.944us 48.51% 10.944us 3.648us 3
Activity Buffer Request 80.76% 1.677ms 80.76% 1.677ms 1.677ms 3.904us 17.30% 3.904us 3.904us 1
aten::slice 1.14% 23.769us 1.41% 29.310us 4.885us 0.000us 0.00% 0.000us 0.000us 6
aten::as_strided 0.27% 5.541us 0.27% 5.541us 0.923us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 8.74% 181.415us 8.74% 181.415us 30.236us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 0.26% 5.500us 0.26% 5.500us 5.500us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.076ms
Self CUDA time total: 22.560us
impl wl p50(ms) ok
torch_eager cuda_T128_D1024 0.06 True
torch_eager cuda_T128_D2048 0.05 True
torch_eager cuda_T128_D768 0.04 True
torch_eager cuda_T256_D1024 0.05 True
torch_eager cuda_T256_D2048 0.05 True
torch_eager cuda_T256_D768 0.05 True
torch_eager cuda_T512_D1024 0.05 True
torch_eager cuda_T512_D2048 0.05 True
torch_eager cuda_T512_D768 0.05 True