Running rotary benchmark on cuda with 24 workloads.
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B1_S128_H8_D64_R32
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.099ms 1229.41% 1.099ms 1.099ms 1
+ torch_eager 14.68% 402.893us 99.74% 2.737ms 2.737ms 0.000us 0.00% 90.654us 90.654us 1
+ aten::mul 6.18% 169.712us 10.63% 291.789us 12.158us 46.975us 52.54% 46.975us 1.957us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 46.975us 52.54% 46.975us 1.957us 24
+ aten::copy_ 5.12% 140.498us 62.48% 1.714ms 95.244us 29.151us 32.61% 30.399us 1.689us 18
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.400us 25.05% 22.400us 1.867us 12
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.280us 14.85% 13.280us 1.107us 12
+ aten::clone 1.37% 37.603us 60.57% 1.662ms 277.027us 0.000us 0.00% 7.999us 1.333us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.751us 7.55% 6.751us 1.125us 6
+ aten::sub 1.57% 43.112us 2.52% 69.272us 11.545us 6.688us 7.48% 6.688us 1.115us 6
+ aten::add 1.32% 36.261us 2.18% 59.731us 9.955us 6.592us 7.37% 6.592us 1.099us 6
+ Activity Buffer Request 52.27% 1.434ms 52.27% 1.434ms 1.434ms 1.248us 1.40% 1.248us 1.248us 1
+ aten::empty_strided 2.02% 55.541us 2.02% 55.541us 9.257us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 2.66% 72.862us 2.66% 72.862us 12.144us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 3.02% 82.803us 3.84% 105.504us 4.396us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.83% 22.701us 0.83% 22.701us 0.946us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 8.69% 238.340us 8.69% 238.340us 4.965us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.26% 7.250us 0.26% 7.250us 7.250us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 2.744ms
+Self CUDA time total: 89.406us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B1_S128_H8_D128_R64
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.001ms 1104.88% 1.001ms 1.001ms 1
+ torch_eager 13.31% 340.683us 99.79% 2.555ms 2.555ms 0.000us 0.00% 91.680us 91.680us 1
+ aten::mul 6.04% 154.674us 10.48% 268.377us 11.182us 47.810us 52.79% 47.810us 1.992us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 47.810us 52.79% 47.810us 1.992us 24
+ aten::copy_ 4.35% 111.424us 65.16% 1.668ms 92.682us 29.407us 32.47% 30.527us 1.696us 18
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.559us 24.91% 22.559us 1.880us 12
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.343us 14.73% 13.343us 1.112us 12
+ aten::clone 1.08% 27.742us 62.03% 1.588ms 264.676us 0.000us 0.00% 7.968us 1.328us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.848us 7.56% 6.848us 1.141us 6
+ aten::sub 1.52% 38.791us 2.50% 64.042us 10.674us 6.720us 7.42% 6.720us 1.120us 6
+ aten::add 1.27% 32.413us 2.18% 55.903us 9.317us 6.623us 7.31% 6.623us 1.104us 6
+ Activity Buffer Request 56.03% 1.434ms 56.03% 1.434ms 1.434ms 1.120us 1.24% 1.120us 1.120us 1
+ aten::empty_strided 1.42% 36.451us 1.42% 36.451us 6.075us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 2.10% 53.872us 2.10% 53.872us 8.979us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 2.86% 73.182us 3.65% 93.342us 3.889us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.79% 20.160us 0.79% 20.160us 0.840us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 9.02% 231.028us 9.02% 231.028us 4.813us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.21% 5.420us 0.21% 5.420us 5.420us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 2.560ms
+Self CUDA time total: 90.560us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B1_S128_H32_D64_R32
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 944.341us 1003.42% 944.341us 944.341us 1
+ torch_eager 12.66% 316.554us 99.80% 2.495ms 2.495ms 0.000us 0.00% 95.424us 95.424us 1
+ aten::mul 6.01% 150.161us 10.40% 259.987us 10.833us 48.863us 51.92% 48.863us 2.036us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 48.863us 51.92% 48.863us 2.036us 24
+ aten::copy_ 4.06% 101.511us 66.21% 1.655ms 91.941us 30.785us 32.71% 32.097us 1.783us 18
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 23.009us 24.45% 23.009us 1.917us 12
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 14.464us 15.37% 14.464us 1.205us 12
+ aten::clone 1.08% 26.971us 63.11% 1.577ms 262.904us 0.000us 0.00% 9.088us 1.515us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.776us 8.26% 7.776us 1.296us 6
+ aten::add 1.43% 35.631us 2.33% 58.151us 9.692us 7.233us 7.69% 7.233us 1.205us 6
+ aten::sub 1.42% 35.432us 2.34% 58.413us 9.736us 7.231us 7.68% 7.231us 1.205us 6
+ Activity Buffer Request 57.41% 1.435ms 57.41% 1.435ms 1.435ms 1.312us 1.39% 1.312us 1.312us 1
+ aten::empty_strided 1.23% 30.860us 1.23% 30.860us 5.143us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 2.03% 50.692us 2.03% 50.692us 8.449us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 2.76% 69.107us 3.55% 88.725us 3.697us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.78% 19.618us 0.78% 19.618us 0.817us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 8.92% 222.961us 8.92% 222.961us 4.645us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.20% 5.071us 0.20% 5.071us 5.071us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 2.500ms
+Self CUDA time total: 94.112us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B1_S128_H32_D128_R64
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 949.272us 934.99% 949.272us 949.272us 1
+ torch_eager 11.74% 319.184us 99.83% 2.715ms 2.715ms 0.000us 0.00% 102.839us 102.839us 1
+ aten::mul 5.42% 147.290us 9.69% 263.662us 10.986us 53.022us 52.22% 53.022us 2.209us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 53.022us 52.22% 53.022us 2.209us 24
+ aten::copy_ 3.75% 101.924us 68.58% 1.865ms 103.635us 32.444us 31.96% 33.755us 1.875us 18
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.637us 24.27% 24.637us 2.053us 12
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.062us 15.82% 16.062us 1.339us 12
+ aten::clone 1.13% 30.729us 66.03% 1.796ms 299.314us 0.000us 0.00% 9.118us 1.520us 6
+ aten::add 1.18% 32.140us 2.02% 54.851us 9.142us 8.032us 7.91% 8.032us 1.339us 6
+ aten::sub 1.29% 35.030us 2.16% 58.621us 9.770us 8.030us 7.91% 8.030us 1.338us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.807us 7.69% 7.807us 1.301us 6
+ Activity Buffer Request 53.21% 1.447ms 53.21% 1.447ms 1.447ms 1.311us 1.29% 1.311us 1.311us 1
+ aten::empty_strided 1.17% 31.801us 1.17% 31.801us 5.300us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 9.34% 254.009us 9.34% 254.009us 42.335us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 2.60% 70.842us 3.35% 90.984us 3.791us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.74% 20.142us 0.74% 20.142us 0.839us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 8.27% 224.985us 8.27% 224.985us 4.687us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.17% 4.671us 0.17% 4.671us 4.671us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 2.720ms
+Self CUDA time total: 101.528us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B1_S512_H8_D64_R32
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 944.887us 1005.38% 944.887us 944.887us 1
+ torch_eager 11.86% 320.838us 99.82% 2.700ms 2.700ms 0.000us 0.00% 95.295us 95.295us 1
+ aten::mul 5.37% 145.335us 9.42% 254.837us 10.618us 49.024us 52.16% 49.024us 2.043us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 49.024us 52.16% 49.024us 2.043us 24
+ aten::copy_ 3.87% 104.672us 68.80% 1.861ms 103.396us 30.783us 32.75% 32.095us 1.783us 18
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.912us 24.38% 22.912us 1.909us 12
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 14.176us 15.08% 14.176us 1.181us 12
+ aten::clone 1.07% 28.861us 66.14% 1.789ms 298.231us 0.000us 0.00% 9.183us 1.530us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.871us 8.37% 7.871us 1.312us 6
+ aten::sub 1.26% 33.972us 2.12% 57.464us 9.577us 7.103us 7.56% 7.103us 1.184us 6
+ aten::add 1.16% 31.253us 1.99% 53.964us 8.994us 7.073us 7.53% 7.073us 1.179us 6
+ Activity Buffer Request 53.80% 1.456ms 53.80% 1.456ms 1.456ms 1.312us 1.40% 1.312us 1.312us 1
+ aten::empty_strided 1.17% 31.633us 1.17% 31.633us 5.272us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 8.82% 238.648us 8.82% 238.648us 39.775us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 2.67% 72.119us 3.38% 91.532us 3.814us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.72% 19.413us 0.72% 19.413us 0.809us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 8.06% 217.970us 8.06% 217.970us 4.541us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.18% 4.990us 0.18% 4.990us 4.990us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 2.705ms
+Self CUDA time total: 93.983us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B1_S512_H8_D128_R64
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 912.250us 902.45% 912.250us 912.250us 1
+ torch_eager 10.84% 287.380us 99.80% 2.646ms 2.646ms 0.000us 0.00% 102.398us 102.398us 1
+ aten::mul 5.43% 143.901us 9.61% 254.716us 10.613us 52.767us 52.20% 52.767us 2.199us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 52.767us 52.20% 52.767us 2.199us 24
+ aten::copy_ 3.82% 101.373us 69.76% 1.849ms 102.733us 32.416us 32.07% 33.728us 1.874us 18
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.608us 24.34% 24.608us 2.051us 12
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 15.903us 15.73% 15.903us 1.325us 12
+ aten::clone 0.89% 23.520us 66.94% 1.774ms 295.745us 0.000us 0.00% 9.120us 1.520us 6
+ aten::add 1.25% 33.223us 2.12% 56.323us 9.387us 7.968us 7.88% 7.968us 1.328us 6
+ aten::sub 1.34% 35.391us 2.21% 58.453us 9.742us 7.935us 7.85% 7.935us 1.322us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.808us 7.72% 7.808us 1.301us 6
+ Activity Buffer Request 54.59% 1.447ms 54.59% 1.447ms 1.447ms 1.312us 1.30% 1.312us 1.312us 1
+ aten::empty_strided 1.14% 30.292us 1.14% 30.292us 5.049us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 9.04% 239.538us 9.04% 239.538us 39.923us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 2.52% 66.730us 3.23% 85.664us 3.569us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.71% 18.934us 0.71% 18.934us 0.789us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 8.23% 218.091us 8.23% 218.091us 4.544us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.20% 5.360us 0.20% 5.360us 5.360us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 2.651ms
+Self CUDA time total: 101.086us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B1_S512_H32_D64_R32
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 920.762us 761.21% 920.762us 920.762us 1
+ torch_eager 10.74% 283.666us 99.80% 2.636ms 2.636ms 0.000us 0.00% 122.785us 122.785us 1
+ aten::mul 5.61% 148.102us 9.80% 258.888us 10.787us 62.177us 51.40% 62.177us 2.591us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 62.177us 51.40% 62.177us 2.591us 24
+ aten::copy_ 4.01% 105.842us 69.73% 1.842ms 102.324us 39.520us 32.67% 41.344us 2.297us 18
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.896us 23.89% 28.896us 2.408us 12
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.264us 15.93% 19.264us 1.605us 12
+ aten::clone 0.81% 21.319us 66.69% 1.761ms 293.582us 0.000us 0.00% 12.448us 2.075us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.624us 8.78% 10.624us 1.771us 6
+ aten::add 1.23% 32.431us 2.08% 54.912us 9.152us 9.696us 8.02% 9.696us 1.616us 6
+ aten::sub 1.34% 35.510us 2.24% 59.050us 9.842us 9.568us 7.91% 9.568us 1.595us 6
+ Activity Buffer Request 54.62% 1.443ms 54.62% 1.443ms 1.443ms 1.824us 1.51% 1.824us 1.824us 1
+ aten::empty_strided 1.13% 29.871us 1.13% 29.871us 4.979us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 8.76% 231.329us 8.76% 231.329us 38.555us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 2.53% 66.872us 3.28% 86.661us 3.611us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.75% 19.789us 0.75% 19.789us 0.825us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 8.28% 218.631us 8.28% 218.631us 4.555us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.20% 5.190us 0.20% 5.190us 5.190us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 2.641ms
+Self CUDA time total: 120.961us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B1_S512_H32_D128_R64
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 939.640us 544.89% 939.640us 939.640us 1
+ torch_eager 12.08% 323.576us 99.81% 2.674ms 2.674ms 0.000us 0.00% 175.325us 175.325us 1
+ aten::mul 5.49% 147.107us 9.55% 255.901us 10.663us 89.504us 51.90% 89.504us 3.729us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 89.504us 51.90% 89.504us 3.729us 24
+ aten::copy_ 3.83% 102.724us 68.48% 1.835ms 101.930us 57.918us 33.59% 60.798us 3.378us 18
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.734us 23.62% 40.734us 3.395us 12
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 25.023us 14.51% 25.023us 2.085us 12
+ aten::clone 1.06% 28.292us 65.67% 1.760ms 293.252us 0.000us 0.00% 20.064us 3.344us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.184us 9.96% 17.184us 2.864us 6
+ aten::add 1.22% 32.572us 2.05% 54.872us 9.145us 12.512us 7.26% 12.512us 2.085us 6
+ aten::sub 1.28% 34.403us 2.15% 57.513us 9.586us 12.511us 7.26% 12.511us 2.085us 6
+ Activity Buffer Request 53.69% 1.438ms 53.69% 1.438ms 1.438ms 2.880us 1.67% 2.880us 2.880us 1
+ aten::empty_strided 1.12% 30.100us 1.12% 30.100us 5.017us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 8.57% 229.599us 8.57% 229.599us 38.267us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 2.59% 69.394us 3.32% 89.005us 3.709us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.73% 19.611us 0.73% 19.611us 0.817us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 8.14% 218.155us 8.14% 218.155us 4.545us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.19% 5.191us 0.19% 5.191us 5.191us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 2.679ms
+Self CUDA time total: 172.445us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B1_S2048_H8_D64_R32
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 910.515us 751.54% 910.515us 910.515us 1
+ torch_eager 19.90% 282.972us 99.65% 1.417ms 1.417ms 0.000us 0.00% 123.009us 123.009us 1
+ aten::mul 10.25% 145.781us 17.92% 254.851us 10.619us 62.146us 51.30% 62.146us 2.589us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 62.146us 51.30% 62.146us 2.589us 24
+ aten::copy_ 7.07% 100.509us 44.20% 628.439us 34.913us 39.743us 32.80% 41.599us 2.311us 18
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 29.055us 23.98% 29.055us 2.421us 12
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.264us 15.90% 19.264us 1.605us 12
+ aten::clone 1.59% 22.604us 38.82% 551.881us 91.980us 0.000us 0.00% 12.544us 2.091us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.688us 8.82% 10.688us 1.781us 6
+ aten::add 2.23% 31.661us 3.79% 53.922us 8.987us 9.633us 7.95% 9.633us 1.606us 6
+ aten::sub 2.49% 35.352us 4.13% 58.732us 9.789us 9.631us 7.95% 9.631us 1.605us 6
+ Activity Buffer Request 16.91% 240.489us 16.91% 240.489us 240.489us 1.856us 1.53% 1.856us 1.856us 1
+ aten::empty_strided 2.06% 29.230us 2.06% 29.230us 4.872us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 15.93% 226.498us 15.93% 226.498us 37.750us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 4.75% 67.473us 6.05% 86.070us 3.586us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 1.31% 18.597us 1.31% 18.597us 0.775us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 15.17% 215.654us 15.17% 215.654us 4.493us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.35% 4.980us 0.35% 4.980us 4.980us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.422ms
+Self CUDA time total: 121.153us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B1_S2048_H8_D128_R64
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 918.443us 533.10% 918.443us 918.443us 1
+ torch_eager 20.03% 279.953us 99.65% 1.393ms 1.393ms 0.000us 0.00% 175.133us 175.133us 1
+ aten::mul 10.59% 147.997us 18.47% 258.229us 10.760us 89.472us 51.93% 89.472us 3.728us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 89.472us 51.93% 89.472us 3.728us 24
+ aten::copy_ 7.43% 103.844us 43.15% 603.182us 33.510us 57.887us 33.60% 60.735us 3.374us 18
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.831us 23.70% 40.831us 3.403us 12
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 24.926us 14.47% 24.926us 2.077us 12
+ aten::clone 1.45% 20.289us 37.34% 521.998us 87.000us 0.000us 0.00% 19.904us 3.317us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.056us 9.90% 17.056us 2.843us 6
+ aten::add 2.21% 30.953us 3.79% 53.002us 8.834us 12.480us 7.24% 12.480us 2.080us 6
+ aten::sub 2.40% 33.491us 4.09% 57.142us 9.524us 12.446us 7.22% 12.446us 2.074us 6
+ Activity Buffer Request 14.98% 209.468us 14.98% 209.468us 209.468us 2.848us 1.65% 2.848us 2.848us 1
+ aten::empty_strided 2.03% 28.380us 2.03% 28.380us 4.730us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 16.36% 228.728us 16.36% 228.728us 38.121us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 5.25% 73.370us 6.64% 92.881us 3.870us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 1.40% 19.511us 1.40% 19.511us 0.813us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 15.53% 217.074us 15.53% 217.074us 4.522us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.35% 4.950us 0.35% 4.950us 4.950us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.398ms
+Self CUDA time total: 172.285us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B1_S2048_H32_D64_R32
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 945.822us 332.63% 945.822us 945.822us 1
+ torch_eager 11.69% 314.391us 99.81% 2.685ms 2.685ms 0.000us 0.00% 302.941us 302.941us 1
+ aten::mul 5.41% 145.454us 9.45% 254.127us 10.589us 133.310us 46.88% 133.310us 5.555us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 133.310us 46.88% 133.310us 5.555us 24
+ aten::copy_ 4.13% 111.027us 68.93% 1.854ms 103.002us 109.662us 38.57% 128.254us 7.125us 18
+ aten::clone 1.07% 28.661us 65.93% 1.773ms 295.570us 0.000us 0.00% 70.912us 11.819us 6
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 57.342us 20.17% 57.342us 4.779us 12
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 52.320us 18.40% 52.320us 8.720us 6
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 41.377us 14.55% 41.377us 3.448us 12
+ aten::sub 1.27% 34.091us 2.15% 57.911us 9.652us 20.704us 7.28% 20.704us 3.451us 6
+ aten::add 1.22% 32.950us 2.07% 55.610us 9.268us 20.673us 7.27% 20.673us 3.446us 6
+ Activity Buffer Request 54.12% 1.456ms 54.12% 1.456ms 1.456ms 18.592us 6.54% 18.592us 18.592us 1
+ aten::empty_strided 1.18% 31.741us 1.18% 31.741us 5.290us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 8.32% 223.797us 8.32% 223.797us 37.300us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 2.55% 68.485us 3.28% 88.267us 3.678us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.74% 19.782us 0.74% 19.782us 0.824us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 8.13% 218.664us 8.13% 218.664us 4.555us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.19% 5.100us 0.19% 5.100us 5.100us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 2.690ms
+Self CUDA time total: 284.349us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B1_S2048_H32_D128_R64
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 938.033us 165.64% 938.033us 938.033us 1
+ torch_eager 20.89% 291.484us 99.63% 1.390ms 1.390ms 0.000us 0.00% 590.004us 590.004us 1
+ aten::copy_ 7.34% 102.395us 41.53% 579.320us 32.184us 273.370us 48.27% 297.081us 16.504us 18
+ aten::mul 10.73% 149.623us 18.75% 261.638us 10.902us 225.916us 39.89% 225.916us 9.413us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 225.916us 39.89% 225.916us 9.413us 24
+ aten::clone 1.46% 20.369us 35.71% 498.147us 83.025us 0.000us 0.00% 206.459us 34.410us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 182.748us 32.27% 182.748us 30.458us 6
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 90.622us 16.00% 90.622us 7.552us 12
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 67.007us 11.83% 67.007us 5.584us 12
+ aten::sub 2.52% 35.222us 4.78% 66.682us 11.114us 34.272us 6.05% 34.272us 5.712us 6
+ aten::add 2.30% 32.121us 4.02% 56.063us 9.344us 32.735us 5.78% 32.735us 5.456us 6
+ Activity Buffer Request 14.16% 197.506us 14.16% 197.506us 197.506us 23.711us 4.19% 23.711us 23.711us 1
+ aten::empty_strided 2.10% 29.332us 2.10% 29.332us 4.889us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 15.61% 217.828us 15.61% 217.828us 36.305us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 4.72% 65.792us 6.10% 85.041us 3.543us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 1.38% 19.249us 1.38% 19.249us 0.802us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 16.42% 229.008us 16.42% 229.008us 4.771us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.37% 5.150us 0.37% 5.150us 5.150us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.395ms
+Self CUDA time total: 566.293us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_S128_H8_D64_R32
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 912.211us 984.01% 912.211us 912.211us 1
+ torch_eager 20.74% 286.708us 99.62% 1.377ms 1.377ms 0.000us 0.00% 93.855us 93.855us 1
+ aten::mul 10.48% 144.890us 18.31% 253.080us 10.545us 49.856us 53.78% 49.856us 2.077us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 49.856us 53.78% 49.856us 2.077us 24
+ aten::copy_ 7.33% 101.333us 42.51% 587.542us 32.641us 29.407us 31.72% 30.559us 1.698us 18
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.623us 24.40% 22.623us 1.885us 12
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.440us 14.50% 13.440us 1.120us 12
+ aten::clone 1.54% 21.251us 36.76% 508.068us 84.678us 0.000us 0.00% 7.936us 1.323us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.784us 7.32% 6.784us 1.131us 6
+ aten::sub 2.53% 34.908us 4.26% 58.910us 9.818us 6.720us 7.25% 6.720us 1.120us 6
+ aten::add 2.34% 32.341us 3.97% 54.832us 9.139us 6.720us 7.25% 6.720us 1.120us 6
+ Activity Buffer Request 14.89% 205.787us 14.89% 205.787us 205.787us 1.152us 1.24% 1.152us 1.152us 1
+ aten::empty_strided 2.09% 28.901us 2.09% 28.901us 4.817us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 15.89% 219.618us 15.89% 219.618us 36.603us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 4.84% 66.885us 6.21% 85.845us 3.577us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 1.37% 18.960us 1.37% 18.960us 0.790us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 15.59% 215.487us 15.59% 215.487us 4.489us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.38% 5.210us 0.38% 5.210us 5.210us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.382ms
+Self CUDA time total: 92.703us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_S128_H8_D128_R64
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 938.901us 973.14% 938.901us 938.901us 1
+ torch_eager 11.77% 313.313us 99.82% 2.656ms 2.656ms 0.000us 0.00% 97.825us 97.825us 1
+ aten::mul 5.60% 148.957us 9.78% 260.340us 10.847us 51.266us 53.14% 51.266us 2.136us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 51.266us 53.14% 51.266us 2.136us 24
+ aten::copy_ 3.87% 103.023us 68.29% 1.817ms 100.957us 30.976us 32.11% 32.319us 1.795us 18
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 23.072us 23.91% 23.072us 1.923us 12
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 14.240us 14.76% 14.240us 1.187us 12
+ aten::clone 1.07% 28.429us 65.69% 1.748ms 291.327us 0.000us 0.00% 9.247us 1.541us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.904us 8.19% 7.904us 1.317us 6
+ aten::add 1.24% 33.110us 2.10% 56.011us 9.335us 7.137us 7.40% 7.137us 1.189us 6
+ aten::sub 1.37% 36.490us 2.25% 59.790us 9.965us 7.103us 7.36% 7.103us 1.184us 6
+ Activity Buffer Request 53.84% 1.433ms 53.84% 1.433ms 1.433ms 1.343us 1.39% 1.343us 1.343us 1
+ aten::empty_strided 1.19% 31.751us 1.19% 31.751us 5.292us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 8.25% 219.470us 8.25% 219.470us 36.578us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 2.63% 69.934us 3.35% 89.134us 3.714us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.72% 19.200us 0.72% 19.200us 0.800us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 8.25% 219.576us 8.25% 219.576us 4.574us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.18% 4.910us 0.18% 4.910us 4.910us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 2.661ms
+Self CUDA time total: 96.482us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_S128_H32_D64_R32
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 932.446us 897.69% 932.446us 932.446us 1
+ torch_eager 11.60% 307.685us 99.81% 2.647ms 2.647ms 0.000us 0.00% 105.184us 105.184us 1
+ aten::mul 5.51% 146.123us 9.64% 255.679us 10.653us 55.362us 53.30% 55.362us 2.307us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 55.362us 53.30% 55.362us 2.307us 24
+ aten::copy_ 3.78% 100.194us 68.64% 1.821ms 101.144us 32.478us 31.27% 33.790us 1.877us 18
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.703us 23.78% 24.703us 2.059us 12
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.032us 15.43% 16.032us 1.336us 12
+ aten::clone 1.02% 27.179us 65.92% 1.748ms 291.378us 0.000us 0.00% 9.087us 1.515us 6
+ aten::add 1.19% 31.489us 2.03% 53.840us 8.973us 8.064us 7.76% 8.064us 1.344us 6
+ aten::sub 1.35% 35.692us 2.26% 59.843us 9.974us 7.968us 7.67% 7.968us 1.328us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.775us 7.49% 7.775us 1.296us 6
+ Activity Buffer Request 54.18% 1.437ms 54.18% 1.437ms 1.437ms 1.312us 1.26% 1.312us 1.312us 1
+ aten::empty_strided 1.21% 32.003us 1.21% 32.003us 5.334us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 8.25% 218.717us 8.25% 218.717us 36.453us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 2.67% 70.760us 3.41% 90.371us 3.765us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.74% 19.611us 0.74% 19.611us 0.817us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 8.32% 220.800us 8.32% 220.800us 4.600us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.19% 5.070us 0.19% 5.070us 5.070us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 2.652ms
+Self CUDA time total: 103.872us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_S128_H32_D128_R64
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 914.130us 736.81% 914.130us 914.130us 1
+ torch_eager 19.76% 284.015us 99.65% 1.432ms 1.432ms 0.000us 0.00% 125.858us 125.858us 1
+ aten::mul 10.20% 146.586us 17.70% 254.419us 10.601us 65.313us 52.64% 65.313us 2.721us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 65.313us 52.64% 65.313us 2.721us 24
+ aten::copy_ 7.71% 110.793us 44.82% 644.172us 35.787us 39.489us 31.83% 41.281us 2.293us 18
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.961us 23.34% 28.961us 2.413us 12
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.264us 15.53% 19.264us 1.605us 12
+ aten::clone 1.45% 20.820us 39.14% 562.560us 93.760us 0.000us 0.00% 12.320us 2.053us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.528us 8.49% 10.528us 1.755us 6
+ aten::add 2.34% 33.572us 3.91% 56.142us 9.357us 9.664us 7.79% 9.664us 1.611us 6
+ aten::sub 2.40% 34.530us 4.02% 57.751us 9.625us 9.600us 7.74% 9.600us 1.600us 6
+ Activity Buffer Request 17.82% 256.078us 17.82% 256.078us 256.078us 1.792us 1.44% 1.792us 1.792us 1
+ aten::empty_strided 2.04% 29.262us 2.04% 29.262us 4.877us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 14.99% 215.437us 14.99% 215.437us 35.906us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 4.63% 66.508us 5.96% 85.660us 3.569us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 1.33% 19.152us 1.33% 19.152us 0.798us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 14.99% 215.488us 14.99% 215.488us 4.489us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.35% 5.000us 0.35% 5.000us 5.000us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.437ms
+Self CUDA time total: 124.066us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_S512_H8_D64_R32
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 921.138us 886.26% 921.138us 921.138us 1
+ torch_eager 20.59% 281.307us 99.64% 1.361ms 1.361ms 0.000us 0.00% 105.280us 105.280us 1
+ aten::mul 10.84% 148.087us 18.91% 258.361us 10.765us 55.487us 53.39% 55.487us 2.312us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 55.487us 53.39% 55.487us 2.312us 24
+ aten::copy_ 7.39% 100.946us 41.35% 564.842us 31.380us 32.481us 31.25% 33.825us 1.879us 18
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.640us 23.71% 24.640us 2.053us 12
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 15.968us 15.36% 15.968us 1.331us 12
+ aten::clone 1.54% 21.041us 35.66% 487.118us 81.186us 0.000us 0.00% 9.185us 1.531us 6
+ aten::sub 2.75% 37.531us 4.47% 61.012us 10.169us 8.031us 7.73% 8.031us 1.339us 6
+ aten::add 2.35% 32.112us 3.97% 54.222us 9.037us 7.937us 7.64% 7.937us 1.323us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.841us 7.54% 7.841us 1.307us 6
+ Activity Buffer Request 13.62% 186.046us 13.62% 186.046us 186.046us 1.344us 1.29% 1.344us 1.344us 1
+ aten::empty_strided 2.20% 30.110us 2.20% 30.110us 5.018us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 15.76% 215.337us 15.76% 215.337us 35.890us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 5.18% 70.704us 6.60% 90.193us 3.758us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 1.43% 19.489us 1.43% 19.489us 0.812us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 15.99% 218.378us 15.99% 218.378us 4.550us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.36% 4.960us 0.36% 4.960us 4.960us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.366ms
+Self CUDA time total: 103.936us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_S512_H8_D128_R64
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 943.466us 759.69% 943.466us 943.466us 1
+ torch_eager 21.73% 302.071us 99.63% 1.385ms 1.385ms 0.000us 0.00% 125.950us 125.950us 1
+ aten::mul 10.55% 146.657us 18.63% 259.039us 10.793us 65.378us 52.64% 65.378us 2.724us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 65.378us 52.64% 65.378us 2.724us 24
+ aten::copy_ 7.63% 106.103us 41.12% 571.631us 31.757us 39.519us 31.82% 41.278us 2.293us 18
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 29.024us 23.37% 29.024us 2.419us 12
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.294us 15.54% 19.294us 1.608us 12
+ aten::clone 1.52% 21.080us 35.11% 488.057us 81.343us 0.000us 0.00% 12.254us 2.042us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.495us 8.45% 10.495us 1.749us 6
+ aten::sub 2.46% 34.153us 4.15% 57.634us 9.606us 9.727us 7.83% 9.727us 1.621us 6
+ aten::add 2.41% 33.450us 4.05% 56.342us 9.390us 9.567us 7.70% 9.567us 1.595us 6
+ Activity Buffer Request 13.70% 190.466us 13.70% 190.466us 190.466us 1.759us 1.42% 1.759us 1.759us 1
+ aten::empty_strided 2.14% 29.791us 2.14% 29.791us 4.965us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 15.29% 212.610us 15.29% 212.610us 35.435us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 4.88% 67.802us 6.29% 87.511us 3.646us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 1.42% 19.709us 1.42% 19.709us 0.821us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 15.91% 221.207us 15.91% 221.207us 4.608us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.37% 5.080us 0.37% 5.080us 5.080us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.390ms
+Self CUDA time total: 124.191us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_S512_H32_D64_R32
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 909.497us 512.75% 909.497us 909.497us 1
+ torch_eager 20.85% 278.298us 99.63% 1.330ms 1.330ms 0.000us 0.00% 180.288us 180.288us 1
+ aten::mul 10.86% 144.977us 19.10% 254.920us 10.622us 94.591us 53.33% 94.591us 3.941us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 94.591us 53.33% 94.591us 3.941us 24
+ aten::copy_ 7.76% 103.603us 40.90% 545.870us 30.326us 57.919us 32.65% 60.831us 3.380us 18
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.767us 22.98% 40.767us 3.397us 12
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 24.866us 14.02% 24.866us 2.072us 12
+ aten::clone 1.59% 21.200us 34.96% 466.526us 77.754us 0.000us 0.00% 20.064us 3.344us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.152us 9.67% 17.152us 2.859us 6
+ aten::sub 2.64% 35.242us 4.38% 58.452us 9.742us 12.450us 7.02% 12.450us 2.075us 6
+ aten::add 2.38% 31.821us 4.13% 55.081us 9.180us 12.416us 7.00% 12.416us 2.069us 6
+ Activity Buffer Request 12.93% 172.606us 12.93% 172.606us 172.606us 2.912us 1.64% 2.912us 2.912us 1
+ aten::empty_strided 2.27% 30.341us 2.27% 30.341us 5.057us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 15.64% 208.798us 15.64% 208.798us 34.800us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 4.99% 66.616us 6.40% 85.475us 3.561us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 1.41% 18.859us 1.41% 18.859us 0.786us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 16.28% 217.276us 16.28% 217.276us 4.527us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.37% 5.001us 0.37% 5.001us 5.001us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.335ms
+Self CUDA time total: 177.376us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_S512_H32_D128_R64
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 908.914us 305.78% 908.914us 908.914us 1
+ torch_eager 20.55% 283.527us 99.64% 1.375ms 1.375ms 0.000us 0.00% 314.296us 314.296us 1
+ aten::mul 10.61% 146.340us 18.54% 255.803us 10.658us 145.086us 48.81% 145.086us 6.045us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 145.086us 48.81% 145.086us 6.045us 24
+ aten::copy_ 7.34% 101.324us 42.67% 588.790us 32.711us 111.099us 37.38% 128.154us 7.120us 18
+ aten::clone 1.50% 20.722us 37.09% 511.699us 85.283us 0.000us 0.00% 70.718us 11.786us 6
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 57.436us 19.32% 57.436us 4.786us 12
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 53.663us 18.05% 53.663us 8.944us 6
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 41.056us 13.81% 41.056us 3.421us 12
+ aten::sub 2.49% 34.330us 4.16% 57.351us 9.558us 20.672us 6.95% 20.672us 3.445us 6
+ aten::add 2.29% 31.611us 3.89% 53.723us 8.954us 20.384us 6.86% 20.384us 3.397us 6
+ Activity Buffer Request 15.84% 218.487us 15.84% 218.487us 218.487us 17.055us 5.74% 17.055us 17.055us 1
+ aten::empty_strided 2.18% 30.110us 2.18% 30.110us 5.018us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 15.10% 208.357us 15.10% 208.357us 34.726us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 4.74% 65.442us 6.15% 84.803us 3.533us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 1.40% 19.361us 1.40% 19.361us 0.807us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 15.60% 215.218us 15.60% 215.218us 4.484us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.36% 4.930us 0.36% 4.930us 4.930us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.380ms
+Self CUDA time total: 297.241us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_S2048_H8_D64_R32
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 939.162us 529.48% 939.162us 939.162us 1
+ torch_eager 11.57% 307.472us 99.80% 2.653ms 2.653ms 0.000us 0.00% 180.256us 180.256us 1
+ aten::mul 5.55% 147.649us 9.66% 256.649us 10.694us 94.851us 53.47% 94.851us 3.952us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 94.851us 53.47% 94.851us 3.952us 24
+ aten::copy_ 3.85% 102.292us 68.52% 1.821ms 101.186us 57.759us 32.56% 60.639us 3.369us 18
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.671us 22.93% 40.671us 3.389us 12
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 24.766us 13.96% 24.766us 2.064us 12
+ aten::clone 1.06% 28.080us 65.81% 1.749ms 291.547us 0.000us 0.00% 19.968us 3.328us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.088us 9.63% 17.088us 2.848us 6
+ aten::add 1.13% 30.133us 1.96% 52.053us 8.675us 12.384us 6.98% 12.384us 2.064us 6
+ aten::sub 1.27% 33.752us 2.15% 57.162us 9.527us 12.382us 6.98% 12.382us 2.064us 6
+ Activity Buffer Request 54.50% 1.449ms 54.50% 1.449ms 1.449ms 2.880us 1.62% 2.880us 2.880us 1
+ aten::empty_strided 1.13% 30.142us 1.13% 30.142us 5.024us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 7.84% 208.428us 7.84% 208.428us 34.738us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 3.02% 80.309us 3.76% 99.911us 4.163us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.74% 19.602us 0.74% 19.602us 0.817us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 8.14% 216.293us 8.14% 216.293us 4.506us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.20% 5.200us 0.20% 5.200us 5.200us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 2.658ms
+Self CUDA time total: 177.376us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_S2048_H8_D128_R64
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 942.515us 317.36% 942.515us 942.515us 1
+ torch_eager 20.57% 285.923us 99.62% 1.385ms 1.385ms 0.000us 0.00% 314.717us 314.717us 1
+ aten::mul 10.73% 149.116us 18.62% 258.870us 10.786us 145.439us 48.97% 145.439us 6.060us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 145.439us 48.97% 145.439us 6.060us 24
+ aten::copy_ 7.46% 103.659us 42.33% 588.488us 32.694us 110.749us 37.29% 128.477us 7.138us 18
+ aten::clone 1.56% 21.753us 36.61% 508.959us 84.826us 0.000us 0.00% 71.104us 11.851us 6
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 57.373us 19.32% 57.373us 4.781us 12
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 53.376us 17.97% 53.376us 8.896us 6
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 40.801us 13.74% 40.801us 3.400us 12
+ aten::sub 2.38% 33.081us 4.03% 56.021us 9.337us 20.449us 6.89% 20.449us 3.408us 6
+ aten::add 2.40% 33.331us 4.05% 56.271us 9.379us 20.352us 6.85% 20.352us 3.392us 6
+ Activity Buffer Request 14.18% 197.118us 14.18% 197.118us 197.118us 17.728us 5.97% 17.728us 17.728us 1
+ aten::empty_strided 2.21% 30.780us 2.21% 30.780us 5.130us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 16.19% 225.018us 16.19% 225.018us 37.503us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 4.87% 67.722us 6.24% 86.713us 3.613us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 1.37% 18.991us 1.37% 18.991us 0.791us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 15.71% 218.327us 15.71% 218.327us 4.548us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.38% 5.310us 0.38% 5.310us 5.310us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.390ms
+Self CUDA time total: 296.989us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_S2048_H32_D64_R32
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 928.214us 158.30% 928.214us 928.214us 1
+ torch_eager 21.21% 285.194us 99.61% 1.340ms 1.340ms 0.000us 0.00% 610.012us 610.012us 1
+ aten::copy_ 7.59% 102.047us 40.19% 540.521us 30.029us 268.445us 45.78% 292.093us 16.227us 18
+ aten::mul 11.07% 148.860us 19.42% 261.184us 10.883us 251.679us 42.92% 251.679us 10.487us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 251.679us 42.92% 251.679us 10.487us 24
+ aten::clone 1.57% 21.069us 34.26% 460.696us 76.783us 0.000us 0.00% 201.406us 33.568us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 177.758us 30.32% 177.758us 29.626us 6
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 90.687us 15.47% 90.687us 7.557us 12
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 66.240us 11.30% 66.240us 5.520us 12
+ aten::sub 2.72% 36.642us 4.50% 60.582us 10.097us 33.152us 5.65% 33.152us 5.525us 6
+ aten::add 2.29% 30.800us 3.93% 52.901us 8.817us 33.088us 5.64% 33.088us 5.515us 6
+ Activity Buffer Request 12.31% 165.596us 12.31% 165.596us 165.596us 23.648us 4.03% 23.648us 23.648us 1
+ aten::empty_strided 2.19% 29.501us 2.19% 29.501us 4.917us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 15.63% 210.266us 15.63% 210.266us 35.044us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 5.16% 69.374us 6.60% 88.734us 3.697us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 1.44% 19.360us 1.44% 19.360us 0.807us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 16.43% 220.977us 16.43% 220.977us 4.604us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.39% 5.180us 0.39% 5.180us 5.180us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.345ms
+Self CUDA time total: 586.364us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_S2048_H32_D128_R64
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 9.32% 323.657us 76.63% 2.662ms 2.662ms 0.000us 0.00% 1.834ms 1.834ms 1
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.806ms 102.11% 1.806ms 1.806ms 1
+ aten::copy_ 3.12% 108.276us 52.46% 1.822ms 101.225us 791.134us 44.74% 857.278us 47.627us 18
+ aten::mul 4.16% 144.572us 7.37% 256.109us 10.671us 827.198us 46.78% 827.198us 34.467us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 827.198us 46.78% 827.198us 34.467us 24
+ aten::clone 0.81% 28.142us 50.15% 1.742ms 290.300us 0.000us 0.00% 624.095us 104.016us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 557.951us 31.55% 557.951us 92.992us 6
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 233.183us 13.19% 233.183us 19.432us 12
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 149.919us 8.48% 149.919us 12.493us 12
+ aten::sub 0.98% 34.102us 1.65% 57.362us 9.560us 90.368us 5.11% 90.368us 15.061us 6
+ Activity Buffer Request 41.53% 1.443ms 41.53% 1.443ms 1.443ms 66.144us 3.74% 66.144us 66.144us 1
+ aten::add 0.89% 30.740us 1.53% 53.293us 8.882us 59.551us 3.37% 59.551us 9.925us 6
+ aten::empty_strided 0.86% 29.871us 0.86% 29.871us 4.979us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 5.94% 206.426us 5.94% 206.426us 34.404us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 2.06% 71.442us 2.62% 91.034us 3.793us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.56% 19.592us 0.56% 19.592us 0.816us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 6.40% 222.192us 6.40% 222.192us 4.629us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 23.37% 811.698us 23.37% 811.698us 811.698us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 3.473ms
+Self CUDA time total: 1.768ms
+
+
+impl wl p50(ms) ok
+torch_eager cuda_B1_S128_H32_D128_R64 0.22 True
+torch_eager cuda_B1_S128_H32_D64_R32 0.22 True
+torch_eager cuda_B1_S128_H8_D128_R64 0.23 True
+torch_eager cuda_B1_S128_H8_D64_R32 0.17 True
+torch_eager cuda_B1_S2048_H32_D128_R64 0.22 True
+torch_eager cuda_B1_S2048_H32_D64_R32 0.22 True
+torch_eager cuda_B1_S2048_H8_D128_R64 0.22 True
+torch_eager cuda_B1_S2048_H8_D64_R32 0.22 True
+torch_eager cuda_B1_S512_H32_D128_R64 0.22 True
+torch_eager cuda_B1_S512_H32_D64_R32 0.22 True
+torch_eager cuda_B1_S512_H8_D128_R64 0.22 True
+torch_eager cuda_B1_S512_H8_D64_R32 0.22 True
+torch_eager cuda_B2_S128_H32_D128_R64 0.22 True
+torch_eager cuda_B2_S128_H32_D64_R32 0.22 True
+torch_eager cuda_B2_S128_H8_D128_R64 0.22 True
+torch_eager cuda_B2_S128_H8_D64_R32 0.22 True
+torch_eager cuda_B2_S2048_H32_D128_R64 0.64 True
+torch_eager cuda_B2_S2048_H32_D64_R32 0.23 True
+torch_eager cuda_B2_S2048_H8_D128_R64 0.22 True
+torch_eager cuda_B2_S2048_H8_D64_R32 0.22 True
+torch_eager cuda_B2_S512_H32_D128_R64 0.22 True
+torch_eager cuda_B2_S512_H32_D64_R32 0.22 True
+torch_eager cuda_B2_S512_H8_D128_R64 0.22 True
+torch_eager cuda_B2_S512_H8_D64_R32 0.22 True
+