+

PyTorch Native - Rotary Position Embeddings

+

GPU Info

+
+
+ +▼ code +▼ output + ▶ uv-logs + | +Cell: nv | 0.23s + | + +Raw +GitHub +
+
+
+
import subprocess
# Print the `nvidia-smi` report so the GPU used for this run is captured
# in the cell output.
print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
+
+ +
+
+
+
+
Tue Oct 28 14:08:24 2025       
++-----------------------------------------------------------------------------------------+
+| NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
+|-----------------------------------------+------------------------+----------------------+
+| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
+| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
+|                                         |                        |               MIG M. |
+|=========================================+========================+======================|
+|   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
+| N/A   29C    P0             90W /  350W |       0MiB /  46068MiB |     24%      Default |
+|                                         |                        |                  N/A |
++-----------------------------------------+------------------------+----------------------+
+
++-----------------------------------------------------------------------------------------+
+| Processes:                                                                              |
+|  GPU   GI   CI              PID   Type   Process name                        GPU Memory |
+|        ID   ID                                                               Usage      |
+|=========================================================================================|
+|  No running processes found                                                             |
++-----------------------------------------------------------------------------------------+
+
+
+
+
+ +

Rotary Embeddings Benchmark (PyTorch Native)

+
+
+ +▼ code +▼ output + ▶ uv-logs + | +Cell: benchmark | 3.87s + | + +Raw +GitHub +
+
+
+
# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "numpy",
+#     "torch==2.8.0",
+#     "kernels-benchmark-tools",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+import torch
+import sys
+from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
+
+
def apply_rotary_torch(x1, x2, cos, sin, conj=False):
    """Rotate the pair ``(x1, x2)`` by the angle given through ``cos``/``sin``.

    Reference rotary implementation written with plain elementwise
    arithmetic (no fused kernel).

    Args:
        x1: First component of each rotated pair.
        x2: Second component of each rotated pair.
        cos: Cosine factors, combined elementwise with ``x1`` and ``x2``.
        sin: Sine factors, combined elementwise with ``x1`` and ``x2``.
        conj: When true, apply the inverse rotation (the sign of the
            ``sin`` terms is flipped).

    Returns:
        A tuple ``(out1, out2)`` holding the rotated components. The
        results are freshly computed values; the arguments are not
        written to.
    """
    if not conj:
        out1 = x1 * cos - x2 * sin
        out2 = x1 * sin + x2 * cos
    else:
        out1 = x1 * cos + x2 * sin
        out2 = -x1 * sin + x2 * cos
    return out1, out2
+
+
def torch_rotary(query, key, cos, sin, conj=False):
    """Apply rotary embeddings to ``query`` and ``key`` with eager ops.

    ``rotary_dim`` is taken from ``cos.shape[-1]``. Along the last axis,
    entries ``[:rotary_dim]`` are paired with entries
    ``[rotary_dim : 2 * rotary_dim]`` and rotated via
    ``apply_rotary_torch``; any entries beyond ``2 * rotary_dim`` are
    carried over unchanged from the input.

    Args:
        query: Query tensor; not modified (a clone is rotated instead).
        key: Key tensor; not modified (a clone is rotated instead).
        cos: Cosine factors; its last dimension defines ``rotary_dim``.
        sin: Sine factors, used together with ``cos``.
        conj: Forwarded to ``apply_rotary_torch``; selects the inverse
            rotation when true.

    Returns:
        A tuple ``(q_out, k_out)`` of rotated copies of ``query`` and
        ``key``.
    """
    rotary_dim = cos.shape[-1]
    first_half = slice(None, rotary_dim)
    second_half = slice(rotary_dim, 2 * rotary_dim)

    # Clone inputs up front so the caller's tensors are never modified.
    q_out = query.clone()
    k_out = key.clone()

    # Query and key receive exactly the same treatment, query first.
    for out in (q_out, k_out):
        rotated_1, rotated_2 = apply_rotary_torch(
            out[..., first_half], out[..., second_half], cos, sin, conj
        )
        # The rotated values are new tensors, so writing them back into
        # the clone cannot corrupt the operands they were computed from.
        out[..., first_half] = rotated_1
        out[..., second_half] = rotated_2

    return q_out, k_out
+
+
# Hand the eager PyTorch implementation to the shared benchmark harness
# under the name "torch_eager". NOTE(review): workload generation, timing
# and profiling presumably happen inside run_benchmark — confirm against
# kernels_benchmark_tools.
run_benchmark(
    kernel_type=KernelTypeEnum.ROTARY,
    impl_name="torch_eager",
    impl_tags={"family": "pytorch", "backend": "eager"},
    impl_func=torch_rotary,
)
+
+ +
+
+
+
+
Running rotary benchmark on cuda with 24 workloads.
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B1_S128_H8_D64_R32
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.099ms      1229.41%       1.099ms       1.099ms             1  
+                                            torch_eager        14.68%     402.893us        99.74%       2.737ms       2.737ms       0.000us         0.00%      90.654us      90.654us             1  
+                                              aten::mul         6.18%     169.712us        10.63%     291.789us      12.158us      46.975us        52.54%      46.975us       1.957us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      46.975us        52.54%      46.975us       1.957us            24  
+                                            aten::copy_         5.12%     140.498us        62.48%       1.714ms      95.244us      29.151us        32.61%      30.399us       1.689us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.400us        25.05%      22.400us       1.867us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.280us        14.85%      13.280us       1.107us            12  
+                                            aten::clone         1.37%      37.603us        60.57%       1.662ms     277.027us       0.000us         0.00%       7.999us       1.333us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.751us         7.55%       6.751us       1.125us             6  
+                                              aten::sub         1.57%      43.112us         2.52%      69.272us      11.545us       6.688us         7.48%       6.688us       1.115us             6  
+                                              aten::add         1.32%      36.261us         2.18%      59.731us       9.955us       6.592us         7.37%       6.592us       1.099us             6  
+                                Activity Buffer Request        52.27%       1.434ms        52.27%       1.434ms       1.434ms       1.248us         1.40%       1.248us       1.248us             1  
+                                    aten::empty_strided         2.02%      55.541us         2.02%      55.541us       9.257us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         2.66%      72.862us         2.66%      72.862us      12.144us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.02%      82.803us         3.84%     105.504us       4.396us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.83%      22.701us         0.83%      22.701us       0.946us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         8.69%     238.340us         8.69%     238.340us       4.965us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.26%       7.250us         0.26%       7.250us       7.250us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.744ms
+Self CUDA time total: 89.406us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B1_S128_H8_D128_R64
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.001ms      1104.88%       1.001ms       1.001ms             1  
+                                            torch_eager        13.31%     340.683us        99.79%       2.555ms       2.555ms       0.000us         0.00%      91.680us      91.680us             1  
+                                              aten::mul         6.04%     154.674us        10.48%     268.377us      11.182us      47.810us        52.79%      47.810us       1.992us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      47.810us        52.79%      47.810us       1.992us            24  
+                                            aten::copy_         4.35%     111.424us        65.16%       1.668ms      92.682us      29.407us        32.47%      30.527us       1.696us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.559us        24.91%      22.559us       1.880us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.343us        14.73%      13.343us       1.112us            12  
+                                            aten::clone         1.08%      27.742us        62.03%       1.588ms     264.676us       0.000us         0.00%       7.968us       1.328us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.848us         7.56%       6.848us       1.141us             6  
+                                              aten::sub         1.52%      38.791us         2.50%      64.042us      10.674us       6.720us         7.42%       6.720us       1.120us             6  
+                                              aten::add         1.27%      32.413us         2.18%      55.903us       9.317us       6.623us         7.31%       6.623us       1.104us             6  
+                                Activity Buffer Request        56.03%       1.434ms        56.03%       1.434ms       1.434ms       1.120us         1.24%       1.120us       1.120us             1  
+                                    aten::empty_strided         1.42%      36.451us         1.42%      36.451us       6.075us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         2.10%      53.872us         2.10%      53.872us       8.979us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.86%      73.182us         3.65%      93.342us       3.889us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.79%      20.160us         0.79%      20.160us       0.840us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         9.02%     231.028us         9.02%     231.028us       4.813us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.21%       5.420us         0.21%       5.420us       5.420us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.560ms
+Self CUDA time total: 90.560us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B1_S128_H32_D64_R32
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     944.341us      1003.42%     944.341us     944.341us             1  
+                                            torch_eager        12.66%     316.554us        99.80%       2.495ms       2.495ms       0.000us         0.00%      95.424us      95.424us             1  
+                                              aten::mul         6.01%     150.161us        10.40%     259.987us      10.833us      48.863us        51.92%      48.863us       2.036us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      48.863us        51.92%      48.863us       2.036us            24  
+                                            aten::copy_         4.06%     101.511us        66.21%       1.655ms      91.941us      30.785us        32.71%      32.097us       1.783us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      23.009us        24.45%      23.009us       1.917us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.464us        15.37%      14.464us       1.205us            12  
+                                            aten::clone         1.08%      26.971us        63.11%       1.577ms     262.904us       0.000us         0.00%       9.088us       1.515us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.776us         8.26%       7.776us       1.296us             6  
+                                              aten::add         1.43%      35.631us         2.33%      58.151us       9.692us       7.233us         7.69%       7.233us       1.205us             6  
+                                              aten::sub         1.42%      35.432us         2.34%      58.413us       9.736us       7.231us         7.68%       7.231us       1.205us             6  
+                                Activity Buffer Request        57.41%       1.435ms        57.41%       1.435ms       1.435ms       1.312us         1.39%       1.312us       1.312us             1  
+                                    aten::empty_strided         1.23%      30.860us         1.23%      30.860us       5.143us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         2.03%      50.692us         2.03%      50.692us       8.449us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.76%      69.107us         3.55%      88.725us       3.697us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.78%      19.618us         0.78%      19.618us       0.817us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         8.92%     222.961us         8.92%     222.961us       4.645us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.20%       5.071us         0.20%       5.071us       5.071us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.500ms
+Self CUDA time total: 94.112us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B1_S128_H32_D128_R64
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     949.272us       934.99%     949.272us     949.272us             1  
+                                            torch_eager        11.74%     319.184us        99.83%       2.715ms       2.715ms       0.000us         0.00%     102.839us     102.839us             1  
+                                              aten::mul         5.42%     147.290us         9.69%     263.662us      10.986us      53.022us        52.22%      53.022us       2.209us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      53.022us        52.22%      53.022us       2.209us            24  
+                                            aten::copy_         3.75%     101.924us        68.58%       1.865ms     103.635us      32.444us        31.96%      33.755us       1.875us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.637us        24.27%      24.637us       2.053us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      16.062us        15.82%      16.062us       1.339us            12  
+                                            aten::clone         1.13%      30.729us        66.03%       1.796ms     299.314us       0.000us         0.00%       9.118us       1.520us             6  
+                                              aten::add         1.18%      32.140us         2.02%      54.851us       9.142us       8.032us         7.91%       8.032us       1.339us             6  
+                                              aten::sub         1.29%      35.030us         2.16%      58.621us       9.770us       8.030us         7.91%       8.030us       1.338us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.807us         7.69%       7.807us       1.301us             6  
+                                Activity Buffer Request        53.21%       1.447ms        53.21%       1.447ms       1.447ms       1.311us         1.29%       1.311us       1.311us             1  
+                                    aten::empty_strided         1.17%      31.801us         1.17%      31.801us       5.300us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         9.34%     254.009us         9.34%     254.009us      42.335us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.60%      70.842us         3.35%      90.984us       3.791us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.74%      20.142us         0.74%      20.142us       0.839us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         8.27%     224.985us         8.27%     224.985us       4.687us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.17%       4.671us         0.17%       4.671us       4.671us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.720ms
+Self CUDA time total: 101.528us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B1_S512_H8_D64_R32
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     944.887us      1005.38%     944.887us     944.887us             1  
+                                            torch_eager        11.86%     320.838us        99.82%       2.700ms       2.700ms       0.000us         0.00%      95.295us      95.295us             1  
+                                              aten::mul         5.37%     145.335us         9.42%     254.837us      10.618us      49.024us        52.16%      49.024us       2.043us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      49.024us        52.16%      49.024us       2.043us            24  
+                                            aten::copy_         3.87%     104.672us        68.80%       1.861ms     103.396us      30.783us        32.75%      32.095us       1.783us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.912us        24.38%      22.912us       1.909us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.176us        15.08%      14.176us       1.181us            12  
+                                            aten::clone         1.07%      28.861us        66.14%       1.789ms     298.231us       0.000us         0.00%       9.183us       1.530us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.871us         8.37%       7.871us       1.312us             6  
+                                              aten::sub         1.26%      33.972us         2.12%      57.464us       9.577us       7.103us         7.56%       7.103us       1.184us             6  
+                                              aten::add         1.16%      31.253us         1.99%      53.964us       8.994us       7.073us         7.53%       7.073us       1.179us             6  
+                                Activity Buffer Request        53.80%       1.456ms        53.80%       1.456ms       1.456ms       1.312us         1.40%       1.312us       1.312us             1  
+                                    aten::empty_strided         1.17%      31.633us         1.17%      31.633us       5.272us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         8.82%     238.648us         8.82%     238.648us      39.775us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.67%      72.119us         3.38%      91.532us       3.814us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.72%      19.413us         0.72%      19.413us       0.809us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         8.06%     217.970us         8.06%     217.970us       4.541us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.18%       4.990us         0.18%       4.990us       4.990us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.705ms
+Self CUDA time total: 93.983us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B1_S512_H8_D128_R64
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     912.250us       902.45%     912.250us     912.250us             1  
+                                            torch_eager        10.84%     287.380us        99.80%       2.646ms       2.646ms       0.000us         0.00%     102.398us     102.398us             1  
+                                              aten::mul         5.43%     143.901us         9.61%     254.716us      10.613us      52.767us        52.20%      52.767us       2.199us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      52.767us        52.20%      52.767us       2.199us            24  
+                                            aten::copy_         3.82%     101.373us        69.76%       1.849ms     102.733us      32.416us        32.07%      33.728us       1.874us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.608us        24.34%      24.608us       2.051us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      15.903us        15.73%      15.903us       1.325us            12  
+                                            aten::clone         0.89%      23.520us        66.94%       1.774ms     295.745us       0.000us         0.00%       9.120us       1.520us             6  
+                                              aten::add         1.25%      33.223us         2.12%      56.323us       9.387us       7.968us         7.88%       7.968us       1.328us             6  
+                                              aten::sub         1.34%      35.391us         2.21%      58.453us       9.742us       7.935us         7.85%       7.935us       1.322us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.808us         7.72%       7.808us       1.301us             6  
+                                Activity Buffer Request        54.59%       1.447ms        54.59%       1.447ms       1.447ms       1.312us         1.30%       1.312us       1.312us             1  
+                                    aten::empty_strided         1.14%      30.292us         1.14%      30.292us       5.049us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         9.04%     239.538us         9.04%     239.538us      39.923us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.52%      66.730us         3.23%      85.664us       3.569us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.71%      18.934us         0.71%      18.934us       0.789us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         8.23%     218.091us         8.23%     218.091us       4.544us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.20%       5.360us         0.20%       5.360us       5.360us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.651ms
+Self CUDA time total: 101.086us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B1_S512_H32_D64_R32
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     920.762us       761.21%     920.762us     920.762us             1  
+                                            torch_eager        10.74%     283.666us        99.80%       2.636ms       2.636ms       0.000us         0.00%     122.785us     122.785us             1  
+                                              aten::mul         5.61%     148.102us         9.80%     258.888us      10.787us      62.177us        51.40%      62.177us       2.591us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      62.177us        51.40%      62.177us       2.591us            24  
+                                            aten::copy_         4.01%     105.842us        69.73%       1.842ms     102.324us      39.520us        32.67%      41.344us       2.297us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.896us        23.89%      28.896us       2.408us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.264us        15.93%      19.264us       1.605us            12  
+                                            aten::clone         0.81%      21.319us        66.69%       1.761ms     293.582us       0.000us         0.00%      12.448us       2.075us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.624us         8.78%      10.624us       1.771us             6  
+                                              aten::add         1.23%      32.431us         2.08%      54.912us       9.152us       9.696us         8.02%       9.696us       1.616us             6  
+                                              aten::sub         1.34%      35.510us         2.24%      59.050us       9.842us       9.568us         7.91%       9.568us       1.595us             6  
+                                Activity Buffer Request        54.62%       1.443ms        54.62%       1.443ms       1.443ms       1.824us         1.51%       1.824us       1.824us             1  
+                                    aten::empty_strided         1.13%      29.871us         1.13%      29.871us       4.979us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         8.76%     231.329us         8.76%     231.329us      38.555us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.53%      66.872us         3.28%      86.661us       3.611us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.75%      19.789us         0.75%      19.789us       0.825us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         8.28%     218.631us         8.28%     218.631us       4.555us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.20%       5.190us         0.20%       5.190us       5.190us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.641ms
+Self CUDA time total: 120.961us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B1_S512_H32_D128_R64
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     939.640us       544.89%     939.640us     939.640us             1  
+                                            torch_eager        12.08%     323.576us        99.81%       2.674ms       2.674ms       0.000us         0.00%     175.325us     175.325us             1  
+                                              aten::mul         5.49%     147.107us         9.55%     255.901us      10.663us      89.504us        51.90%      89.504us       3.729us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      89.504us        51.90%      89.504us       3.729us            24  
+                                            aten::copy_         3.83%     102.724us        68.48%       1.835ms     101.930us      57.918us        33.59%      60.798us       3.378us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.734us        23.62%      40.734us       3.395us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      25.023us        14.51%      25.023us       2.085us            12  
+                                            aten::clone         1.06%      28.292us        65.67%       1.760ms     293.252us       0.000us         0.00%      20.064us       3.344us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.184us         9.96%      17.184us       2.864us             6  
+                                              aten::add         1.22%      32.572us         2.05%      54.872us       9.145us      12.512us         7.26%      12.512us       2.085us             6  
+                                              aten::sub         1.28%      34.403us         2.15%      57.513us       9.586us      12.511us         7.26%      12.511us       2.085us             6  
+                                Activity Buffer Request        53.69%       1.438ms        53.69%       1.438ms       1.438ms       2.880us         1.67%       2.880us       2.880us             1  
+                                    aten::empty_strided         1.12%      30.100us         1.12%      30.100us       5.017us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         8.57%     229.599us         8.57%     229.599us      38.267us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.59%      69.394us         3.32%      89.005us       3.709us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.73%      19.611us         0.73%      19.611us       0.817us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         8.14%     218.155us         8.14%     218.155us       4.545us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.19%       5.191us         0.19%       5.191us       5.191us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.679ms
+Self CUDA time total: 172.445us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B1_S2048_H8_D64_R32
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     910.515us       751.54%     910.515us     910.515us             1  
+                                            torch_eager        19.90%     282.972us        99.65%       1.417ms       1.417ms       0.000us         0.00%     123.009us     123.009us             1  
+                                              aten::mul        10.25%     145.781us        17.92%     254.851us      10.619us      62.146us        51.30%      62.146us       2.589us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      62.146us        51.30%      62.146us       2.589us            24  
+                                            aten::copy_         7.07%     100.509us        44.20%     628.439us      34.913us      39.743us        32.80%      41.599us       2.311us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      29.055us        23.98%      29.055us       2.421us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.264us        15.90%      19.264us       1.605us            12  
+                                            aten::clone         1.59%      22.604us        38.82%     551.881us      91.980us       0.000us         0.00%      12.544us       2.091us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.688us         8.82%      10.688us       1.781us             6  
+                                              aten::add         2.23%      31.661us         3.79%      53.922us       8.987us       9.633us         7.95%       9.633us       1.606us             6  
+                                              aten::sub         2.49%      35.352us         4.13%      58.732us       9.789us       9.631us         7.95%       9.631us       1.605us             6  
+                                Activity Buffer Request        16.91%     240.489us        16.91%     240.489us     240.489us       1.856us         1.53%       1.856us       1.856us             1  
+                                    aten::empty_strided         2.06%      29.230us         2.06%      29.230us       4.872us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        15.93%     226.498us        15.93%     226.498us      37.750us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.75%      67.473us         6.05%      86.070us       3.586us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.31%      18.597us         1.31%      18.597us       0.775us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        15.17%     215.654us        15.17%     215.654us       4.493us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.35%       4.980us         0.35%       4.980us       4.980us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.422ms
+Self CUDA time total: 121.153us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B1_S2048_H8_D128_R64
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     918.443us       533.10%     918.443us     918.443us             1  
+                                            torch_eager        20.03%     279.953us        99.65%       1.393ms       1.393ms       0.000us         0.00%     175.133us     175.133us             1  
+                                              aten::mul        10.59%     147.997us        18.47%     258.229us      10.760us      89.472us        51.93%      89.472us       3.728us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      89.472us        51.93%      89.472us       3.728us            24  
+                                            aten::copy_         7.43%     103.844us        43.15%     603.182us      33.510us      57.887us        33.60%      60.735us       3.374us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.831us        23.70%      40.831us       3.403us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.926us        14.47%      24.926us       2.077us            12  
+                                            aten::clone         1.45%      20.289us        37.34%     521.998us      87.000us       0.000us         0.00%      19.904us       3.317us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.056us         9.90%      17.056us       2.843us             6  
+                                              aten::add         2.21%      30.953us         3.79%      53.002us       8.834us      12.480us         7.24%      12.480us       2.080us             6  
+                                              aten::sub         2.40%      33.491us         4.09%      57.142us       9.524us      12.446us         7.22%      12.446us       2.074us             6  
+                                Activity Buffer Request        14.98%     209.468us        14.98%     209.468us     209.468us       2.848us         1.65%       2.848us       2.848us             1  
+                                    aten::empty_strided         2.03%      28.380us         2.03%      28.380us       4.730us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        16.36%     228.728us        16.36%     228.728us      38.121us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         5.25%      73.370us         6.64%      92.881us       3.870us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.40%      19.511us         1.40%      19.511us       0.813us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        15.53%     217.074us        15.53%     217.074us       4.522us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.35%       4.950us         0.35%       4.950us       4.950us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.398ms
+Self CUDA time total: 172.285us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B1_S2048_H32_D64_R32
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     945.822us       332.63%     945.822us     945.822us             1  
+                                            torch_eager        11.69%     314.391us        99.81%       2.685ms       2.685ms       0.000us         0.00%     302.941us     302.941us             1  
+                                              aten::mul         5.41%     145.454us         9.45%     254.127us      10.589us     133.310us        46.88%     133.310us       5.555us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     133.310us        46.88%     133.310us       5.555us            24  
+                                            aten::copy_         4.13%     111.027us        68.93%       1.854ms     103.002us     109.662us        38.57%     128.254us       7.125us            18  
+                                            aten::clone         1.07%      28.661us        65.93%       1.773ms     295.570us       0.000us         0.00%      70.912us      11.819us             6  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.342us        20.17%      57.342us       4.779us            12  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      52.320us        18.40%      52.320us       8.720us             6  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      41.377us        14.55%      41.377us       3.448us            12  
+                                              aten::sub         1.27%      34.091us         2.15%      57.911us       9.652us      20.704us         7.28%      20.704us       3.451us             6  
+                                              aten::add         1.22%      32.950us         2.07%      55.610us       9.268us      20.673us         7.27%      20.673us       3.446us             6  
+                                Activity Buffer Request        54.12%       1.456ms        54.12%       1.456ms       1.456ms      18.592us         6.54%      18.592us      18.592us             1  
+                                    aten::empty_strided         1.18%      31.741us         1.18%      31.741us       5.290us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         8.32%     223.797us         8.32%     223.797us      37.300us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.55%      68.485us         3.28%      88.267us       3.678us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.74%      19.782us         0.74%      19.782us       0.824us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         8.13%     218.664us         8.13%     218.664us       4.555us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.19%       5.100us         0.19%       5.100us       5.100us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.690ms
+Self CUDA time total: 284.349us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B1_S2048_H32_D128_R64
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     938.033us       165.64%     938.033us     938.033us             1  
+                                            torch_eager        20.89%     291.484us        99.63%       1.390ms       1.390ms       0.000us         0.00%     590.004us     590.004us             1  
+                                            aten::copy_         7.34%     102.395us        41.53%     579.320us      32.184us     273.370us        48.27%     297.081us      16.504us            18  
+                                              aten::mul        10.73%     149.623us        18.75%     261.638us      10.902us     225.916us        39.89%     225.916us       9.413us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     225.916us        39.89%     225.916us       9.413us            24  
+                                            aten::clone         1.46%      20.369us        35.71%     498.147us      83.025us       0.000us         0.00%     206.459us      34.410us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     182.748us        32.27%     182.748us      30.458us             6  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      90.622us        16.00%      90.622us       7.552us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      67.007us        11.83%      67.007us       5.584us            12  
+                                              aten::sub         2.52%      35.222us         4.78%      66.682us      11.114us      34.272us         6.05%      34.272us       5.712us             6  
+                                              aten::add         2.30%      32.121us         4.02%      56.063us       9.344us      32.735us         5.78%      32.735us       5.456us             6  
+                                Activity Buffer Request        14.16%     197.506us        14.16%     197.506us     197.506us      23.711us         4.19%      23.711us      23.711us             1  
+                                    aten::empty_strided         2.10%      29.332us         2.10%      29.332us       4.889us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        15.61%     217.828us        15.61%     217.828us      36.305us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.72%      65.792us         6.10%      85.041us       3.543us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.38%      19.249us         1.38%      19.249us       0.802us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        16.42%     229.008us        16.42%     229.008us       4.771us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.37%       5.150us         0.37%       5.150us       5.150us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.395ms
+Self CUDA time total: 566.293us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_S128_H8_D64_R32
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     912.211us       984.01%     912.211us     912.211us             1  
+                                            torch_eager        20.74%     286.708us        99.62%       1.377ms       1.377ms       0.000us         0.00%      93.855us      93.855us             1  
+                                              aten::mul        10.48%     144.890us        18.31%     253.080us      10.545us      49.856us        53.78%      49.856us       2.077us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      49.856us        53.78%      49.856us       2.077us            24  
+                                            aten::copy_         7.33%     101.333us        42.51%     587.542us      32.641us      29.407us        31.72%      30.559us       1.698us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.623us        24.40%      22.623us       1.885us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.440us        14.50%      13.440us       1.120us            12  
+                                            aten::clone         1.54%      21.251us        36.76%     508.068us      84.678us       0.000us         0.00%       7.936us       1.323us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.784us         7.32%       6.784us       1.131us             6  
+                                              aten::sub         2.53%      34.908us         4.26%      58.910us       9.818us       6.720us         7.25%       6.720us       1.120us             6  
+                                              aten::add         2.34%      32.341us         3.97%      54.832us       9.139us       6.720us         7.25%       6.720us       1.120us             6  
+                                Activity Buffer Request        14.89%     205.787us        14.89%     205.787us     205.787us       1.152us         1.24%       1.152us       1.152us             1  
+                                    aten::empty_strided         2.09%      28.901us         2.09%      28.901us       4.817us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        15.89%     219.618us        15.89%     219.618us      36.603us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.84%      66.885us         6.21%      85.845us       3.577us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.37%      18.960us         1.37%      18.960us       0.790us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        15.59%     215.487us        15.59%     215.487us       4.489us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.38%       5.210us         0.38%       5.210us       5.210us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.382ms
+Self CUDA time total: 92.703us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_S128_H8_D128_R64
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     938.901us       973.14%     938.901us     938.901us             1  
+                                            torch_eager        11.77%     313.313us        99.82%       2.656ms       2.656ms       0.000us         0.00%      97.825us      97.825us             1  
+                                              aten::mul         5.60%     148.957us         9.78%     260.340us      10.847us      51.266us        53.14%      51.266us       2.136us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      51.266us        53.14%      51.266us       2.136us            24  
+                                            aten::copy_         3.87%     103.023us        68.29%       1.817ms     100.957us      30.976us        32.11%      32.319us       1.795us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      23.072us        23.91%      23.072us       1.923us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.240us        14.76%      14.240us       1.187us            12  
+                                            aten::clone         1.07%      28.429us        65.69%       1.748ms     291.327us       0.000us         0.00%       9.247us       1.541us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.904us         8.19%       7.904us       1.317us             6  
+                                              aten::add         1.24%      33.110us         2.10%      56.011us       9.335us       7.137us         7.40%       7.137us       1.189us             6  
+                                              aten::sub         1.37%      36.490us         2.25%      59.790us       9.965us       7.103us         7.36%       7.103us       1.184us             6  
+                                Activity Buffer Request        53.84%       1.433ms        53.84%       1.433ms       1.433ms       1.343us         1.39%       1.343us       1.343us             1  
+                                    aten::empty_strided         1.19%      31.751us         1.19%      31.751us       5.292us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         8.25%     219.470us         8.25%     219.470us      36.578us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.63%      69.934us         3.35%      89.134us       3.714us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.72%      19.200us         0.72%      19.200us       0.800us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         8.25%     219.576us         8.25%     219.576us       4.574us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.18%       4.910us         0.18%       4.910us       4.910us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.661ms
+Self CUDA time total: 96.482us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_S128_H32_D64_R32
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     932.446us       897.69%     932.446us     932.446us             1  
+                                            torch_eager        11.60%     307.685us        99.81%       2.647ms       2.647ms       0.000us         0.00%     105.184us     105.184us             1  
+                                              aten::mul         5.51%     146.123us         9.64%     255.679us      10.653us      55.362us        53.30%      55.362us       2.307us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      55.362us        53.30%      55.362us       2.307us            24  
+                                            aten::copy_         3.78%     100.194us        68.64%       1.821ms     101.144us      32.478us        31.27%      33.790us       1.877us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.703us        23.78%      24.703us       2.059us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      16.032us        15.43%      16.032us       1.336us            12  
+                                            aten::clone         1.02%      27.179us        65.92%       1.748ms     291.378us       0.000us         0.00%       9.087us       1.515us             6  
+                                              aten::add         1.19%      31.489us         2.03%      53.840us       8.973us       8.064us         7.76%       8.064us       1.344us             6  
+                                              aten::sub         1.35%      35.692us         2.26%      59.843us       9.974us       7.968us         7.67%       7.968us       1.328us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.775us         7.49%       7.775us       1.296us             6  
+                                Activity Buffer Request        54.18%       1.437ms        54.18%       1.437ms       1.437ms       1.312us         1.26%       1.312us       1.312us             1  
+                                    aten::empty_strided         1.21%      32.003us         1.21%      32.003us       5.334us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         8.25%     218.717us         8.25%     218.717us      36.453us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.67%      70.760us         3.41%      90.371us       3.765us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.74%      19.611us         0.74%      19.611us       0.817us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         8.32%     220.800us         8.32%     220.800us       4.600us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.19%       5.070us         0.19%       5.070us       5.070us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.652ms
+Self CUDA time total: 103.872us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_S128_H32_D128_R64
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     914.130us       736.81%     914.130us     914.130us             1  
+                                            torch_eager        19.76%     284.015us        99.65%       1.432ms       1.432ms       0.000us         0.00%     125.858us     125.858us             1  
+                                              aten::mul        10.20%     146.586us        17.70%     254.419us      10.601us      65.313us        52.64%      65.313us       2.721us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      65.313us        52.64%      65.313us       2.721us            24  
+                                            aten::copy_         7.71%     110.793us        44.82%     644.172us      35.787us      39.489us        31.83%      41.281us       2.293us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.961us        23.34%      28.961us       2.413us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.264us        15.53%      19.264us       1.605us            12  
+                                            aten::clone         1.45%      20.820us        39.14%     562.560us      93.760us       0.000us         0.00%      12.320us       2.053us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.528us         8.49%      10.528us       1.755us             6  
+                                              aten::add         2.34%      33.572us         3.91%      56.142us       9.357us       9.664us         7.79%       9.664us       1.611us             6  
+                                              aten::sub         2.40%      34.530us         4.02%      57.751us       9.625us       9.600us         7.74%       9.600us       1.600us             6  
+                                Activity Buffer Request        17.82%     256.078us        17.82%     256.078us     256.078us       1.792us         1.44%       1.792us       1.792us             1  
+                                    aten::empty_strided         2.04%      29.262us         2.04%      29.262us       4.877us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        14.99%     215.437us        14.99%     215.437us      35.906us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.63%      66.508us         5.96%      85.660us       3.569us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.33%      19.152us         1.33%      19.152us       0.798us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        14.99%     215.488us        14.99%     215.488us       4.489us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.35%       5.000us         0.35%       5.000us       5.000us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.437ms
+Self CUDA time total: 124.066us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_S512_H8_D64_R32
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     921.138us       886.26%     921.138us     921.138us             1  
+                                            torch_eager        20.59%     281.307us        99.64%       1.361ms       1.361ms       0.000us         0.00%     105.280us     105.280us             1  
+                                              aten::mul        10.84%     148.087us        18.91%     258.361us      10.765us      55.487us        53.39%      55.487us       2.312us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      55.487us        53.39%      55.487us       2.312us            24  
+                                            aten::copy_         7.39%     100.946us        41.35%     564.842us      31.380us      32.481us        31.25%      33.825us       1.879us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.640us        23.71%      24.640us       2.053us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      15.968us        15.36%      15.968us       1.331us            12  
+                                            aten::clone         1.54%      21.041us        35.66%     487.118us      81.186us       0.000us         0.00%       9.185us       1.531us             6  
+                                              aten::sub         2.75%      37.531us         4.47%      61.012us      10.169us       8.031us         7.73%       8.031us       1.339us             6  
+                                              aten::add         2.35%      32.112us         3.97%      54.222us       9.037us       7.937us         7.64%       7.937us       1.323us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.841us         7.54%       7.841us       1.307us             6  
+                                Activity Buffer Request        13.62%     186.046us        13.62%     186.046us     186.046us       1.344us         1.29%       1.344us       1.344us             1  
+                                    aten::empty_strided         2.20%      30.110us         2.20%      30.110us       5.018us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        15.76%     215.337us        15.76%     215.337us      35.890us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         5.18%      70.704us         6.60%      90.193us       3.758us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.43%      19.489us         1.43%      19.489us       0.812us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        15.99%     218.378us        15.99%     218.378us       4.550us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.36%       4.960us         0.36%       4.960us       4.960us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.366ms
+Self CUDA time total: 103.936us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_S512_H8_D128_R64
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     943.466us       759.69%     943.466us     943.466us             1  
+                                            torch_eager        21.73%     302.071us        99.63%       1.385ms       1.385ms       0.000us         0.00%     125.950us     125.950us             1  
+                                              aten::mul        10.55%     146.657us        18.63%     259.039us      10.793us      65.378us        52.64%      65.378us       2.724us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      65.378us        52.64%      65.378us       2.724us            24  
+                                            aten::copy_         7.63%     106.103us        41.12%     571.631us      31.757us      39.519us        31.82%      41.278us       2.293us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      29.024us        23.37%      29.024us       2.419us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.294us        15.54%      19.294us       1.608us            12  
+                                            aten::clone         1.52%      21.080us        35.11%     488.057us      81.343us       0.000us         0.00%      12.254us       2.042us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.495us         8.45%      10.495us       1.749us             6  
+                                              aten::sub         2.46%      34.153us         4.15%      57.634us       9.606us       9.727us         7.83%       9.727us       1.621us             6  
+                                              aten::add         2.41%      33.450us         4.05%      56.342us       9.390us       9.567us         7.70%       9.567us       1.595us             6  
+                                Activity Buffer Request        13.70%     190.466us        13.70%     190.466us     190.466us       1.759us         1.42%       1.759us       1.759us             1  
+                                    aten::empty_strided         2.14%      29.791us         2.14%      29.791us       4.965us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        15.29%     212.610us        15.29%     212.610us      35.435us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.88%      67.802us         6.29%      87.511us       3.646us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.42%      19.709us         1.42%      19.709us       0.821us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        15.91%     221.207us        15.91%     221.207us       4.608us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.37%       5.080us         0.37%       5.080us       5.080us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.390ms
+Self CUDA time total: 124.191us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_S512_H32_D64_R32
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     909.497us       512.75%     909.497us     909.497us             1  
+                                            torch_eager        20.85%     278.298us        99.63%       1.330ms       1.330ms       0.000us         0.00%     180.288us     180.288us             1  
+                                              aten::mul        10.86%     144.977us        19.10%     254.920us      10.622us      94.591us        53.33%      94.591us       3.941us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      94.591us        53.33%      94.591us       3.941us            24  
+                                            aten::copy_         7.76%     103.603us        40.90%     545.870us      30.326us      57.919us        32.65%      60.831us       3.380us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.767us        22.98%      40.767us       3.397us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.866us        14.02%      24.866us       2.072us            12  
+                                            aten::clone         1.59%      21.200us        34.96%     466.526us      77.754us       0.000us         0.00%      20.064us       3.344us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.152us         9.67%      17.152us       2.859us             6  
+                                              aten::sub         2.64%      35.242us         4.38%      58.452us       9.742us      12.450us         7.02%      12.450us       2.075us             6  
+                                              aten::add         2.38%      31.821us         4.13%      55.081us       9.180us      12.416us         7.00%      12.416us       2.069us             6  
+                                Activity Buffer Request        12.93%     172.606us        12.93%     172.606us     172.606us       2.912us         1.64%       2.912us       2.912us             1  
+                                    aten::empty_strided         2.27%      30.341us         2.27%      30.341us       5.057us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        15.64%     208.798us        15.64%     208.798us      34.800us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.99%      66.616us         6.40%      85.475us       3.561us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.41%      18.859us         1.41%      18.859us       0.786us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        16.28%     217.276us        16.28%     217.276us       4.527us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.37%       5.001us         0.37%       5.001us       5.001us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.335ms
+Self CUDA time total: 177.376us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_S512_H32_D128_R64
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     908.914us       305.78%     908.914us     908.914us             1  
+                                            torch_eager        20.55%     283.527us        99.64%       1.375ms       1.375ms       0.000us         0.00%     314.296us     314.296us             1  
+                                              aten::mul        10.61%     146.340us        18.54%     255.803us      10.658us     145.086us        48.81%     145.086us       6.045us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     145.086us        48.81%     145.086us       6.045us            24  
+                                            aten::copy_         7.34%     101.324us        42.67%     588.790us      32.711us     111.099us        37.38%     128.154us       7.120us            18  
+                                            aten::clone         1.50%      20.722us        37.09%     511.699us      85.283us       0.000us         0.00%      70.718us      11.786us             6  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.436us        19.32%      57.436us       4.786us            12  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      53.663us        18.05%      53.663us       8.944us             6  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      41.056us        13.81%      41.056us       3.421us            12  
+                                              aten::sub         2.49%      34.330us         4.16%      57.351us       9.558us      20.672us         6.95%      20.672us       3.445us             6  
+                                              aten::add         2.29%      31.611us         3.89%      53.723us       8.954us      20.384us         6.86%      20.384us       3.397us             6  
+                                Activity Buffer Request        15.84%     218.487us        15.84%     218.487us     218.487us      17.055us         5.74%      17.055us      17.055us             1  
+                                    aten::empty_strided         2.18%      30.110us         2.18%      30.110us       5.018us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        15.10%     208.357us        15.10%     208.357us      34.726us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.74%      65.442us         6.15%      84.803us       3.533us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.40%      19.361us         1.40%      19.361us       0.807us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        15.60%     215.218us        15.60%     215.218us       4.484us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.36%       4.930us         0.36%       4.930us       4.930us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.380ms
+Self CUDA time total: 297.241us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_S2048_H8_D64_R32
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     939.162us       529.48%     939.162us     939.162us             1  
+                                            torch_eager        11.57%     307.472us        99.80%       2.653ms       2.653ms       0.000us         0.00%     180.256us     180.256us             1  
+                                              aten::mul         5.55%     147.649us         9.66%     256.649us      10.694us      94.851us        53.47%      94.851us       3.952us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      94.851us        53.47%      94.851us       3.952us            24  
+                                            aten::copy_         3.85%     102.292us        68.52%       1.821ms     101.186us      57.759us        32.56%      60.639us       3.369us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.671us        22.93%      40.671us       3.389us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.766us        13.96%      24.766us       2.064us            12  
+                                            aten::clone         1.06%      28.080us        65.81%       1.749ms     291.547us       0.000us         0.00%      19.968us       3.328us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.088us         9.63%      17.088us       2.848us             6  
+                                              aten::add         1.13%      30.133us         1.96%      52.053us       8.675us      12.384us         6.98%      12.384us       2.064us             6  
+                                              aten::sub         1.27%      33.752us         2.15%      57.162us       9.527us      12.382us         6.98%      12.382us       2.064us             6  
+                                Activity Buffer Request        54.50%       1.449ms        54.50%       1.449ms       1.449ms       2.880us         1.62%       2.880us       2.880us             1  
+                                    aten::empty_strided         1.13%      30.142us         1.13%      30.142us       5.024us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         7.84%     208.428us         7.84%     208.428us      34.738us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.02%      80.309us         3.76%      99.911us       4.163us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.74%      19.602us         0.74%      19.602us       0.817us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         8.14%     216.293us         8.14%     216.293us       4.506us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.20%       5.200us         0.20%       5.200us       5.200us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.658ms
+Self CUDA time total: 177.376us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_S2048_H8_D128_R64
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     942.515us       317.36%     942.515us     942.515us             1  
+                                            torch_eager        20.57%     285.923us        99.62%       1.385ms       1.385ms       0.000us         0.00%     314.717us     314.717us             1  
+                                              aten::mul        10.73%     149.116us        18.62%     258.870us      10.786us     145.439us        48.97%     145.439us       6.060us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     145.439us        48.97%     145.439us       6.060us            24  
+                                            aten::copy_         7.46%     103.659us        42.33%     588.488us      32.694us     110.749us        37.29%     128.477us       7.138us            18  
+                                            aten::clone         1.56%      21.753us        36.61%     508.959us      84.826us       0.000us         0.00%      71.104us      11.851us             6  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.373us        19.32%      57.373us       4.781us            12  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      53.376us        17.97%      53.376us       8.896us             6  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      40.801us        13.74%      40.801us       3.400us            12  
+                                              aten::sub         2.38%      33.081us         4.03%      56.021us       9.337us      20.449us         6.89%      20.449us       3.408us             6  
+                                              aten::add         2.40%      33.331us         4.05%      56.271us       9.379us      20.352us         6.85%      20.352us       3.392us             6  
+                                Activity Buffer Request        14.18%     197.118us        14.18%     197.118us     197.118us      17.728us         5.97%      17.728us      17.728us             1  
+                                    aten::empty_strided         2.21%      30.780us         2.21%      30.780us       5.130us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        16.19%     225.018us        16.19%     225.018us      37.503us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.87%      67.722us         6.24%      86.713us       3.613us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.37%      18.991us         1.37%      18.991us       0.791us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        15.71%     218.327us        15.71%     218.327us       4.548us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.38%       5.310us         0.38%       5.310us       5.310us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.390ms
+Self CUDA time total: 296.989us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_S2048_H32_D64_R32
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     928.214us       158.30%     928.214us     928.214us             1  
+                                            torch_eager        21.21%     285.194us        99.61%       1.340ms       1.340ms       0.000us         0.00%     610.012us     610.012us             1  
+                                            aten::copy_         7.59%     102.047us        40.19%     540.521us      30.029us     268.445us        45.78%     292.093us      16.227us            18  
+                                              aten::mul        11.07%     148.860us        19.42%     261.184us      10.883us     251.679us        42.92%     251.679us      10.487us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     251.679us        42.92%     251.679us      10.487us            24  
+                                            aten::clone         1.57%      21.069us        34.26%     460.696us      76.783us       0.000us         0.00%     201.406us      33.568us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     177.758us        30.32%     177.758us      29.626us             6  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      90.687us        15.47%      90.687us       7.557us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      66.240us        11.30%      66.240us       5.520us            12  
+                                              aten::sub         2.72%      36.642us         4.50%      60.582us      10.097us      33.152us         5.65%      33.152us       5.525us             6  
+                                              aten::add         2.29%      30.800us         3.93%      52.901us       8.817us      33.088us         5.64%      33.088us       5.515us             6  
+                                Activity Buffer Request        12.31%     165.596us        12.31%     165.596us     165.596us      23.648us         4.03%      23.648us      23.648us             1  
+                                    aten::empty_strided         2.19%      29.501us         2.19%      29.501us       4.917us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        15.63%     210.266us        15.63%     210.266us      35.044us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         5.16%      69.374us         6.60%      88.734us       3.697us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.44%      19.360us         1.44%      19.360us       0.807us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        16.43%     220.977us        16.43%     220.977us       4.604us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.39%       5.180us         0.39%       5.180us       5.180us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.345ms
+Self CUDA time total: 586.364us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_S2048_H32_D128_R64
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         9.32%     323.657us        76.63%       2.662ms       2.662ms       0.000us         0.00%       1.834ms       1.834ms             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.806ms       102.11%       1.806ms       1.806ms             1  
+                                            aten::copy_         3.12%     108.276us        52.46%       1.822ms     101.225us     791.134us        44.74%     857.278us      47.627us            18  
+                                              aten::mul         4.16%     144.572us         7.37%     256.109us      10.671us     827.198us        46.78%     827.198us      34.467us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     827.198us        46.78%     827.198us      34.467us            24  
+                                            aten::clone         0.81%      28.142us        50.15%       1.742ms     290.300us       0.000us         0.00%     624.095us     104.016us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     557.951us        31.55%     557.951us      92.992us             6  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     233.183us        13.19%     233.183us      19.432us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     149.919us         8.48%     149.919us      12.493us            12  
+                                              aten::sub         0.98%      34.102us         1.65%      57.362us       9.560us      90.368us         5.11%      90.368us      15.061us             6  
+                                Activity Buffer Request        41.53%       1.443ms        41.53%       1.443ms       1.443ms      66.144us         3.74%      66.144us      66.144us             1  
+                                              aten::add         0.89%      30.740us         1.53%      53.293us       8.882us      59.551us         3.37%      59.551us       9.925us             6  
+                                    aten::empty_strided         0.86%      29.871us         0.86%      29.871us       4.979us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         5.94%     206.426us         5.94%     206.426us      34.404us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.06%      71.442us         2.62%      91.034us       3.793us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.56%      19.592us         0.56%      19.592us       0.816us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         6.40%     222.192us         6.40%     222.192us       4.629us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize        23.37%     811.698us        23.37%     811.698us     811.698us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 3.473ms
+Self CUDA time total: 1.768ms
+
+
+impl                     wl                  p50(ms)  ok
+torch_eager              cuda_B1_S128_H32_D128_R64     0.22  True
+torch_eager              cuda_B1_S128_H32_D64_R32     0.22  True
+torch_eager              cuda_B1_S128_H8_D128_R64     0.23  True
+torch_eager              cuda_B1_S128_H8_D64_R32     0.17  True
+torch_eager              cuda_B1_S2048_H32_D128_R64     0.22  True
+torch_eager              cuda_B1_S2048_H32_D64_R32     0.22  True
+torch_eager              cuda_B1_S2048_H8_D128_R64     0.22  True
+torch_eager              cuda_B1_S2048_H8_D64_R32     0.22  True
+torch_eager              cuda_B1_S512_H32_D128_R64     0.22  True
+torch_eager              cuda_B1_S512_H32_D64_R32     0.22  True
+torch_eager              cuda_B1_S512_H8_D128_R64     0.22  True
+torch_eager              cuda_B1_S512_H8_D64_R32     0.22  True
+torch_eager              cuda_B2_S128_H32_D128_R64     0.22  True
+torch_eager              cuda_B2_S128_H32_D64_R32     0.22  True
+torch_eager              cuda_B2_S128_H8_D128_R64     0.22  True
+torch_eager              cuda_B2_S128_H8_D64_R32     0.22  True
+torch_eager              cuda_B2_S2048_H32_D128_R64     0.64  True
+torch_eager              cuda_B2_S2048_H32_D64_R32     0.23  True
+torch_eager              cuda_B2_S2048_H8_D128_R64     0.22  True
+torch_eager              cuda_B2_S2048_H8_D64_R32     0.22  True
+torch_eager              cuda_B2_S512_H32_D128_R64     0.22  True
+torch_eager              cuda_B2_S512_H32_D64_R32     0.22  True
+torch_eager              cuda_B2_S512_H8_D128_R64     0.22  True
+torch_eager              cuda_B2_S512_H8_D64_R32     0.22  True
+
+
+

Artifacts:

+rotary.jsonl +
+
+
+