+

HF Kernels - Rotary Position Embeddings

+

GPU Info

+
+
+ +▼ code +▼ output + ▶ uv-logs + | +Cell: nv | 0.23s + | + +Raw +GitHub +
+
+
+
import subprocess
# Run `nvidia-smi`, capture its text output, and echo stdout into the cell output.
smi_result = subprocess.run(["nvidia-smi"], capture_output=True, text=True)
print(smi_result.stdout)
+
+ +
+
+
+
+
Tue Oct 28 14:08:24 2025       
++-----------------------------------------------------------------------------------------+
+| NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
+|-----------------------------------------+------------------------+----------------------+
+| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
+| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
+|                                         |                        |               MIG M. |
+|=========================================+========================+======================|
+|   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
+| N/A   29C    P0             90W /  350W |       0MiB /  46068MiB |     24%      Default |
+|                                         |                        |                  N/A |
++-----------------------------------------+------------------------+----------------------+
+
++-----------------------------------------------------------------------------------------+
+| Processes:                                                                              |
+|  GPU   GI   CI              PID   Type   Process name                        GPU Memory |
+|        ID   ID                                                               Usage      |
+|=========================================================================================|
+|  No running processes found                                                             |
++-----------------------------------------------------------------------------------------+
+
+
+
+
+ +

Rotary Embeddings Benchmark

+
+
+ +▼ code +▼ output + ▶ uv-logs + | +Cell: benchmark | 8.05s + | + +Raw +GitHub +
+
+
+
# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "numpy",
+#     "torch==2.8.0",
+#     "kernels-benchmark-tools",
+#     "kernels",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+import torch
+import sys
+from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
+from kernels import get_kernel
+
+# Load the rotary kernel
+rotary = get_kernel("kernels-community/rotary")
+
+
def hf_kernels_rotary(query, key, cos, sin, conj=False):
    """Rotate copies of ``query`` and ``key`` using the loaded ``rotary`` kernel.

    The caller's tensors are not handed to the kernel: each is cloned first and
    only slices of the clone are passed to ``rotary.apply_rotary``.

    Args:
        query: Query tensor; sliced along its last dimension.
        key: Key tensor; sliced along its last dimension.
        cos: Cosine table. The size of its last dimension sets the width of
            each of the two slices taken from ``query`` and ``key``.
        sin: Sine table, forwarded to the kernel unchanged.
        conj: Forwarded as the final argument of ``rotary.apply_rotary``
            (presumably selects the conjugate rotation -- confirm against the
            kernel's documentation).

    Returns:
        Tuple ``(q_out, k_out)`` holding the processed clones. Positions of the
        last dimension at or beyond ``2 * cos.shape[-1]`` are never passed to
        the kernel.
    """
    width = cos.shape[-1]
    first_part = slice(None, width)
    second_part = slice(width, 2 * width)

    # Work on copies so the caller's tensors stay untouched.
    q_out, k_out = query.clone(), key.clone()

    # Query first, then key. Each slice pair is passed twice (presumably once as
    # the kernel's inputs and once as its outputs), so the result lands in the clone.
    for rotated in (q_out, k_out):
        x1, x2 = rotated[..., first_part], rotated[..., second_part]
        rotary.apply_rotary(x1, x2, cos, sin, x1, x2, conj)

    return q_out, k_out
+
+
# Hand the wrapper above to the shared benchmark harness for the ROTARY kernel
# type, labelled with this implementation's name and tags.
run_benchmark(
    impl_func=hf_kernels_rotary,
    impl_name="hf_kernels_rotary",
    impl_tags=dict(family="hf-kernels", backend="cuda"),
    kernel_type=KernelTypeEnum.ROTARY,
)
+
+ +
+
+
+
+
Running rotary benchmark on cuda with 24 workloads.
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H8_D64_R32
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     452.802us      1907.02%     452.802us     452.802us             1  
+                                      hf_kernels_rotary        12.50%     264.332us        99.65%       2.107ms       2.107ms       0.000us         0.00%      24.960us      24.960us             1  
+                          _rotary_dba7d1e::apply_rotary         2.70%      57.162us         4.91%     103.733us      17.289us      16.928us        71.29%      16.928us       2.821us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      16.928us        71.29%      16.928us       2.821us             6  
+                                            aten::clone         2.21%      46.761us        79.27%       1.676ms     279.401us       0.000us         0.00%       8.032us       1.339us             6  
+                                            aten::copy_         2.31%      48.833us        74.02%       1.565ms     260.899us       6.816us        28.71%       8.032us       1.339us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.816us        28.71%       6.816us       1.136us             6  
+                                Activity Buffer Request        68.03%       1.439ms        68.03%       1.439ms       1.439ms       1.216us         5.12%       1.216us       1.216us             1  
+                                    aten::empty_strided         3.04%      64.252us         3.04%      64.252us      10.709us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         3.68%      77.892us         3.68%      77.892us      12.982us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.33%      49.309us         2.97%      62.771us       5.231us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.64%      13.462us         0.64%      13.462us       1.122us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         2.20%      46.571us         2.20%      46.571us       7.762us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.35%       7.480us         0.35%       7.480us       7.480us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.115ms
+Self CUDA time total: 23.744us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H8_D128_R64
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     357.532us      1513.94%     357.532us     357.532us             1  
+                                      hf_kernels_rotary         9.61%     183.785us        99.72%       1.907ms       1.907ms       0.000us         0.00%      24.736us      24.736us             1  
+                          _rotary_dba7d1e::apply_rotary         2.38%      45.511us         4.57%      87.364us      14.561us      16.832us        71.27%      16.832us       2.805us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      16.832us        71.27%      16.832us       2.805us             6  
+                                            aten::clone         1.27%      24.322us        83.40%       1.595ms     265.794us       0.000us         0.00%       7.904us       1.317us             6  
+                                            aten::copy_         1.98%      37.831us        80.39%       1.537ms     256.202us       6.784us        28.73%       7.904us       1.317us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.784us        28.73%       6.784us       1.131us             6  
+                                Activity Buffer Request        75.51%       1.444ms        75.51%       1.444ms       1.444ms       1.120us         4.74%       1.120us       1.120us             1  
+                                    aten::empty_strided         1.74%      33.230us         1.74%      33.230us       5.538us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         2.90%      55.533us         2.90%      55.533us       9.256us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.68%      32.211us         2.13%      40.791us       3.399us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.45%       8.580us         0.45%       8.580us       0.715us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         2.19%      41.853us         2.19%      41.853us       6.976us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.28%       5.420us         0.28%       5.420us       5.420us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.912ms
+Self CUDA time total: 23.616us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H32_D64_R32
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     340.570us      1359.24%     340.570us     340.570us             1  
+                                      hf_kernels_rotary         8.83%     169.069us        99.74%       1.910ms       1.910ms       0.000us         0.00%      26.368us      26.368us             1  
+                          _rotary_dba7d1e::apply_rotary         2.33%      44.610us         4.50%      86.120us      14.353us      17.248us        68.84%      17.248us       2.875us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      17.248us        68.84%      17.248us       2.875us             6  
+                                            aten::clone         1.25%      23.991us        84.27%       1.614ms     269.024us       0.000us         0.00%       9.120us       1.520us             6  
+                                            aten::copy_         1.92%      36.791us        81.38%       1.559ms     259.779us       7.808us        31.16%       9.120us       1.520us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.808us        31.16%       7.808us       1.301us             6  
+                                Activity Buffer Request        76.60%       1.467ms        76.60%       1.467ms       1.467ms       1.312us         5.24%       1.312us       1.312us             1  
+                                    aten::empty_strided         1.64%      31.482us         1.64%      31.482us       5.247us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         2.85%      54.600us         2.85%      54.600us       9.100us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.69%      32.440us         2.15%      41.092us       3.424us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.45%       8.652us         0.45%       8.652us       0.721us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         2.17%      41.510us         2.17%      41.510us       6.918us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.26%       4.990us         0.26%       4.990us       4.990us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.915ms
+Self CUDA time total: 25.056us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H32_D128_R64
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     346.075us      1340.08%     346.075us     346.075us             1  
+                                      hf_kernels_rotary         7.97%     168.270us        99.76%       2.107ms       2.107ms       0.000us         0.00%      27.137us      27.137us             1  
+                          _rotary_dba7d1e::apply_rotary         2.16%      45.651us         4.14%      87.411us      14.569us      18.049us        69.89%      18.049us       3.008us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      18.049us        69.89%      18.049us       3.008us             6  
+                                            aten::clone         1.15%      24.271us        85.69%       1.810ms     301.630us       0.000us         0.00%       9.088us       1.515us             6  
+                                            aten::copy_         1.78%      37.581us        83.02%       1.753ms     292.225us       7.776us        30.11%       9.088us       1.515us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.776us        30.11%       7.776us       1.296us             6  
+                                Activity Buffer Request        68.60%       1.449ms        68.60%       1.449ms       1.449ms       1.312us         5.08%       1.312us       1.312us             1  
+                                    aten::empty_strided         1.52%      32.162us         1.52%      32.162us       5.360us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        12.64%     267.018us        12.64%     267.018us      44.503us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.55%      32.701us         1.96%      41.360us       3.447us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.41%       8.659us         0.41%       8.659us       0.722us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         1.98%      41.760us         1.98%      41.760us       6.960us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.24%       5.141us         0.24%       5.141us       5.141us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.112ms
+Self CUDA time total: 25.825us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H8_D64_R32
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     383.355us      1524.21%     383.355us     383.355us             1  
+                                      hf_kernels_rotary         8.48%     177.428us        99.77%       2.088ms       2.088ms       0.000us         0.00%      26.495us      26.495us             1  
+                          _rotary_dba7d1e::apply_rotary         3.05%      63.861us         5.13%     107.442us      17.907us      17.215us        68.45%      17.215us       2.869us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      17.215us        68.45%      17.215us       2.869us             6  
+                                            aten::clone         1.13%      23.688us        84.02%       1.758ms     293.025us       0.000us         0.00%       9.280us       1.547us             6  
+                                            aten::copy_         1.90%      39.711us        81.30%       1.701ms     283.530us       7.936us        31.55%       9.280us       1.547us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.936us        31.55%       7.936us       1.323us             6  
+                                Activity Buffer Request        67.53%       1.413ms        67.53%       1.413ms       1.413ms       1.344us         5.34%       1.344us       1.344us             1  
+                                    aten::empty_strided         1.59%      33.283us         1.59%      33.283us       5.547us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        11.87%     248.348us        11.87%     248.348us      41.391us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.70%      35.532us         2.14%      44.714us       3.726us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.44%       9.182us         0.44%       9.182us       0.765us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         2.08%      43.581us         2.08%      43.581us       7.264us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.23%       4.831us         0.23%       4.831us       4.831us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.093ms
+Self CUDA time total: 25.151us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H8_D128_R64
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     348.288us      1348.70%     348.288us     348.288us             1  
+                                      hf_kernels_rotary         8.04%     167.026us        99.77%       2.072ms       2.072ms       0.000us         0.00%      27.136us      27.136us             1  
+                          _rotary_dba7d1e::apply_rotary         2.17%      45.031us         4.15%      86.212us      14.369us      18.016us        69.76%      18.016us       3.003us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      18.016us        69.76%      18.016us       3.003us             6  
+                                            aten::clone         1.23%      25.613us        85.56%       1.777ms     296.124us       0.000us         0.00%       9.120us       1.520us             6  
+                                            aten::copy_         1.80%      37.380us        82.71%       1.718ms     286.270us       7.808us        30.24%       9.120us       1.520us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.808us        30.24%       7.808us       1.301us             6  
+                                Activity Buffer Request        69.08%       1.434ms        69.08%       1.434ms       1.434ms       1.312us         5.08%       1.312us       1.312us             1  
+                                    aten::empty_strided         1.61%      33.511us         1.61%      33.511us       5.585us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        11.83%     245.758us        11.83%     245.758us      40.960us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.59%      33.022us         2.01%      41.843us       3.487us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.42%       8.821us         0.42%       8.821us       0.735us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         1.98%      41.181us         1.98%      41.181us       6.863us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.23%       4.770us         0.23%       4.770us       4.770us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.077ms
+Self CUDA time total: 25.824us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H32_D64_R32
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     342.589us      1056.85%     342.589us     342.589us             1  
+                                      hf_kernels_rotary         8.06%     166.005us        99.77%       2.055ms       2.055ms       0.000us         0.00%      34.208us      34.208us             1  
+                          _rotary_dba7d1e::apply_rotary         2.10%      43.163us         4.03%      82.914us      13.819us      21.856us        67.42%      21.856us       3.643us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      21.856us        67.42%      21.856us       3.643us             6  
+                                            aten::clone         1.18%      24.311us        85.73%       1.766ms     294.310us       0.000us         0.00%      12.352us       2.059us             6  
+                                            aten::copy_         1.85%      38.151us        82.92%       1.708ms     284.677us      10.560us        32.58%      12.352us       2.059us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.560us        32.58%      10.560us       1.760us             6  
+                                Activity Buffer Request        69.37%       1.429ms        69.37%       1.429ms       1.429ms       1.792us         5.53%       1.792us       1.792us             1  
+                                    aten::empty_strided         1.63%      33.490us         1.63%      33.490us       5.582us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        11.70%     241.040us        11.70%     241.040us      40.173us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.54%      31.672us         1.96%      40.421us       3.368us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.42%       8.749us         0.42%       8.749us       0.729us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         1.93%      39.751us         1.93%      39.751us       6.625us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.23%       4.681us         0.23%       4.681us       4.681us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.060ms
+Self CUDA time total: 32.416us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H32_D128_R64
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     349.021us       674.53%     349.021us     349.021us             1  
+                                      hf_kernels_rotary         8.13%     167.188us        99.77%       2.053ms       2.053ms       0.000us         0.00%      54.656us      54.656us             1  
+                          _rotary_dba7d1e::apply_rotary         2.05%      42.101us         4.09%      84.171us      14.029us      34.590us        66.85%      34.590us       5.765us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      34.590us        66.85%      34.590us       5.765us             6  
+                                            aten::clone         1.20%      24.743us        85.45%       1.758ms     292.975us       0.000us         0.00%      20.066us       3.344us             6  
+                                            aten::copy_         1.77%      36.360us        82.61%       1.700ms     283.256us      17.153us        33.15%      20.066us       3.344us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.153us        33.15%      17.153us       2.859us             6  
+                                Activity Buffer Request        69.27%       1.425ms        69.27%       1.425ms       1.425ms       2.913us         5.63%       2.913us       2.913us             1  
+                                    aten::empty_strided         1.63%      33.571us         1.63%      33.571us       5.595us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        11.58%     238.157us        11.58%     238.157us      39.693us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.68%      34.499us         2.11%      43.362us       3.614us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.43%       8.863us         0.43%       8.863us       0.739us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         2.04%      42.070us         2.04%      42.070us       7.012us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.23%       4.701us         0.23%       4.701us       4.701us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.057ms
+Self CUDA time total: 51.743us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H8_D64_R32
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     342.845us      1058.69%     342.845us     342.845us             1  
+                                      hf_kernels_rotary         7.95%     162.638us        99.78%       2.041ms       2.041ms       0.000us         0.00%      34.176us      34.176us             1  
+                          _rotary_dba7d1e::apply_rotary         2.08%      42.501us         4.07%      83.221us      13.870us      21.760us        67.19%      21.760us       3.627us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      21.760us        67.19%      21.760us       3.627us             6  
+                                            aten::clone         1.16%      23.762us        85.72%       1.754ms     292.258us       0.000us         0.00%      12.416us       2.069us             6  
+                                            aten::copy_         1.82%      37.190us        83.02%       1.698ms     283.036us      10.624us        32.81%      12.416us       2.069us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.624us        32.81%      10.624us       1.771us             6  
+                                Activity Buffer Request        69.60%       1.424ms        69.60%       1.424ms       1.424ms       1.792us         5.53%       1.792us       1.792us             1  
+                                    aten::empty_strided         1.54%      31.570us         1.54%      31.570us       5.262us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        11.60%     237.247us        11.60%     237.247us      39.541us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.62%      33.195us         2.03%      41.584us       3.465us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.41%       8.389us         0.41%       8.389us       0.699us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         1.99%      40.720us         1.99%      40.720us       6.787us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.22%       4.600us         0.22%       4.600us       4.600us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.046ms
+Self CUDA time total: 32.384us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H8_D128_R64
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     345.276us       667.68%     345.276us     345.276us             1  
+                                      hf_kernels_rotary        17.87%     159.778us        99.47%     889.262us     889.262us       0.000us         0.00%      54.593us      54.593us             1  
+                          _rotary_dba7d1e::apply_rotary         4.83%      43.201us         9.55%      85.402us      14.234us      34.656us        67.02%      34.656us       5.776us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      34.656us        67.02%      34.656us       5.776us             6  
+                                            aten::clone         2.69%      24.052us        67.57%     604.071us     100.678us       0.000us         0.00%      19.937us       3.323us             6  
+                                            aten::copy_         3.98%      35.591us        61.32%     548.169us      91.362us      17.057us        32.98%      19.937us       3.323us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.057us        32.98%      17.057us       2.843us             6  
+                                Activity Buffer Request        31.28%     279.600us        31.28%     279.600us     279.600us       2.880us         5.57%       2.880us       2.880us             1  
+                                    aten::empty_strided         3.56%      31.850us         3.56%      31.850us       5.308us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        26.06%     232.978us        26.06%     232.978us      38.830us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.51%      31.369us         4.48%      40.011us       3.334us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.97%       8.642us         0.97%       8.642us       0.720us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         4.72%      42.201us         4.72%      42.201us       7.034us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.53%       4.740us         0.53%       4.740us       4.740us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 894.002us
+Self CUDA time total: 51.713us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H32_D64_R32
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     372.345us       343.04%     372.345us     372.345us             1  
+                                      hf_kernels_rotary        19.45%     178.278us        99.48%     911.643us     911.643us       0.000us         0.00%     126.592us     126.592us             1  
+                                            aten::clone         2.39%      21.900us        65.33%     598.671us      99.778us       0.000us         0.00%      69.792us      11.632us             6  
+                                            aten::copy_         4.20%      38.503us        59.48%     545.071us      90.845us      51.744us        47.67%      69.792us      11.632us             6  
+                          _rotary_dba7d1e::apply_rotary         5.03%      46.070us         9.81%      89.853us      14.975us      56.800us        52.33%      56.800us       9.467us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      56.800us        52.33%      56.800us       9.467us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      51.744us        47.67%      51.744us       8.624us             6  
+                                Activity Buffer Request        29.76%     272.689us        29.76%     272.689us     272.689us      18.048us        16.63%      18.048us      18.048us             1  
+                                    aten::empty_strided         3.46%      31.700us         3.46%      31.700us       5.283us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        25.52%     233.879us        25.52%     233.879us      38.980us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.90%      35.730us         4.89%      44.841us       3.737us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.99%       9.111us         0.99%       9.111us       0.759us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         4.78%      43.783us         4.78%      43.783us       7.297us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.52%       4.730us         0.52%       4.730us       4.730us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 916.373us
+Self CUDA time total: 108.544us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H32_D128_R64
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     373.881us       208.27%     373.881us     373.881us             1  
+                                      hf_kernels_rotary        17.56%     156.837us        99.52%     888.752us     888.752us       0.000us         0.00%     203.231us     203.231us             1  
+                                            aten::clone         2.51%      22.450us        65.45%     584.500us      97.417us       0.000us         0.00%     102.431us      17.072us             6  
+                                            aten::copy_         4.24%      37.839us        59.27%     529.299us      88.217us      78.719us        43.85%     102.431us      17.072us             6  
+                          _rotary_dba7d1e::apply_rotary         4.89%      43.682us        11.68%     104.316us      17.386us     100.800us        56.15%     100.800us      16.800us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us     100.800us        56.15%     100.800us      16.800us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      78.719us        43.85%      78.719us      13.120us             6  
+                                Activity Buffer Request        29.56%     264.020us        29.56%     264.020us     264.020us      23.712us        13.21%      23.712us      23.712us             1  
+                                    aten::empty_strided         3.67%      32.751us         3.67%      32.751us       5.458us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        25.47%     227.440us        25.47%     227.440us      37.907us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.79%      33.838us         4.83%      43.099us       3.592us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.04%       9.261us         1.04%       9.261us       0.772us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         6.79%      60.634us         6.79%      60.634us      10.106us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.48%       4.320us         0.48%       4.320us       4.320us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 893.072us
+Self CUDA time total: 179.519us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H8_D64_R32
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     339.068us      1293.81%     339.068us     339.068us             1  
+                                      hf_kernels_rotary        18.21%     158.266us        99.46%     864.691us     864.691us       0.000us         0.00%      27.359us      27.359us             1  
+                          _rotary_dba7d1e::apply_rotary         4.98%      43.284us         9.71%      84.425us      14.071us      19.391us        73.99%      19.391us       3.232us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      19.391us        73.99%      19.391us       3.232us             6  
+                                            aten::clone         2.67%      23.179us        66.79%     580.620us      96.770us       0.000us         0.00%       7.968us       1.328us             6  
+                                            aten::copy_         4.38%      38.042us        60.58%     526.630us      87.772us       6.816us        26.01%       7.968us       1.328us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.816us        26.01%       6.816us       1.136us             6  
+                                Activity Buffer Request        29.98%     260.620us        29.98%     260.620us     260.620us       1.152us         4.40%       1.152us       1.152us             1  
+                                    aten::empty_strided         3.54%      30.811us         3.54%      30.811us       5.135us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        26.22%     227.968us        26.22%     227.968us      37.995us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.77%      32.731us         4.76%      41.380us       3.448us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.99%       8.649us         0.99%       8.649us       0.721us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         4.73%      41.141us         4.73%      41.141us       6.857us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.54%       4.651us         0.54%       4.651us       4.651us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 869.342us
+Self CUDA time total: 26.207us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H8_D128_R64
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     345.689us      1259.02%     345.689us     345.689us             1  
+                                      hf_kernels_rotary        18.17%     159.455us        99.46%     872.870us     872.870us       0.000us         0.00%      28.769us      28.769us             1  
+                          _rotary_dba7d1e::apply_rotary         4.92%      43.180us         9.80%      85.973us      14.329us      19.616us        71.44%      19.616us       3.269us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      19.616us        71.44%      19.616us       3.269us             6  
+                                            aten::clone         2.64%      23.140us        66.83%     586.460us      97.743us       0.000us         0.00%       9.153us       1.526us             6  
+                                            aten::copy_         4.27%      37.430us        60.39%     529.960us      88.327us       7.841us        28.56%       9.153us       1.526us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.841us        28.56%       7.841us       1.307us             6  
+                                Activity Buffer Request        29.89%     262.350us        29.89%     262.350us     262.350us       1.312us         4.78%       1.312us       1.312us             1  
+                                    aten::empty_strided         3.80%      33.360us         3.80%      33.360us       5.560us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        26.23%     230.180us        26.23%     230.180us      38.363us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.66%      32.161us         4.67%      40.982us       3.415us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.01%       8.821us         1.01%       8.821us       0.735us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         4.88%      42.793us         4.88%      42.793us       7.132us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.54%       4.730us         0.54%       4.730us       4.730us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 877.600us
+Self CUDA time total: 27.457us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H32_D64_R32
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     352.280us      1238.42%     352.280us     352.280us             1  
+                                      hf_kernels_rotary        18.63%     163.526us        99.48%     873.041us     873.041us       0.000us         0.00%      29.790us      29.790us             1  
+                          _rotary_dba7d1e::apply_rotary         4.98%      43.742us         9.85%      86.414us      14.402us      20.606us        72.44%      20.606us       3.434us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      20.606us        72.44%      20.606us       3.434us             6  
+                                            aten::clone         2.59%      22.720us        66.23%     581.279us      96.880us       0.000us         0.00%       9.184us       1.531us             6  
+                                            aten::copy_         4.14%      36.351us        59.98%     526.379us      87.730us       7.840us        27.56%       9.184us       1.531us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.840us        27.56%       7.840us       1.307us             6  
+                                Activity Buffer Request        30.03%     263.549us        30.03%     263.549us     263.549us       1.344us         4.72%       1.344us       1.344us             1  
+                                    aten::empty_strided         3.67%      32.180us         3.67%      32.180us       5.363us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        25.81%     226.479us        25.81%     226.479us      37.747us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.76%      33.033us         4.77%      41.822us       3.485us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.00%       8.789us         1.00%       8.789us       0.732us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         4.86%      42.672us         4.86%      42.672us       7.112us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.52%       4.560us         0.52%       4.560us       4.560us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 877.601us
+Self CUDA time total: 28.446us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H32_D128_R64
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     341.881us       953.86%     341.881us     341.881us             1  
+                                      hf_kernels_rotary        17.61%     155.956us        99.45%     880.921us     880.921us       0.000us         0.00%      37.634us      37.634us             1  
+                          _rotary_dba7d1e::apply_rotary         4.86%      43.060us         9.73%      86.184us      14.364us      25.312us        70.62%      25.312us       4.219us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      25.312us        70.62%      25.312us       4.219us             6  
+                                            aten::clone         2.52%      22.319us        67.43%     597.290us      99.548us       0.000us         0.00%      12.322us       2.054us             6  
+                                            aten::copy_         4.12%      36.502us        61.34%     543.331us      90.555us      10.530us        29.38%      12.322us       2.054us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.530us        29.38%      10.530us       1.755us             6  
+                                Activity Buffer Request        31.67%     280.550us        31.67%     280.550us     280.550us       1.792us         5.00%       1.792us       1.792us             1  
+                                    aten::empty_strided         3.57%      31.640us         3.57%      31.640us       5.273us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        25.54%     226.279us        25.54%     226.279us      37.713us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.70%      32.812us         4.68%      41.491us       3.458us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.98%       8.679us         0.98%       8.679us       0.723us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         4.87%      43.124us         4.87%      43.124us       7.187us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.55%       4.910us         0.55%       4.910us       4.910us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 885.831us
+Self CUDA time total: 35.842us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H8_D64_R32
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     348.158us      1221.01%     348.158us     348.158us             1  
+                                      hf_kernels_rotary         7.73%     158.832us        99.76%       2.051ms       2.051ms       0.000us         0.00%      29.858us      29.858us             1  
+                          _rotary_dba7d1e::apply_rotary         2.18%      44.723us         4.13%      84.825us      14.138us      20.674us        72.50%      20.674us       3.446us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      20.674us        72.50%      20.674us       3.446us             6  
+                                            aten::clone         1.24%      25.490us        85.81%       1.764ms     294.032us       0.000us         0.00%       9.184us       1.531us             6  
+                                            aten::copy_         1.80%      37.082us        83.01%       1.707ms     284.462us       7.840us        27.50%       9.184us       1.531us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.840us        27.50%       7.840us       1.307us             6  
+                                Activity Buffer Request        70.14%       1.442ms        70.14%       1.442ms       1.442ms       1.344us         4.71%       1.344us       1.344us             1  
+                                    aten::empty_strided         1.55%      31.931us         1.55%      31.931us       5.322us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        11.07%     227.598us        11.07%     227.598us      37.933us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.67%      34.312us         2.11%      43.312us       3.609us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.44%       9.000us         0.44%       9.000us       0.750us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         1.95%      40.102us         1.95%      40.102us       6.684us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.24%       4.880us         0.24%       4.880us       4.880us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.056ms
+Self CUDA time total: 28.514us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H8_D128_R64
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     344.320us       959.86%     344.320us     344.320us             1  
+                                      hf_kernels_rotary        18.29%     156.315us        99.44%     849.960us     849.960us       0.000us         0.00%      37.664us      37.664us             1  
+                          _rotary_dba7d1e::apply_rotary         5.15%      43.990us        10.72%      91.654us      15.276us      25.312us        70.56%      25.312us       4.219us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      25.312us        70.56%      25.312us       4.219us             6  
+                                            aten::clone         2.62%      22.368us        65.70%     561.560us      93.593us       0.000us         0.00%      12.352us       2.059us             6  
+                                            aten::copy_         4.13%      35.283us        59.24%     506.308us      84.385us      10.560us        29.44%      12.352us       2.059us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.560us        29.44%      10.560us       1.760us             6  
+                                Activity Buffer Request        29.39%     251.239us        29.39%     251.239us     251.239us       1.792us         5.00%       1.792us       1.792us             1  
+                                    aten::empty_strided         3.85%      32.884us         3.85%      32.884us       5.481us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        25.71%     219.786us        25.71%     219.786us      36.631us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.67%      31.402us         4.73%      40.431us       3.369us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.06%       9.029us         1.06%       9.029us       0.752us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         5.58%      47.664us         5.58%      47.664us       7.944us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.56%       4.781us         0.56%       4.781us       4.781us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 854.741us
+Self CUDA time total: 35.872us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H32_D64_R32
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     335.158us       593.10%     335.158us     335.158us             1  
+                                      hf_kernels_rotary        18.22%     154.324us        99.44%     842.379us     842.379us       0.000us         0.00%      59.390us      59.390us             1  
+                          _rotary_dba7d1e::apply_rotary         4.99%      42.273us         9.84%      83.374us      13.896us      39.454us        69.82%      39.454us       6.576us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      39.454us        69.82%      39.454us       6.576us             6  
+                                            aten::clone         2.56%      21.663us        66.58%     564.010us      94.002us       0.000us         0.00%      19.936us       3.323us             6  
+                                            aten::copy_         4.16%      35.260us        60.33%     511.017us      85.169us      17.056us        30.18%      19.936us       3.323us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.056us        30.18%      17.056us       2.843us             6  
+                                Activity Buffer Request        30.26%     256.319us        30.26%     256.319us     256.319us       2.880us         5.10%       2.880us       2.880us             1  
+                                    aten::empty_strided         3.70%      31.330us         3.70%      31.330us       5.222us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        25.90%     219.438us        25.90%     219.438us      36.573us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.75%      31.762us         4.80%      40.671us       3.389us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.05%       8.909us         1.05%       8.909us       0.742us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         4.85%      41.101us         4.85%      41.101us       6.850us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.56%       4.710us         0.56%       4.710us       4.710us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 847.089us
+Self CUDA time total: 56.510us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H32_D128_R64
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     369.080us       312.82%     369.080us     369.080us             1  
+                                      hf_kernels_rotary        20.18%     177.506us        99.45%     874.621us     874.621us       0.000us         0.00%     134.912us     134.912us             1  
+                                            aten::clone         2.49%      21.878us        64.31%     565.600us      94.267us       0.000us         0.00%      69.696us      11.616us             6  
+                                            aten::copy_         4.23%      37.163us        58.33%     512.969us      85.495us      52.768us        44.72%      69.696us      11.616us             6  
+                          _rotary_dba7d1e::apply_rotary         5.24%      46.042us        10.09%      88.704us      14.784us      65.216us        55.28%      65.216us      10.869us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      65.216us        55.28%      65.216us      10.869us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      52.768us        44.72%      52.768us       8.795us             6  
+                                Activity Buffer Request        28.97%     254.819us        28.97%     254.819us     254.819us      16.928us        14.35%      16.928us      16.928us             1  
+                                    aten::empty_strided         3.50%      30.753us         3.50%      30.753us       5.126us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        25.13%     220.987us        25.13%     220.987us      36.831us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.86%      33.990us         4.87%      42.811us       3.568us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.00%       8.821us         1.00%       8.821us       0.735us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         4.85%      42.662us         4.85%      42.662us       7.110us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.55%       4.870us         0.55%       4.870us       4.870us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 879.491us
+Self CUDA time total: 117.984us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H8_D64_R32
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     360.471us       637.52%     360.471us     360.471us             1  
+                                      hf_kernels_rotary        18.70%     161.865us        99.47%     860.760us     860.760us       0.000us         0.00%      59.391us      59.391us             1  
+                          _rotary_dba7d1e::apply_rotary         5.21%      45.111us        10.32%      89.333us      14.889us      39.487us        69.84%      39.487us       6.581us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      39.487us        69.84%      39.487us       6.581us             6  
+                                            aten::clone         2.76%      23.842us        65.28%     564.941us      94.157us       0.000us         0.00%      19.904us       3.317us             6  
+                                            aten::copy_         4.31%      37.312us        58.89%     509.589us      84.931us      17.056us        30.16%      19.904us       3.317us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.056us        30.16%      17.056us       2.843us             6  
+                                Activity Buffer Request        29.00%     250.989us        29.00%     250.989us     250.989us       2.848us         5.04%       2.848us       2.848us             1  
+                                    aten::empty_strided         3.64%      31.510us         3.64%      31.510us       5.252us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        25.57%     221.288us        25.57%     221.288us      36.881us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.04%      34.983us         5.16%      44.621us       3.718us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.11%       9.638us         1.11%       9.638us       0.803us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         5.11%      44.222us         5.11%      44.222us       7.370us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.53%       4.600us         0.53%       4.600us       4.600us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 865.360us
+Self CUDA time total: 56.543us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H8_D128_R64
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     348.053us       293.57%     348.053us     348.053us             1  
+                                      hf_kernels_rotary        18.59%     158.086us        99.46%     845.630us     845.630us       0.000us         0.00%     135.933us     135.933us             1  
+                                            aten::clone         2.59%      22.020us        65.95%     560.690us      93.448us       0.000us         0.00%      70.752us      11.792us             6  
+                                            aten::copy_         4.43%      37.632us        59.68%     507.389us      84.565us      53.376us        45.02%      70.752us      11.792us             6  
+                          _rotary_dba7d1e::apply_rotary         5.16%      43.870us        10.14%      86.234us      14.372us      65.181us        54.98%      65.181us      10.864us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      65.181us        54.98%      65.181us      10.864us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      53.376us        45.02%      53.376us       8.896us             6  
+                                Activity Buffer Request        29.66%     252.179us        29.66%     252.179us     252.179us      17.376us        14.66%      17.376us      17.376us             1  
+                                    aten::empty_strided         3.68%      31.281us         3.68%      31.281us       5.213us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        25.59%     217.578us        25.59%     217.578us      36.263us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.78%      32.121us         4.78%      40.620us       3.385us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.00%       8.499us         1.00%       8.499us       0.708us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         4.98%      42.364us         4.98%      42.364us       7.061us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.54%       4.590us         0.54%       4.590us       4.590us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 850.220us
+Self CUDA time total: 118.557us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H32_D64_R32
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     361.432us       183.93%     361.432us     361.432us             1  
+                                      hf_kernels_rotary        18.55%     158.934us        99.44%     851.909us     851.909us       0.000us         0.00%     220.221us     220.221us             1  
+                          _rotary_dba7d1e::apply_rotary         5.09%      43.629us        10.06%      86.174us      14.362us     115.517us        58.78%     115.517us      19.253us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us     115.517us        58.78%     115.517us      19.253us             6  
+                                            aten::clone         2.64%      22.651us        66.00%     565.440us      94.240us       0.000us         0.00%     104.704us      17.451us             6  
+                                            aten::copy_         4.43%      37.970us        59.78%     512.129us      85.355us      80.992us        41.22%     104.704us      17.451us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      80.992us        41.22%      80.992us      13.499us             6  
+                                Activity Buffer Request        29.36%     251.489us        29.36%     251.489us     251.489us      23.712us        12.07%      23.712us      23.712us             1  
+                                    aten::empty_strided         3.58%      30.660us         3.58%      30.660us       5.110us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        25.99%     222.670us        25.99%     222.670us      37.112us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.80%      32.582us         4.83%      41.361us       3.447us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.02%       8.779us         1.02%       8.779us       0.732us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         4.97%      42.545us         4.97%      42.545us       7.091us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.56%       4.770us         0.56%       4.770us       4.770us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 856.679us
+Self CUDA time total: 196.509us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H32_D128_R64
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                      hf_kernels_rotary        12.27%     154.345us        67.03%     843.460us     843.460us       0.000us         0.00%     849.461us     849.461us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     791.349us       101.00%     791.349us     791.349us             1  
+                                            aten::clone         1.79%      22.531us        44.41%     558.811us      93.135us       0.000us         0.00%     577.848us      96.308us             6  
+                                            aten::copy_         2.94%      36.962us        40.15%     505.198us      84.200us     511.865us        65.33%     577.848us      96.308us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     511.865us        65.33%     511.865us      85.311us             6  
+                          _rotary_dba7d1e::apply_rotary         3.50%      44.071us         7.04%      88.532us      14.755us     271.613us        34.67%     271.613us      45.269us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us     271.613us        34.67%     271.613us      45.269us             6  
+                                Activity Buffer Request        20.09%     252.769us        20.09%     252.769us     252.769us      65.983us         8.42%      65.983us      65.983us             1  
+                                    aten::empty_strided         2.47%      31.082us         2.47%      31.082us       5.180us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        17.12%     215.467us        17.12%     215.467us      35.911us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.61%      32.851us         3.32%      41.772us       3.481us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.71%       8.921us         0.71%       8.921us       0.743us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         3.53%      44.461us         3.53%      44.461us       7.410us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        32.97%     414.834us        32.97%     414.834us     414.834us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.258ms
+Self CUDA time total: 783.478us
+
+
+impl                     wl                  p50(ms)  ok
+hf_kernels_rotary        cuda_B1_S128_H32_D128_R64     0.09  False
+hf_kernels_rotary        cuda_B1_S128_H32_D64_R32     0.09  False
+hf_kernels_rotary        cuda_B1_S128_H8_D128_R64     0.09  False
+hf_kernels_rotary        cuda_B1_S128_H8_D64_R32     0.08  False
+hf_kernels_rotary        cuda_B1_S2048_H32_D128_R64     0.09  False
+hf_kernels_rotary        cuda_B1_S2048_H32_D64_R32     0.09  False
+hf_kernels_rotary        cuda_B1_S2048_H8_D128_R64     0.09  False
+hf_kernels_rotary        cuda_B1_S2048_H8_D64_R32     0.09  False
+hf_kernels_rotary        cuda_B1_S512_H32_D128_R64     0.09  False
+hf_kernels_rotary        cuda_B1_S512_H32_D64_R32     0.09  False
+hf_kernels_rotary        cuda_B1_S512_H8_D128_R64     0.09  False
+hf_kernels_rotary        cuda_B1_S512_H8_D64_R32     0.09  False
+hf_kernels_rotary        cuda_B2_S128_H32_D128_R64     0.09  False
+hf_kernels_rotary        cuda_B2_S128_H32_D64_R32     0.09  False
+hf_kernels_rotary        cuda_B2_S128_H8_D128_R64     0.09  False
+hf_kernels_rotary        cuda_B2_S128_H8_D64_R32     0.09  False
+hf_kernels_rotary        cuda_B2_S2048_H32_D128_R64     0.28  False
+hf_kernels_rotary        cuda_B2_S2048_H32_D64_R32     0.10  False
+hf_kernels_rotary        cuda_B2_S2048_H8_D128_R64     0.09  False
+hf_kernels_rotary        cuda_B2_S2048_H8_D64_R32     0.09  False
+hf_kernels_rotary        cuda_B2_S512_H32_D128_R64     0.09  False
+hf_kernels_rotary        cuda_B2_S512_H32_D64_R32     0.09  False
+hf_kernels_rotary        cuda_B2_S512_H8_D128_R64     0.09  False
+hf_kernels_rotary        cuda_B2_S512_H8_D64_R32     0.09  False
+
+
+
▶ UV Install Logs
+ +
+
Fetching 5 files: 0%| | 0/5 [00:00<?, ?it/s] +Fetching 5 files: 20%|██ | 1/5 [00:00<00:00, 7.39it/s] +Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 16.59it/s] +Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 15.43it/s]
+
+

Artifacts:

+rotary.jsonl +
+
+
+