HF Kernels - Rotary Position Embeddings

GPU Info

▼ code ▼ output ▶ uv-logs | Cell: nv | 0.25s | Raw GitHub 🤗 HF
import subprocess
# Run nvidia-smi, capturing its stdout as text, and echo it so the GPU and
# driver details are recorded alongside the benchmark output.
smi_result = subprocess.run(["nvidia-smi"], capture_output=True, text=True)
print(smi_result.stdout)
Fri Dec 19 19:54:55 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 580.105.08             Driver Version: 580.105.08     CUDA Version: 13.0     |
+-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
| N/A   36C    P0             90W /  350W |       0MiB /  46068MiB |     17%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+

+-----------------------------------------------------------------------------------------+
| Processes:                                                                              |
|  GPU   GI   CI              PID   Type   Process name                        GPU Memory |
|        ID   ID                                                               Usage      |
|=========================================================================================|
|  No running processes found                                                             |
+-----------------------------------------------------------------------------------------+

Rotary Embeddings Benchmark

▼ code ▼ output ▶ uv-logs | Cell: benchmark | 6.21s | Raw GitHub 🤗 HF
# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "numpy",
#     "torch==2.8.0",
#     "kernels-benchmark-tools",
#     "kernels",
# ]
#
# [tool.uv.sources]
# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
# ///
import torch
import sys
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
from kernels import get_kernel

# Load the rotary kernel once at script start. The returned object is used
# below through its `apply_rotary` entry point.
# NOTE(review): `get_kernel` presumably fetches a prebuilt kernel for the
# "kernels-community/rotary" repo id — confirm against the `kernels` docs.
rotary = get_kernel("kernels-community/rotary")


def hf_kernels_rotary(query, key, cos, sin, conj=False):
    """Rotate copies of ``query`` and ``key`` with the loaded rotary kernel.

    Both tensors are cloned first, so the caller's inputs are never written
    to. For each clone, the last dimension is split into two adjacent
    slices of width ``cos.shape[-1]`` (``[0:w]`` and ``[w:2w]``), and the
    pair is handed to ``rotary.apply_rotary`` as both the input and the
    output arguments.

    Args:
        query: Query tensor; only its last dimension is sliced here.
        key: Key tensor, handled exactly like ``query``.
        cos: Cosine table; its last dimension sets the slice width.
        sin: Sine table, forwarded to the kernel unchanged.
        conj: Flag forwarded to the kernel unchanged.
            NOTE(review): presumably selects the conjugate (inverse)
            rotation — confirm against the kernel's documentation.

    Returns:
        A ``(q_out, k_out)`` tuple holding the two processed clones.
    """
    width = cos.shape[-1]

    # Copy both inputs up front; the kernel receives views of these copies.
    q_out, k_out = query.clone(), key.clone()

    for out in (q_out, k_out):
        first_half = out[..., :width]
        second_half = out[..., width : 2 * width]
        # The same views are passed as sources and destinations, so the
        # result lands in `out` (assuming the kernel writes its outputs
        # into the last two tensor arguments — TODO confirm).
        rotary.apply_rotary(
            first_half, second_half, cos, sin, first_half, second_half, conj
        )

    return q_out, k_out


# Hand the wrapper above to the shared benchmark harness as the
# "hf_kernels_rotary" implementation of the ROTARY kernel type, in float32.
# NOTE(review): `impl_tags` presumably labels this implementation in the
# collected results — confirm in kernels_benchmark_tools.
run_benchmark(
    kernel_type=KernelTypeEnum.ROTARY,
    impl_name="hf_kernels_rotary",
    impl_tags={"family": "hf-kernels", "backend": "cuda"},
    impl_func=hf_kernels_rotary,
    dtype="float32",
)
Running rotary benchmark on cuda with 24 workloads.

======================================================================
PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H8_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     397.918us      1712.80%     397.918us     397.918us             1  
                                      hf_kernels_rotary        10.09%     234.566us        99.43%       2.310ms       2.310ms       0.000us         0.00%      24.512us      24.512us             1  
                          _rotary_dba7d1e::apply_rotary         2.29%      53.121us         4.37%     101.432us      16.905us      16.192us        69.70%      16.192us       2.699us             6  
void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      16.192us        69.70%      16.192us       2.699us             6  
                                            aten::clone         1.67%      38.889us        82.80%       1.924ms     320.673us       0.000us         0.00%       8.320us       1.387us             6  
                                            aten::copy_         1.71%      39.649us        78.85%       1.832ms     305.366us       7.040us        30.30%       8.320us       1.387us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.040us        30.30%       7.040us       1.173us             6  
                                Activity Buffer Request        74.05%       1.721ms        74.05%       1.721ms       1.721ms       1.280us         5.51%       1.280us       1.280us             1  
                                    aten::empty_strided         2.28%      52.952us         2.28%      52.952us       8.825us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync         3.09%      71.834us         3.09%      71.834us      11.972us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         1.69%      39.231us         2.17%      50.432us       4.203us       0.000us         0.00%       0.000us       0.000us            12  
                                       aten::as_strided         0.48%      11.201us         0.48%      11.201us       0.933us       0.000us         0.00%       0.000us       0.000us            12  
                                       cudaLaunchKernel         2.08%      48.311us         2.08%      48.311us       8.052us       0.000us         0.00%       0.000us       0.000us             6  
                                  cudaDeviceSynchronize         0.57%      13.240us         0.57%      13.240us      13.240us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.324ms
Self CUDA time total: 23.232us



======================================================================
PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H8_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     351.006us      1460.58%     351.006us     351.006us             1  
                                      hf_kernels_rotary         8.50%     188.914us        99.67%       2.215ms       2.215ms       0.000us         0.00%      25.344us      25.344us             1  
                          _rotary_dba7d1e::apply_rotary         1.92%      42.653us         3.86%      85.804us      14.301us      16.160us        67.24%      16.160us       2.693us             6  
void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      16.160us        67.24%      16.160us       2.693us             6  
                                            aten::clone         1.34%      29.789us        85.47%       1.899ms     316.566us       0.000us         0.00%       9.184us       1.531us             6  
                                            aten::copy_         1.59%      35.393us        82.63%       1.836ms     306.029us       7.872us        32.76%       9.184us       1.531us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.872us        32.76%       7.872us       1.312us             6  
                                Activity Buffer Request        78.48%       1.744ms        78.48%       1.744ms       1.744ms       1.312us         5.46%       1.312us       1.312us             1  
                                    aten::empty_strided         1.50%      33.432us         1.50%      33.432us       5.572us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync         2.55%      56.710us         2.55%      56.710us       9.452us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         1.45%      32.280us         1.84%      40.820us       3.402us       0.000us         0.00%       0.000us       0.000us            12  
                                       aten::as_strided         0.38%       8.540us         0.38%       8.540us       0.712us       0.000us         0.00%       0.000us       0.000us            12  
                                       cudaLaunchKernel         1.94%      43.151us         1.94%      43.151us       7.192us       0.000us         0.00%       0.000us       0.000us             6  
                                  cudaDeviceSynchronize         0.33%       7.360us         0.33%       7.360us       7.360us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.222ms
Self CUDA time total: 24.032us



======================================================================
PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H32_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     328.861us      1359.32%     328.861us     328.861us             1  
                                      hf_kernels_rotary         7.82%     169.265us        99.74%       2.160ms       2.160ms       0.000us         0.00%      25.505us      25.505us             1  
                          _rotary_dba7d1e::apply_rotary         1.90%      41.240us         3.83%      83.032us      13.839us      16.449us        67.99%      16.449us       2.742us             6  
void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      16.449us        67.99%      16.449us       2.742us             6  
                                            aten::clone         1.22%      26.522us        86.28%       1.868ms     311.384us       0.000us         0.00%       9.056us       1.509us             6  
                                            aten::copy_         1.60%      34.652us        83.63%       1.811ms     301.836us       7.744us        32.01%       9.056us       1.509us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.744us        32.01%       7.744us       1.291us             6  
                                Activity Buffer Request        79.55%       1.723ms        79.55%       1.723ms       1.723ms       1.312us         5.42%       1.312us       1.312us             1  
                                    aten::empty_strided         1.42%      30.770us         1.42%      30.770us       5.128us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync         2.49%      53.840us         2.49%      53.840us       8.973us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         1.40%      30.337us         1.81%      39.289us       3.274us       0.000us         0.00%       0.000us       0.000us            12  
                                       aten::as_strided         0.41%       8.952us         0.41%       8.952us       0.746us       0.000us         0.00%       0.000us       0.000us            12  
                                       cudaLaunchKernel         1.93%      41.792us         1.93%      41.792us       6.965us       0.000us         0.00%       0.000us       0.000us             6  
                                  cudaDeviceSynchronize         0.26%       5.540us         0.26%       5.540us       5.540us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.165ms
Self CUDA time total: 24.193us



======================================================================
PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H32_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     332.000us      1184.36%     332.000us     332.000us             1  
                                      hf_kernels_rotary         7.19%     171.403us        99.79%       2.378ms       2.378ms       0.000us         0.00%      29.792us      29.792us             1  
                          _rotary_dba7d1e::apply_rotary         1.72%      40.922us         3.47%      82.793us      13.799us      17.632us        62.90%      17.632us       2.939us             6  
void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      17.632us        62.90%      17.632us       2.939us             6  
                                            aten::clone         1.16%      27.640us        87.43%       2.084ms     347.303us       0.000us         0.00%      12.160us       2.027us             6  
                                            aten::copy_         1.42%      33.951us        84.95%       2.025ms     337.488us      10.400us        37.10%      12.160us       2.027us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.400us        37.10%      10.400us       1.733us             6  
                                Activity Buffer Request        73.90%       1.761ms        73.90%       1.761ms       1.761ms       1.760us         6.28%       1.760us       1.760us             1  
                                    aten::empty_strided         1.31%      31.250us         1.31%      31.250us       5.208us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync         9.63%     229.586us         9.63%     229.586us      38.264us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         1.31%      31.193us         1.70%      40.482us       3.373us       0.000us         0.00%       0.000us       0.000us            12  
                                       aten::as_strided         0.39%       9.289us         0.39%       9.289us       0.774us       0.000us         0.00%       0.000us       0.000us            12  
                                       cudaLaunchKernel         1.76%      41.871us         1.76%      41.871us       6.979us       0.000us         0.00%       0.000us       0.000us             6  
                                  cudaDeviceSynchronize         0.21%       5.050us         0.21%       5.050us       5.050us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.384ms
Self CUDA time total: 28.032us



======================================================================
PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H8_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     335.418us      1384.59%     335.418us     335.418us             1  
                                      hf_kernels_rotary        20.26%     170.294us        99.34%     834.940us     834.940us       0.000us         0.00%      25.537us      25.537us             1  
                          _rotary_dba7d1e::apply_rotary         4.83%      40.562us         9.92%      83.412us      13.902us      16.513us        68.17%      16.513us       2.752us             6  
void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      16.513us        68.17%      16.513us       2.752us             6  
                                            aten::clone         2.64%      22.222us        64.45%     541.674us      90.279us       0.000us         0.00%       9.024us       1.504us             6  
                                            aten::copy_         4.18%      35.111us        57.94%     486.972us      81.162us       7.712us        31.83%       9.024us       1.504us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.712us        31.83%       7.712us       1.285us             6  
                                Activity Buffer Request        27.90%     234.506us        27.90%     234.506us     234.506us       1.312us         5.42%       1.312us       1.312us             1  
                                    aten::empty_strided         3.86%      32.480us         3.86%      32.480us       5.413us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        25.86%     217.355us        25.86%     217.355us      36.226us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         3.66%      30.769us         4.71%      39.560us       3.297us       0.000us         0.00%       0.000us       0.000us            12  
                                       aten::as_strided         1.05%       8.791us         1.05%       8.791us       0.733us       0.000us         0.00%       0.000us       0.000us            12  
                                       cudaLaunchKernel         5.10%      42.850us         5.10%      42.850us       7.142us       0.000us         0.00%       0.000us       0.000us             6  
                                  cudaDeviceSynchronize         0.66%       5.560us         0.66%       5.560us       5.560us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 840.500us
Self CUDA time total: 24.225us



======================================================================
PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H8_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     335.419us      1187.11%     335.419us     335.419us             1  
                                      hf_kernels_rotary        22.11%     159.484us        99.14%     715.038us     715.038us       0.000us         0.00%      30.047us      30.047us             1  
                          _rotary_dba7d1e::apply_rotary         6.01%      43.310us        12.22%      88.120us      14.687us      17.727us        62.74%      17.727us       2.954us             6  
void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      17.727us        62.74%      17.727us       2.954us             6  
                                            aten::clone         2.86%      20.618us        59.23%     427.160us      71.193us       0.000us         0.00%      12.320us       2.053us             6  
                                            aten::copy_         4.70%      33.871us        52.15%     376.129us      62.688us      10.528us        37.26%      12.320us       2.053us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.528us        37.26%      10.528us       1.755us             6  
                                Activity Buffer Request        17.96%     129.563us        17.96%     129.563us     129.563us       1.792us         6.34%       1.792us       1.792us             1  
                                    aten::empty_strided         4.22%      30.413us         4.22%      30.413us       5.069us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        29.49%     212.695us        29.49%     212.695us      35.449us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         4.36%      31.443us         5.58%      40.274us       3.356us       0.000us         0.00%       0.000us       0.000us            12  
                                       aten::as_strided         1.22%       8.831us         1.22%       8.831us       0.736us       0.000us         0.00%       0.000us       0.000us            12  
                                       cudaLaunchKernel         6.21%      44.810us         6.21%      44.810us       7.468us       0.000us         0.00%       0.000us       0.000us             6  
                                  cudaDeviceSynchronize         0.86%       6.181us         0.86%       6.181us       6.181us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 721.219us
Self CUDA time total: 28.255us



======================================================================
PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H32_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     339.104us       833.12%     339.104us     339.104us             1  
                                      hf_kernels_rotary         7.67%     177.694us        99.76%       2.310ms       2.310ms       0.000us         0.00%      43.583us      43.583us             1  
                          _rotary_dba7d1e::apply_rotary         1.80%      41.651us         3.62%      83.803us      13.967us      23.520us        57.78%      23.520us       3.920us             6  
void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      23.520us        57.78%      23.520us       3.920us             6  
                                            aten::clone         1.20%      27.761us        86.70%       2.008ms     334.588us       0.000us         0.00%      20.063us       3.344us             6  
                                            aten::copy_         1.49%      34.550us        84.17%       1.949ms     324.808us      17.183us        42.22%      20.063us       3.344us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.183us        42.22%      17.183us       2.864us             6  
                                Activity Buffer Request        73.48%       1.701ms        73.48%       1.701ms       1.701ms       2.880us         7.08%       2.880us       2.880us             1  
                                    aten::empty_strided         1.34%      30.920us         1.34%      30.920us       5.153us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync         9.20%     212.976us         9.20%     212.976us      35.496us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         1.39%      32.120us         1.76%      40.721us       3.393us       0.000us         0.00%       0.000us       0.000us            12  
                                       aten::as_strided         0.37%       8.601us         0.37%       8.601us       0.717us       0.000us         0.00%       0.000us       0.000us            12  
                                       cudaLaunchKernel         1.82%      42.152us         1.82%      42.152us       7.025us       0.000us         0.00%       0.000us       0.000us             6  
                                  cudaDeviceSynchronize         0.24%       5.660us         0.24%       5.660us       5.660us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.315ms
Self CUDA time total: 40.703us



======================================================================
PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H32_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     342.239us       459.79%     342.239us     342.239us             1  
                                      hf_kernels_rotary         7.38%     176.732us        99.78%       2.390ms       2.390ms       0.000us         0.00%      82.913us      82.913us             1  
                                            aten::clone         1.17%      28.130us        87.22%       2.089ms     348.177us       0.000us         0.00%      43.777us       7.296us             6  
                                            aten::copy_         1.46%      34.971us        84.72%       2.029ms     338.203us      35.297us        47.42%      43.777us       7.296us             6  
                          _rotary_dba7d1e::apply_rotary         1.71%      40.931us         3.50%      83.864us      13.977us      39.136us        52.58%      39.136us       6.523us             6  
void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      39.136us        52.58%      39.136us       6.523us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      35.297us        47.42%      35.297us       5.883us             6  
                                Activity Buffer Request        74.20%       1.777ms        74.20%       1.777ms       1.777ms       8.480us        11.39%       8.480us       8.480us             1  
                                    aten::empty_strided         1.32%      31.711us         1.32%      31.711us       5.285us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync         9.06%     217.015us         9.06%     217.015us      36.169us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         1.30%      31.251us         1.69%      40.431us       3.369us       0.000us         0.00%       0.000us       0.000us            12  
                                       aten::as_strided         0.38%       9.180us         0.38%       9.180us       0.765us       0.000us         0.00%       0.000us       0.000us            12  
                                       cudaLaunchKernel         1.79%      42.933us         1.79%      42.933us       7.155us       0.000us         0.00%       0.000us       0.000us             6  
                                  cudaDeviceSynchronize         0.22%       5.191us         0.22%       5.191us       5.191us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.395ms
Self CUDA time total: 74.433us



======================================================================
PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H8_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     332.929us       819.86%     332.929us     332.929us             1  
                                      hf_kernels_rotary         7.07%     166.848us        99.78%       2.353ms       2.353ms       0.000us         0.00%      43.488us      43.488us             1  
                          _rotary_dba7d1e::apply_rotary         1.68%      39.560us         3.51%      82.681us      13.780us      23.488us        57.84%      23.488us       3.915us             6  
void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      23.488us        57.84%      23.488us       3.915us             6  
                                            aten::clone         1.16%      27.340us        87.49%       2.064ms     343.923us       0.000us         0.00%      20.000us       3.333us             6  
                                            aten::copy_         1.51%      35.519us        85.00%       2.005ms     334.108us      17.120us        42.16%      20.000us       3.333us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.120us        42.16%      17.120us       2.853us             6  
                                Activity Buffer Request        74.50%       1.757ms        74.50%       1.757ms       1.757ms       2.880us         7.09%       2.880us       2.880us             1  
                                    aten::empty_strided         1.34%      31.550us         1.34%      31.550us       5.258us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync         8.99%     212.096us         8.99%     212.096us      35.349us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         1.35%      31.900us         1.71%      40.350us       3.362us       0.000us         0.00%       0.000us       0.000us            12  
                                       aten::as_strided         0.36%       8.450us         0.36%       8.450us       0.704us       0.000us         0.00%       0.000us       0.000us            12  
                                       cudaLaunchKernel         1.83%      43.121us         1.83%      43.121us       7.187us       0.000us         0.00%       0.000us       0.000us             6  
                                  cudaDeviceSynchronize         0.22%       5.120us         0.22%       5.120us       5.120us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.359ms
Self CUDA time total: 40.608us



======================================================================
PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H8_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     338.142us       439.39%     338.142us     338.142us             1  
                                      hf_kernels_rotary         7.05%     174.064us        99.80%       2.465ms       2.465ms       0.000us         0.00%      86.270us      86.270us             1  
                                            aten::clone         1.20%      29.650us        87.84%       2.170ms     361.584us       0.000us         0.00%      47.071us       7.845us             6  
                                            aten::copy_         1.42%      34.959us        85.36%       2.108ms     351.395us      37.759us        49.06%      47.071us       7.845us             6  
                          _rotary_dba7d1e::apply_rotary         1.66%      41.022us         3.32%      82.043us      13.674us      39.199us        50.94%      39.199us       6.533us             6  
void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      39.199us        50.94%      39.199us       6.533us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      37.759us        49.06%      37.759us       6.293us             6  
                                Activity Buffer Request        75.49%       1.864ms        75.49%       1.864ms       1.864ms       9.312us        12.10%       9.312us       9.312us             1  
                                    aten::empty_strided         1.27%      31.482us         1.27%      31.482us       5.247us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync         8.46%     208.956us         8.46%     208.956us      34.826us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         1.25%      30.771us         1.59%      39.290us       3.274us       0.000us         0.00%       0.000us       0.000us            12  
                                       aten::as_strided         0.34%       8.519us         0.34%       8.519us       0.710us       0.000us         0.00%       0.000us       0.000us            12  
                                       cudaLaunchKernel         1.66%      41.021us         1.66%      41.021us       6.837us       0.000us         0.00%       0.000us       0.000us             6  
                                  cudaDeviceSynchronize         0.20%       5.010us         0.20%       5.010us       5.010us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.470ms
Self CUDA time total: 76.958us



======================================================================
PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H32_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     343.679us       247.41%     343.679us     343.679us             1  
                                      hf_kernels_rotary         7.47%     171.684us        99.76%       2.294ms       2.294ms       0.000us         0.00%     162.559us     162.559us             1  
                                            aten::clone         1.25%      28.681us        86.89%       1.998ms     333.015us       0.000us         0.00%     102.592us      17.099us             6  
                                            aten::copy_         1.49%      34.332us        84.20%       1.936ms     322.703us      78.944us        56.83%     102.592us      17.099us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      78.944us        56.83%      78.944us      13.157us             6  
                          _rotary_dba7d1e::apply_rotary         1.83%      42.151us         3.66%      84.183us      14.030us      59.967us        43.17%      59.967us       9.995us             6  
void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      59.967us        43.17%      59.967us       9.995us             6  
                                Activity Buffer Request        73.81%       1.697ms        73.81%       1.697ms       1.697ms      23.648us        17.02%      23.648us      23.648us             1  
                                    aten::empty_strided         1.44%      33.190us         1.44%      33.190us       5.532us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync         8.89%     204.504us         8.89%     204.504us      34.084us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         1.37%      31.511us         1.74%      40.120us       3.343us       0.000us         0.00%       0.000us       0.000us            12  
                                       aten::as_strided         0.37%       8.609us         0.37%       8.609us       0.717us       0.000us         0.00%       0.000us       0.000us            12  
                                       cudaLaunchKernel         1.83%      42.032us         1.83%      42.032us       7.005us       0.000us         0.00%       0.000us       0.000us             6  
                                  cudaDeviceSynchronize         0.24%       5.461us         0.24%       5.461us       5.461us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.300ms
Self CUDA time total: 138.911us



======================================================================
PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H32_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                      hf_kernels_rotary         6.47%     175.111us        88.03%       2.384ms       2.384ms       0.000us         0.00%     770.847us     770.847us             1  
                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     713.695us       101.13%     713.695us     713.695us             1  
                                            aten::clone         1.03%      27.812us        76.48%       2.071ms     345.215us       0.000us         0.00%     568.671us      94.778us             6  
                                            aten::copy_         1.35%      36.632us        74.30%       2.012ms     335.367us     503.551us        71.35%     568.671us      94.778us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     503.551us        71.35%     503.551us      83.925us             6  
                          _rotary_dba7d1e::apply_rotary         1.88%      50.972us         3.57%      96.775us      16.129us     202.176us        28.65%     202.176us      33.696us             6  
void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us     202.176us        28.65%     202.176us      33.696us             6  
                                Activity Buffer Request        65.43%       1.772ms        65.43%       1.772ms       1.772ms      65.120us         9.23%      65.120us      65.120us             1  
                                    aten::empty_strided         1.15%      31.280us         1.15%      31.280us       5.213us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync         7.52%     203.605us         7.52%     203.605us      33.934us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         1.18%      32.050us         1.51%      41.000us       3.417us       0.000us         0.00%       0.000us       0.000us            12  
                                       aten::as_strided         0.33%       8.950us         0.33%       8.950us       0.746us       0.000us         0.00%       0.000us       0.000us            12  
                                       cudaLaunchKernel         1.69%      45.803us         1.69%      45.803us       7.634us       0.000us         0.00%       0.000us       0.000us             6  
                                  cudaDeviceSynchronize        11.97%     324.077us        11.97%     324.077us     324.077us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.708ms
Self CUDA time total: 705.727us



======================================================================
PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H8_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     336.477us      1265.33%     336.477us     336.477us             1  
                                      hf_kernels_rotary         7.39%     171.995us        99.77%       2.322ms       2.322ms       0.000us         0.00%      27.904us      27.904us             1  
                          _rotary_dba7d1e::apply_rotary         1.76%      41.061us         3.57%      83.061us      13.844us      18.816us        70.76%      18.816us       3.136us             6  
void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      18.816us        70.76%      18.816us       3.136us             6  
                                            aten::clone         1.24%      28.803us        87.11%       2.027ms     337.843us       0.000us         0.00%       9.088us       1.515us             6  
                                            aten::copy_         1.64%      38.082us        84.50%       1.966ms     327.735us       7.776us        29.24%       9.088us       1.515us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.776us        29.24%       7.776us       1.296us             6  
                                Activity Buffer Request        72.85%       1.695ms        72.85%       1.695ms       1.695ms       1.312us         4.93%       1.312us       1.312us             1  
                                    aten::empty_strided         1.37%      31.849us         1.37%      31.849us       5.308us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        10.01%     232.925us        10.01%     232.925us      38.821us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         1.33%      30.872us         1.71%      39.700us       3.308us       0.000us         0.00%       0.000us       0.000us            12  
                                       aten::as_strided         0.38%       8.828us         0.38%       8.828us       0.736us       0.000us         0.00%       0.000us       0.000us            12  
                                       cudaLaunchKernel         1.80%      42.000us         1.80%      42.000us       7.000us       0.000us         0.00%       0.000us       0.000us             6  
                                  cudaDeviceSynchronize         0.23%       5.320us         0.23%       5.320us       5.320us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.327ms
Self CUDA time total: 26.592us



======================================================================
PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H8_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     361.593us      1356.47%     361.593us     361.593us             1  
                                      hf_kernels_rotary        20.01%     156.574us        99.29%     776.889us     776.889us       0.000us         0.00%      27.969us      27.969us             1  
                          _rotary_dba7d1e::apply_rotary         6.08%      47.572us        11.90%      93.114us      15.519us      18.881us        70.83%      18.881us       3.147us             6  
void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      18.881us        70.83%      18.881us       3.147us             6  
                                            aten::clone         2.86%      22.401us        61.73%     483.012us      80.502us       0.000us         0.00%       9.088us       1.515us             6  
                                            aten::copy_         5.02%      39.248us        54.77%     428.540us      71.423us       7.776us        29.17%       9.088us       1.515us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.776us        29.17%       7.776us       1.296us             6  
                                Activity Buffer Request        23.43%     183.305us        23.43%     183.305us     183.305us       1.312us         4.92%       1.312us       1.312us             1  
                                    aten::empty_strided         4.10%      32.071us         4.10%      32.071us       5.345us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        26.33%     205.987us        26.33%     205.987us      34.331us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         4.44%      34.709us         5.65%      44.189us       3.682us       0.000us         0.00%       0.000us       0.000us            12  
                                       aten::as_strided         1.21%       9.480us         1.21%       9.480us       0.790us       0.000us         0.00%       0.000us       0.000us            12  
                                       cudaLaunchKernel         5.82%      45.542us         5.82%      45.542us       7.590us       0.000us         0.00%       0.000us       0.000us             6  
                                  cudaDeviceSynchronize         0.71%       5.560us         0.71%       5.560us       5.560us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 782.449us
Self CUDA time total: 26.657us



======================================================================
PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H32_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     329.530us      1073.84%     329.530us     329.530us             1  
                                      hf_kernels_rotary        18.69%     150.673us        99.29%     800.610us     800.610us       0.000us         0.00%      32.447us      32.447us             1  
                          _rotary_dba7d1e::apply_rotary         5.09%      41.061us        10.37%      83.622us      13.937us      20.159us        65.69%      20.159us       3.360us             6  
void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      20.159us        65.69%      20.159us       3.360us             6  
                                            aten::clone         2.45%      19.762us        65.24%     526.013us      87.669us       0.000us         0.00%      12.288us       2.048us             6  
                                            aten::copy_         4.31%      34.749us        58.96%     475.401us      79.234us      10.528us        34.31%      12.288us       2.048us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.528us        34.31%      10.528us       1.755us             6  
                                Activity Buffer Request        29.87%     240.876us        29.87%     240.876us     240.876us       1.760us         5.74%       1.760us       1.760us             1  
                                    aten::empty_strided         3.83%      30.850us         3.83%      30.850us       5.142us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        24.78%     199.776us        24.78%     199.776us      33.296us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         3.92%      31.582us         5.00%      40.302us       3.358us       0.000us         0.00%       0.000us       0.000us            12  
                                       aten::as_strided         1.08%       8.720us         1.08%       8.720us       0.727us       0.000us         0.00%       0.000us       0.000us            12  
                                       cudaLaunchKernel         5.28%      42.561us         5.28%      42.561us       7.094us       0.000us         0.00%       0.000us       0.000us             6  
                                  cudaDeviceSynchronize         0.71%       5.701us         0.71%       5.701us       5.701us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 806.311us
Self CUDA time total: 30.687us



======================================================================
PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H32_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     329.435us       772.31%     329.435us     329.435us             1  
                                      hf_kernels_rotary        20.10%     145.342us        99.21%     717.528us     717.528us       0.000us         0.00%      45.504us      45.504us             1  
                          _rotary_dba7d1e::apply_rotary         5.72%      41.400us        11.55%      83.552us      13.925us      25.568us        59.94%      25.568us       4.261us             6  
void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      25.568us        59.94%      25.568us       4.261us             6  
                                            aten::clone         2.77%      20.032us        61.99%     448.302us      74.717us       0.000us         0.00%      19.936us       3.323us             6  
                                            aten::copy_         4.93%      35.690us        55.04%     398.089us      66.348us      17.088us        40.06%      19.936us       3.323us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.088us        40.06%      17.088us       2.848us             6  
                                Activity Buffer Request        22.43%     162.214us        22.43%     162.214us     162.214us       2.848us         6.68%       2.848us       2.848us             1  
                                    aten::empty_strided         4.17%      30.181us         4.17%      30.181us       5.030us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        27.68%     200.185us        27.68%     200.185us      33.364us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         4.37%      31.593us         5.58%      40.332us       3.361us       0.000us         0.00%       0.000us       0.000us            12  
                                       aten::as_strided         1.21%       8.739us         1.21%       8.739us       0.728us       0.000us         0.00%       0.000us       0.000us            12  
                                       cudaLaunchKernel         5.83%      42.152us         5.83%      42.152us       7.025us       0.000us         0.00%       0.000us       0.000us             6  
                                  cudaDeviceSynchronize         0.79%       5.700us         0.79%       5.700us       5.700us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 723.228us
Self CUDA time total: 42.656us



======================================================================
PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H8_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     330.651us      1075.32%     330.651us     330.651us             1  
                                      hf_kernels_rotary        18.72%     150.931us        99.33%     800.790us     800.790us       0.000us         0.00%      32.508us      32.508us             1  
                          _rotary_dba7d1e::apply_rotary         5.21%      41.984us        10.24%      82.545us      13.758us      20.318us        66.08%      20.318us       3.386us             6  
void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      20.318us        66.08%      20.318us       3.386us             6  
                                            aten::clone         2.51%      20.253us        65.40%     527.224us      87.871us       0.000us         0.00%      12.190us       2.032us             6  
                                            aten::copy_         4.39%      35.371us        59.06%     476.161us      79.360us      10.431us        33.92%      12.190us       2.032us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.431us        33.92%      10.431us       1.738us             6  
                                Activity Buffer Request        30.20%     243.496us        30.20%     243.496us     243.496us       1.759us         5.72%       1.759us       1.759us             1  
                                    aten::empty_strided         3.82%      30.810us         3.82%      30.810us       5.135us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        24.47%     197.294us        24.47%     197.294us      32.882us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         3.87%      31.220us         4.97%      40.090us       3.341us       0.000us         0.00%       0.000us       0.000us            12  
                                       aten::as_strided         1.10%       8.870us         1.10%       8.870us       0.739us       0.000us         0.00%       0.000us       0.000us            12  
                                       cudaLaunchKernel         5.03%      40.561us         5.03%      40.561us       6.760us       0.000us         0.00%       0.000us       0.000us             6  
                                  cudaDeviceSynchronize         0.67%       5.380us         0.67%       5.380us       5.380us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 806.170us
Self CUDA time total: 30.749us



======================================================================
PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H8_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     330.616us       770.43%     330.616us     330.616us             1  
                                      hf_kernels_rotary        17.99%     148.834us        99.35%     822.020us     822.020us       0.000us         0.00%      45.761us      45.761us             1  
                          _rotary_dba7d1e::apply_rotary         5.00%      41.361us        10.40%      86.012us      14.335us      25.857us        60.25%      25.857us       4.310us             6  
void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      25.857us        60.25%      25.857us       4.310us             6  
                                            aten::clone         2.40%      19.879us        66.22%     547.932us      91.322us       0.000us         0.00%      19.904us       3.317us             6  
                                            aten::copy_         4.29%      35.520us        60.10%     497.262us      82.877us      17.056us        39.75%      19.904us       3.317us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.056us        39.75%      17.056us       2.843us             6  
                                Activity Buffer Request        32.00%     264.767us        32.00%     264.767us     264.767us       2.848us         6.64%       2.848us       2.848us             1  
                                    aten::empty_strided         3.72%      30.791us         3.72%      30.791us       5.132us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        23.81%     196.975us        23.81%     196.975us      32.829us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         3.76%      31.140us         4.74%      39.242us       3.270us       0.000us         0.00%       0.000us       0.000us            12  
                                       aten::as_strided         0.98%       8.102us         0.98%       8.102us       0.675us       0.000us         0.00%       0.000us       0.000us            12  
                                       cudaLaunchKernel         5.40%      44.651us         5.40%      44.651us       7.442us       0.000us         0.00%       0.000us       0.000us             6  
                                  cudaDeviceSynchronize         0.65%       5.369us         0.65%       5.369us       5.369us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 827.389us
Self CUDA time total: 42.913us



======================================================================
PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H32_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     334.653us       358.15%     334.653us     334.653us             1  
                                      hf_kernels_rotary        18.09%     151.114us        99.35%     829.780us     829.780us       0.000us         0.00%     109.215us     109.215us             1  
                                            aten::clone         3.33%      27.839us        66.37%     554.323us      92.387us       0.000us         0.00%      68.064us      11.344us             6  
                                            aten::copy_         4.18%      34.911us        59.27%     495.081us      82.513us      52.288us        55.96%      68.064us      11.344us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      52.288us        55.96%      52.288us       8.715us             6  
                          _rotary_dba7d1e::apply_rotary         4.95%      41.342us         9.97%      83.303us      13.884us      41.151us        44.04%      41.151us       6.858us             6  
void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      41.151us        44.04%      41.151us       6.858us             6  
                                Activity Buffer Request        31.64%     264.256us        31.64%     264.256us     264.256us      15.776us        16.88%      15.776us      15.776us             1  
                                    aten::empty_strided         3.76%      31.403us         3.76%      31.403us       5.234us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        23.46%     195.914us        23.46%     195.914us      32.652us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         3.87%      32.310us         4.91%      41.040us       3.420us       0.000us         0.00%       0.000us       0.000us            12  
                                       aten::as_strided         1.05%       8.730us         1.05%       8.730us       0.728us       0.000us         0.00%       0.000us       0.000us            12  
                                       cudaLaunchKernel         5.02%      41.961us         5.02%      41.961us       6.994us       0.000us         0.00%       0.000us       0.000us             6  
                                  cudaDeviceSynchronize         0.65%       5.470us         0.65%       5.470us       5.470us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 835.250us
Self CUDA time total: 93.439us



======================================================================
PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H32_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     402.293us       278.14%     402.293us     402.293us             1  
                                      hf_kernels_rotary        18.23%     162.247us        99.42%     884.932us     884.932us       0.000us         0.00%     168.347us     168.347us             1  
                                            aten::clone         2.56%      22.809us        61.35%     546.082us      91.014us       0.000us         0.00%     105.212us      17.535us             6  
                                            aten::copy_         3.90%      34.711us        55.16%     490.942us      81.824us      81.501us        56.35%     105.212us      17.535us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      81.501us        56.35%      81.501us      13.584us             6  
                          _rotary_dba7d1e::apply_rotary         4.99%      44.421us        15.06%     134.003us      22.334us      63.135us        43.65%      63.135us      10.522us             6  
void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      63.135us        43.65%      63.135us      10.522us             6  
                                Activity Buffer Request        29.54%     262.907us        29.54%     262.907us     262.907us      23.711us        16.39%      23.711us      23.711us             1  
                                    aten::empty_strided         3.63%      32.331us         3.63%      32.331us       5.388us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        21.72%     193.324us        21.72%     193.324us      32.221us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         3.75%      33.378us         4.79%      42.600us       3.550us       0.000us         0.00%       0.000us       0.000us            12  
                                       aten::as_strided         1.04%       9.222us         1.04%       9.222us       0.769us       0.000us         0.00%       0.000us       0.000us            12  
                                       cudaLaunchKernel        10.06%      89.582us        10.06%      89.582us      14.930us       0.000us         0.00%       0.000us       0.000us             6  
                                  cudaDeviceSynchronize         0.58%       5.120us         0.58%       5.120us       5.120us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 890.052us
Self CUDA time total: 144.636us



======================================================================
PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H8_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     335.030us       423.20%     335.030us     335.030us             1  
                                      hf_kernels_rotary        17.60%     149.462us        99.35%     843.781us     843.781us       0.000us         0.00%      89.342us      89.342us             1  
                                            aten::clone         2.76%      23.471us        67.14%     570.174us      95.029us       0.000us         0.00%      47.328us       7.888us             6  
                                            aten::copy_         4.78%      40.570us        60.71%     515.622us      85.937us      37.152us        46.93%      47.328us       7.888us             6  
                          _rotary_dba7d1e::apply_rotary         4.86%      41.293us         9.90%      84.043us      14.007us      42.014us        53.07%      42.014us       7.002us             6  
void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      42.014us        53.07%      42.014us       7.002us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      37.152us        46.93%      37.152us       6.192us             6  
                                Activity Buffer Request        33.04%     280.637us        33.04%     280.637us     280.637us      10.176us        12.85%      10.176us      10.176us             1  
                                    aten::empty_strided         3.66%      31.081us         3.66%      31.081us       5.180us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        22.89%     194.415us        22.89%     194.415us      32.403us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         3.65%      30.971us         4.72%      40.102us       3.342us       0.000us         0.00%       0.000us       0.000us            12  
                                       aten::as_strided         1.08%       9.131us         1.08%       9.131us       0.761us       0.000us         0.00%       0.000us       0.000us            12  
                                       cudaLaunchKernel         5.03%      42.750us         5.03%      42.750us       7.125us       0.000us         0.00%       0.000us       0.000us             6  
                                  cudaDeviceSynchronize         0.65%       5.500us         0.65%       5.500us       5.500us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 849.281us
Self CUDA time total: 79.166us



======================================================================
PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H8_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     335.451us       231.31%     335.451us     335.451us             1  
                                      hf_kernels_rotary        20.10%     149.473us        99.28%     738.378us     738.378us       0.000us         0.00%     168.637us     168.637us             1  
                                            aten::clone         2.71%      20.190us        62.55%     465.160us      77.527us       0.000us         0.00%     104.892us      17.482us             6  
                                            aten::copy_         4.80%      35.671us        55.64%     413.780us      68.963us      81.277us        56.04%     104.892us      17.482us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      81.277us        56.04%      81.277us      13.546us             6  
                          _rotary_dba7d1e::apply_rotary         5.48%      40.762us        11.33%      84.273us      14.046us      63.745us        43.96%      63.745us      10.624us             6  
void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      63.745us        43.96%      63.745us      10.624us             6  
                                Activity Buffer Request        25.22%     187.575us        25.22%     187.575us     187.575us      23.615us        16.28%      23.615us      23.615us             1  
                                    aten::empty_strided         4.19%      31.190us         4.19%      31.190us       5.198us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        25.62%     190.534us        25.62%     190.534us      31.756us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         4.11%      30.580us         5.31%      39.472us       3.289us       0.000us         0.00%       0.000us       0.000us            12  
                                       aten::as_strided         1.20%       8.892us         1.20%       8.892us       0.741us       0.000us         0.00%       0.000us       0.000us            12  
                                       cudaLaunchKernel         5.85%      43.511us         5.85%      43.511us       7.252us       0.000us         0.00%       0.000us       0.000us             6  
                                  cudaDeviceSynchronize         0.72%       5.340us         0.72%       5.340us       5.340us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 743.718us
Self CUDA time total: 145.022us



======================================================================
PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H32_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                      hf_kernels_rotary        13.47%     148.469us        71.31%     785.819us     785.819us       0.000us         0.00%     741.458us     741.458us             1  
                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     683.092us       101.20%     683.092us     683.092us             1  
                                            aten::clone         1.91%      21.000us        46.53%     512.672us      85.445us       0.000us         0.00%     558.390us      93.065us             6  
                                            aten::copy_         3.10%      34.171us        41.83%     460.962us      76.827us     491.927us        72.88%     558.390us      93.065us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     491.927us        72.88%     491.927us      81.988us             6  
                          _rotary_dba7d1e::apply_rotary         3.81%      41.992us         7.74%      85.293us      14.215us     183.068us        27.12%     183.068us      30.511us             6  
void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us     183.068us        27.12%     183.068us      30.511us             6  
                                Activity Buffer Request        21.20%     233.636us        21.20%     233.636us     233.636us      66.463us         9.85%      66.463us      66.463us             1  
                                    aten::empty_strided         2.79%      30.710us         2.79%      30.710us       5.118us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        17.53%     193.155us        17.53%     193.155us      32.192us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         2.79%      30.724us         3.57%      39.385us       3.282us       0.000us         0.00%       0.000us       0.000us            12  
                                       aten::as_strided         0.79%       8.661us         0.79%       8.661us       0.722us       0.000us         0.00%       0.000us       0.000us            12  
                                       cudaLaunchKernel         3.93%      43.301us         3.93%      43.301us       7.217us       0.000us         0.00%       0.000us       0.000us             6  
                                  cudaDeviceSynchronize        28.69%     316.097us        28.69%     316.097us     316.097us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 1.102ms
Self CUDA time total: 674.995us



======================================================================
PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H32_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                      hf_kernels_rotary         5.24%     147.368us        25.60%     720.668us     720.668us       0.000us         0.00%       2.633ms       2.633ms             1  
                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us       2.458ms       100.33%       2.458ms       2.458ms             1  
                                            aten::clone         0.73%      20.601us        15.98%     449.780us      74.963us       0.000us         0.00%       1.394ms     232.377us             6  
                                            aten::copy_         1.20%      33.732us        14.16%     398.690us      66.448us       1.211ms        49.43%       1.394ms     232.377us             6  
                          _rotary_dba7d1e::apply_rotary         1.46%      41.000us         3.02%      85.091us      14.182us       1.239ms        50.57%       1.239ms     206.500us             6  
void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us       1.239ms        50.57%       1.239ms     206.500us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       1.211ms        49.43%       1.211ms     201.812us             6  
                                Activity Buffer Request         6.06%     170.604us         6.06%     170.604us     170.604us     183.391us         7.49%     183.391us     183.391us             1  
                                    aten::empty_strided         1.08%      30.489us         1.08%      30.489us       5.081us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync         6.90%     194.354us         6.90%     194.354us      32.392us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         1.07%      30.231us         1.37%      38.429us       3.202us       0.000us         0.00%       0.000us       0.000us            12  
                                       aten::as_strided         0.29%       8.198us         0.29%       8.198us       0.683us       0.000us         0.00%       0.000us       0.000us            12  
                                       cudaLaunchKernel         1.57%      44.091us         1.57%      44.091us       7.349us       0.000us         0.00%       0.000us       0.000us             6  
                                  cudaDeviceSynchronize        74.40%       2.094ms        74.40%       2.094ms       2.094ms       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.815ms
Self CUDA time total: 2.450ms


impl                     wl                  p50(ms)  ok
hf_kernels_rotary        cuda_B1_S128_H32_D128_R64     0.09  True
hf_kernels_rotary        cuda_B1_S128_H32_D64_R32     0.09  True
hf_kernels_rotary        cuda_B1_S128_H8_D128_R64     0.09  True
hf_kernels_rotary        cuda_B1_S128_H8_D64_R32     0.08  True
hf_kernels_rotary        cuda_B1_S2048_H32_D128_R64     0.26  True
hf_kernels_rotary        cuda_B1_S2048_H32_D64_R32     0.09  True
hf_kernels_rotary        cuda_B1_S2048_H8_D128_R64     0.09  True
hf_kernels_rotary        cuda_B1_S2048_H8_D64_R32     0.09  True
hf_kernels_rotary        cuda_B1_S512_H32_D128_R64     0.09  True
hf_kernels_rotary        cuda_B1_S512_H32_D64_R32     0.09  True
hf_kernels_rotary        cuda_B1_S512_H8_D128_R64     0.09  True
hf_kernels_rotary        cuda_B1_S512_H8_D64_R32     0.09  True
hf_kernels_rotary        cuda_B2_S128_H32_D128_R64     0.09  True
hf_kernels_rotary        cuda_B2_S128_H32_D64_R32     0.09  True
hf_kernels_rotary        cuda_B2_S128_H8_D128_R64     0.09  True
hf_kernels_rotary        cuda_B2_S128_H8_D64_R32     0.09  True
hf_kernels_rotary        cuda_B2_S2048_H32_D128_R64     0.85  True
hf_kernels_rotary        cuda_B2_S2048_H32_D64_R32     0.26  True
hf_kernels_rotary        cuda_B2_S2048_H8_D128_R64     0.09  True
hf_kernels_rotary        cuda_B2_S2048_H8_D64_R32     0.09  True
hf_kernels_rotary        cuda_B2_S512_H32_D128_R64     0.09  True
hf_kernels_rotary        cuda_B2_S512_H32_D64_R32     0.09  True
hf_kernels_rotary        cuda_B2_S512_H8_D128_R64     0.09  True
hf_kernels_rotary        cuda_B2_S512_H8_D64_R32     0.09  True
▶ UV Install Logs
Fetching 5 files: 0%| | 0/5 [00:00<?, ?it/s]Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads. Fetching 5 files: 40%|████ | 2/5 [00:01<00:02, 1.08it/s] Fetching 5 files: 100%|██████████| 5/5 [00:01<00:00, 2.71it/s]

Artifacts:

rotary.jsonl