HF Kernels - Rotary Position Embeddings

GPU Info

▼ code ▼ output ▶ uv-logs | Cell: nv | 0.25s | Raw GitHub 🤗 HF
import subprocess

# Record which GPU and driver the benchmark ran on: invoke `nvidia-smi`
# (argument list, no shell) and echo its standard output. Standard error is
# captured as well but is not printed.
print(
    subprocess.run(
        ["nvidia-smi"],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    ).stdout
)
Fri Dec 19 18:55:27 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 580.105.08             Driver Version: 580.105.08     CUDA Version: 13.0     |
+-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
| N/A   28C    P0             85W /  350W |       0MiB /  46068MiB |     34%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+

+-----------------------------------------------------------------------------------------+
| Processes:                                                                              |
|  GPU   GI   CI              PID   Type   Process name                        GPU Memory |
|        ID   ID                                                               Usage      |
|=========================================================================================|
|  No running processes found                                                             |
+-----------------------------------------------------------------------------------------+

Rotary Embeddings Benchmark

▼ code ▼ output ▶ uv-logs | Cell: benchmark | 34.01s | Raw GitHub 🤗 HF
# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "numpy",
#     "torch==2.8.0",
#     "kernels-benchmark-tools",
#     "kernels",
# ]
#
# [tool.uv.sources]
# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
# ///
import torch
import sys
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
from kernels import get_kernel

# Load the rotary kernel identified by "kernels-community/rotary" through
# `kernels.get_kernel`. The returned object is used by `hf_kernels_rotary`
# below for its `apply_rotary` entry point.
# NOTE(review): presumably this fetches a prebuilt kernel from the Hugging Face
# Hub on first use, so it needs network access — confirm against the `kernels`
# package documentation.
rotary = get_kernel("kernels-community/rotary")


def hf_kernels_rotary(query, key, cos, sin, conj=False):
    """Rotate copies of ``query`` and ``key`` with the loaded rotary kernel.

    The width of each rotated half is ``cos.shape[-1]``. Only the first
    ``2 * cos.shape[-1]`` entries along the last dimension are handed to
    ``rotary.apply_rotary``; the remaining entries of each copy are never
    passed to the kernel.

    Args:
        query: Tensor to rotate. It is cloned first, so the caller's tensor
            is not written to by this function.
        key: Tensor to rotate, handled exactly like ``query``.
        cos: Forwarded to the kernel; its last dimension sets the half width.
        sin: Forwarded to the kernel unchanged.
        conj: Forwarded to the kernel unchanged.
            NOTE(review): presumably selects the conjugate (inverse)
            rotation — confirm against the kernel's documentation.

    Returns:
        A ``(rotated_query, rotated_key)`` tuple of the cloned tensors.
    """
    half_width = cos.shape[-1]

    # Work on private copies; both clones are made before any kernel call.
    rotated = (query.clone(), key.clone())

    for tensor in rotated:
        lower = tensor[..., :half_width]
        upper = tensor[..., half_width : 2 * half_width]
        # The same two views are supplied in the 1st/2nd and 5th/6th argument
        # positions, so the kernel's results land directly in the clone
        # (presumably inputs followed by outputs — TODO confirm signature).
        rotary.apply_rotary(lower, upper, cos, sin, lower, upper, conj)

    return rotated


# Hand the wrapper above to the shared benchmark harness from
# `kernels_benchmark_tools`, selecting the rotary kernel type.
# NOTE(review): workload shapes, timing and profiling are decided inside
# `run_benchmark`, which is not visible here — consult that package for details.
run_benchmark(
    kernel_type=KernelTypeEnum.ROTARY,
    # Label for this implementation; it appears in the profile trace headers.
    impl_name="hf_kernels_rotary",
    impl_tags={"family": "hf-kernels", "backend": "cuda"},
    impl_func=hf_kernels_rotary,
    # NOTE(review): presumably the dtype of the generated input tensors —
    # confirm against the harness.
    dtype="float32",
)
Running rotary benchmark on cuda with 24 workloads.

======================================================================
PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H8_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     443.904us      1908.20%     443.904us     443.904us             1  
                                      hf_kernels_rotary         5.70%     250.202us        82.78%       3.636ms       3.636ms       0.000us         0.00%      24.543us      24.543us             1  
                          _rotary_dba7d1e::apply_rotary         1.26%      55.422us         2.46%     108.103us      18.017us      16.158us        69.46%      16.158us       2.693us             6  
void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      16.158us        69.46%      16.158us       2.693us             6  
                                            aten::clone         1.27%      55.831us        73.22%       3.217ms     536.122us       0.000us         0.00%       8.385us       1.398us             6  
                                            aten::copy_         1.34%      58.721us        52.83%       2.321ms     386.805us       7.105us        30.54%       8.385us       1.398us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.105us        30.54%       7.105us       1.184us             6  
                                Activity Buffer Request        49.67%       2.182ms        49.67%       2.182ms       2.182ms       1.280us         5.50%       1.280us       1.280us             1  
                                    aten::empty_strided        19.12%     840.074us        19.12%     840.074us     140.012us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync         1.82%      80.012us         1.82%      80.012us      13.335us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         1.10%      48.201us         1.40%      61.410us       5.118us       0.000us         0.00%       0.000us       0.000us            12  
                                       aten::as_strided         0.30%      13.209us         0.30%      13.209us       1.101us       0.000us         0.00%       0.000us       0.000us            12  
                                       cudaLaunchKernel         1.20%      52.681us         1.20%      52.681us       8.780us       0.000us         0.00%       0.000us       0.000us             6  
                                  cudaDeviceSynchronize        17.22%     756.622us        17.22%     756.622us     756.622us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 4.393ms
Self CUDA time total: 23.263us



======================================================================
PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H8_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     319.740us      1332.31%     319.740us     319.740us             1  
                                      hf_kernels_rotary         7.66%     161.733us        99.74%       2.106ms       2.106ms       0.000us         0.00%      25.311us      25.311us             1  
                          _rotary_dba7d1e::apply_rotary         1.93%      40.672us         3.87%      81.612us      13.602us      16.127us        67.20%      16.127us       2.688us             6  
void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      16.127us        67.20%      16.127us       2.688us             6  
                                            aten::clone         0.96%      20.317us        86.36%       1.823ms     303.883us       0.000us         0.00%       9.184us       1.531us             6  
                                            aten::copy_         1.65%      34.881us        83.94%       1.772ms     295.377us       7.872us        32.80%       9.184us       1.531us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.872us        32.80%       7.872us       1.312us             6  
                                Activity Buffer Request        79.76%       1.684ms        79.76%       1.684ms       1.684ms       1.312us         5.47%       1.312us       1.312us             1  
                                    aten::empty_strided         1.46%      30.722us         1.46%      30.722us       5.120us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync         2.53%      53.351us         2.53%      53.351us       8.892us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         1.45%      30.570us         1.85%      39.091us       3.258us       0.000us         0.00%       0.000us       0.000us            12  
                                       aten::as_strided         0.40%       8.521us         0.40%       8.521us       0.710us       0.000us         0.00%       0.000us       0.000us            12  
                                       cudaLaunchKernel         1.94%      40.940us         1.94%      40.940us       6.823us       0.000us         0.00%       0.000us       0.000us             6  
                                  cudaDeviceSynchronize         0.26%       5.581us         0.26%       5.581us       5.581us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.111ms
Self CUDA time total: 23.999us



======================================================================
PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H32_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     318.846us      1316.19%     318.846us     318.846us             1  
                                      hf_kernels_rotary         7.42%     158.343us        99.75%       2.128ms       2.128ms       0.000us         0.00%      25.505us      25.505us             1  
                          _rotary_dba7d1e::apply_rotary         1.98%      42.310us         3.93%      83.780us      13.963us      16.544us        68.29%      16.544us       2.757us             6  
void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      16.544us        68.29%      16.544us       2.757us             6  
                                            aten::clone         0.91%      19.412us        86.52%       1.845ms     307.560us       0.000us         0.00%       8.961us       1.493us             6  
                                            aten::copy_         1.63%      34.769us        84.21%       1.796ms     299.358us       7.681us        31.71%       8.961us       1.493us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.681us        31.71%       7.681us       1.280us             6  
                                Activity Buffer Request        80.16%       1.710ms        80.16%       1.710ms       1.710ms       1.280us         5.28%       1.280us       1.280us             1  
                                    aten::empty_strided         1.40%      29.800us         1.40%      29.800us       4.967us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync         2.42%      51.552us         2.42%      51.552us       8.592us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         1.48%      31.501us         1.89%      40.211us       3.351us       0.000us         0.00%       0.000us       0.000us            12  
                                       aten::as_strided         0.41%       8.710us         0.41%       8.710us       0.726us       0.000us         0.00%       0.000us       0.000us            12  
                                       cudaLaunchKernel         1.94%      41.470us         1.94%      41.470us       6.912us       0.000us         0.00%       0.000us       0.000us             6  
                                  cudaDeviceSynchronize         0.25%       5.300us         0.25%       5.300us       5.300us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.133ms
Self CUDA time total: 24.225us



======================================================================
PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H32_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     321.313us      1144.93%     321.313us     321.313us             1  
                                      hf_kernels_rotary         6.51%     155.543us        99.79%       2.386ms       2.386ms       0.000us         0.00%      29.792us      29.792us             1  
                          _rotary_dba7d1e::apply_rotary         1.75%      41.730us         3.43%      82.081us      13.680us      17.695us        63.05%      17.695us       2.949us             6  
void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      17.695us        63.05%      17.695us       2.949us             6  
                                            aten::clone         0.86%      20.620us        88.15%       2.107ms     351.189us       0.000us         0.00%      12.097us       2.016us             6  
                                            aten::copy_         1.42%      33.971us        86.01%       2.056ms     342.697us      10.369us        36.95%      12.097us       2.016us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.369us        36.95%      10.369us       1.728us             6  
                                Activity Buffer Request        74.19%       1.773ms        74.19%       1.773ms       1.773ms       1.728us         6.16%       1.728us       1.728us             1  
                                    aten::empty_strided         1.27%      30.330us         1.27%      30.330us       5.055us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        10.41%     248.804us        10.41%     248.804us      41.467us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         1.33%      31.699us         1.70%      40.751us       3.396us       0.000us         0.00%       0.000us       0.000us            12  
                                       aten::as_strided         0.38%       9.052us         0.38%       9.052us       0.754us       0.000us         0.00%       0.000us       0.000us            12  
                                       cudaLaunchKernel         1.69%      40.351us         1.69%      40.351us       6.725us       0.000us         0.00%       0.000us       0.000us             6  
                                  cudaDeviceSynchronize         0.21%       5.000us         0.21%       5.000us       5.000us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.391ms
Self CUDA time total: 28.064us



======================================================================
PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H8_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     325.341us      1337.81%     325.341us     325.341us             1  
                                      hf_kernels_rotary         6.82%     156.303us        99.76%       2.286ms       2.286ms       0.000us         0.00%      25.599us      25.599us             1  
                          _rotary_dba7d1e::apply_rotary         1.86%      42.600us         3.70%      84.711us      14.118us      16.512us        67.90%      16.512us       2.752us             6  
void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      16.512us        67.90%      16.512us       2.752us             6  
                                            aten::clone         0.89%      20.410us        87.49%       2.005ms     334.180us       0.000us         0.00%       9.087us       1.514us             6  
                                            aten::copy_         1.54%      35.239us        85.28%       1.954ms     325.734us       7.807us        32.10%       9.087us       1.514us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.807us        32.10%       7.807us       1.301us             6  
                                Activity Buffer Request        73.15%       1.676ms        73.15%       1.676ms       1.676ms       1.280us         5.26%       1.280us       1.280us             1  
                                    aten::empty_strided         1.32%      30.271us         1.32%      30.271us       5.045us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        10.60%     242.835us        10.60%     242.835us      40.472us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         1.38%      31.561us         1.75%      40.080us       3.340us       0.000us         0.00%       0.000us       0.000us            12  
                                       aten::as_strided         0.37%       8.519us         0.37%       8.519us       0.710us       0.000us         0.00%       0.000us       0.000us            12  
                                       cudaLaunchKernel         1.84%      42.111us         1.84%      42.111us       7.018us       0.000us         0.00%       0.000us       0.000us             6  
                                  cudaDeviceSynchronize         0.24%       5.490us         0.24%       5.490us       5.490us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.292ms
Self CUDA time total: 24.319us



======================================================================
PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H8_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     323.675us      1148.27%     323.675us     323.675us             1  
                                      hf_kernels_rotary         6.64%     156.971us        99.78%       2.359ms       2.359ms       0.000us         0.00%      29.948us      29.948us             1  
                          _rotary_dba7d1e::apply_rotary         1.69%      39.892us         3.46%      81.891us      13.648us      17.695us        62.77%      17.695us       2.949us             6  
void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      17.695us        62.77%      17.695us       2.949us             6  
                                            aten::clone         0.91%      21.443us        87.99%       2.080ms     346.632us       0.000us         0.00%      12.253us       2.042us             6  
                                            aten::copy_         1.41%      33.321us        85.79%       2.028ms     337.974us      10.493us        37.23%      12.253us       2.042us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.493us        37.23%      10.493us       1.749us             6  
                                Activity Buffer Request        74.45%       1.760ms        74.45%       1.760ms       1.760ms       1.760us         6.24%       1.760us       1.760us             1  
                                    aten::empty_strided         1.29%      30.510us         1.29%      30.510us       5.085us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync         9.92%     234.593us         9.92%     234.593us      39.099us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         1.33%      31.410us         1.69%      40.001us       3.333us       0.000us         0.00%       0.000us       0.000us            12  
                                       aten::as_strided         0.36%       8.591us         0.36%       8.591us       0.716us       0.000us         0.00%       0.000us       0.000us            12  
                                       cudaLaunchKernel         1.78%      41.999us         1.78%      41.999us       7.000us       0.000us         0.00%       0.000us       0.000us             6  
                                  cudaDeviceSynchronize         0.22%       5.111us         0.22%       5.111us       5.111us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.364ms
Self CUDA time total: 28.188us



======================================================================
PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H32_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     324.125us       795.09%     324.125us     324.125us             1  
                                      hf_kernels_rotary         6.45%     153.782us        99.79%       2.380ms       2.380ms       0.000us         0.00%      43.614us      43.614us             1  
                          _rotary_dba7d1e::apply_rotary         1.81%      43.080us         3.50%      83.561us      13.927us      23.646us        58.00%      23.646us       3.941us             6  
void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      23.646us        58.00%      23.646us       3.941us             6  
                                            aten::clone         0.86%      20.470us        88.19%       2.103ms     350.526us       0.000us         0.00%      19.968us       3.328us             6  
                                            aten::copy_         1.45%      34.599us        85.96%       2.050ms     341.697us      17.120us        42.00%      19.968us       3.328us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.120us        42.00%      17.120us       2.853us             6  
                                Activity Buffer Request        74.86%       1.785ms        74.86%       1.785ms       1.785ms       2.848us         6.99%       2.848us       2.848us             1  
                                    aten::empty_strided         1.36%      32.503us         1.36%      32.503us       5.417us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync         9.65%     230.194us         9.65%     230.194us      38.366us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         1.31%      31.171us         1.65%      39.351us       3.279us       0.000us         0.00%       0.000us       0.000us            12  
                                       aten::as_strided         0.34%       8.180us         0.34%       8.180us       0.682us       0.000us         0.00%       0.000us       0.000us            12  
                                       cudaLaunchKernel         1.70%      40.481us         1.70%      40.481us       6.747us       0.000us         0.00%       0.000us       0.000us             6  
                                  cudaDeviceSynchronize         0.21%       5.060us         0.21%       5.060us       5.060us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.385ms
Self CUDA time total: 40.766us



======================================================================
PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H32_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     326.719us       435.58%     326.719us     326.719us             1  
                                      hf_kernels_rotary         6.86%     156.773us        99.78%       2.282ms       2.282ms       0.000us         0.00%      83.583us      83.583us             1  
                                            aten::clone         0.90%      20.639us        87.67%       2.005ms     334.152us       0.000us         0.00%      44.415us       7.402us             6  
                                            aten::copy_         1.46%      33.361us        85.45%       1.954ms     325.685us      35.839us        47.78%      44.415us       7.402us             6  
                          _rotary_dba7d1e::apply_rotary         1.78%      40.770us         3.52%      80.530us      13.422us      39.168us        52.22%      39.168us       6.528us             6  
void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      39.168us        52.22%      39.168us       6.528us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      35.839us        47.78%      35.839us       5.973us             6  
                                Activity Buffer Request        74.00%       1.692ms        74.00%       1.692ms       1.692ms       8.576us        11.43%       8.576us       8.576us             1  
                                    aten::empty_strided         1.32%      30.160us         1.32%      30.160us       5.027us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync         9.99%     228.424us         9.99%     228.424us      38.071us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         1.36%      31.101us         1.73%      39.632us       3.303us       0.000us         0.00%       0.000us       0.000us            12  
                                       aten::as_strided         0.37%       8.531us         0.37%       8.531us       0.711us       0.000us         0.00%       0.000us       0.000us            12  
                                       cudaLaunchKernel         1.74%      39.760us         1.74%      39.760us       6.627us       0.000us         0.00%       0.000us       0.000us             6  
                                  cudaDeviceSynchronize         0.22%       5.030us         0.22%       5.030us       5.030us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.287ms
Self CUDA time total: 75.007us



======================================================================
PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H8_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     313.534us       775.13%     313.534us     313.534us             1  
                                      hf_kernels_rotary        17.26%     147.172us        99.40%     847.604us     847.604us       0.000us         0.00%      43.297us      43.297us             1  
                          _rotary_dba7d1e::apply_rotary         4.54%      38.680us         9.23%      78.702us      13.117us      23.425us        57.91%      23.425us       3.904us             6  
void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      23.425us        57.91%      23.425us       3.904us             6  
                                            aten::clone         2.34%      19.949us        68.60%     584.900us      97.483us       0.000us         0.00%      19.872us       3.312us             6  
                                            aten::copy_         3.97%      33.871us        62.79%     535.360us      89.227us      17.024us        42.09%      19.872us       3.312us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.024us        42.09%      17.024us       2.837us             6  
                                Activity Buffer Request        32.38%     276.114us        32.38%     276.114us     276.114us       2.848us         7.04%       2.848us       2.848us             1  
                                    aten::empty_strided         3.47%      29.591us         3.47%      29.591us       4.932us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        26.43%     225.375us        26.43%     225.375us      37.563us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         3.45%      29.390us         4.32%      36.830us       3.069us       0.000us         0.00%       0.000us       0.000us            12  
                                       aten::as_strided         0.87%       7.440us         0.87%       7.440us       0.620us       0.000us         0.00%       0.000us       0.000us            12  
                                       cudaLaunchKernel         4.69%      40.022us         4.69%      40.022us       6.670us       0.000us         0.00%       0.000us       0.000us             6  
                                  cudaDeviceSynchronize         0.60%       5.080us         0.60%       5.080us       5.080us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 852.684us
Self CUDA time total: 40.449us



======================================================================
PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H8_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     317.820us       416.43%     317.820us     317.820us             1  
                                      hf_kernels_rotary        17.56%     148.193us        99.41%     838.994us     838.994us       0.000us         0.00%      85.633us      85.633us             1  
                                            aten::clone         2.33%      19.639us        67.72%     571.550us      95.258us       0.000us         0.00%      46.146us       7.691us             6  
                                            aten::copy_         4.11%      34.662us        61.88%     522.280us      87.047us      36.833us        48.26%      46.146us       7.691us             6  
                          _rotary_dba7d1e::apply_rotary         4.71%      39.770us         9.52%      80.371us      13.395us      39.487us        51.74%      39.487us       6.581us             6  
void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      39.487us        51.74%      39.487us       6.581us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      36.833us        48.26%      36.833us       6.139us             6  
                                Activity Buffer Request        31.55%     266.304us        31.55%     266.304us     266.304us       9.313us        12.20%       9.313us       9.313us             1  
                                    aten::empty_strided         3.51%      29.631us         3.51%      29.631us       4.938us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        26.22%     221.314us        26.22%     221.314us      36.886us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         3.67%      30.998us         4.61%      38.880us       3.240us       0.000us         0.00%       0.000us       0.000us            12  
                                       aten::as_strided         0.93%       7.882us         0.93%       7.882us       0.657us       0.000us         0.00%       0.000us       0.000us            12  
                                       cudaLaunchKernel         4.81%      40.601us         4.81%      40.601us       6.767us       0.000us         0.00%       0.000us       0.000us             6  
                                  cudaDeviceSynchronize         0.59%       5.020us         0.59%       5.020us       5.020us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 844.014us
Self CUDA time total: 76.320us



======================================================================
PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H32_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     319.423us       229.00%     319.423us     319.423us             1  
                                      hf_kernels_rotary        18.34%     146.203us        99.38%     792.173us     792.173us       0.000us         0.00%     163.169us     163.169us             1  
                                            aten::clone         2.42%      19.310us        66.03%     526.340us      87.723us       0.000us         0.00%     102.753us      17.126us             6  
                                            aten::copy_         4.16%      33.151us        59.92%     477.629us      79.605us      79.073us        56.69%     102.753us      17.126us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      79.073us        56.69%      79.073us      13.179us             6  
                          _rotary_dba7d1e::apply_rotary         5.00%      39.859us        10.11%      80.600us      13.433us      60.416us        43.31%      60.416us      10.069us             6  
void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      60.416us        43.31%      60.416us      10.069us             6  
                                Activity Buffer Request        28.37%     226.154us        28.37%     226.154us     226.154us      23.680us        16.98%      23.680us      23.680us             1  
                                    aten::empty_strided         3.69%      29.401us         3.69%      29.401us       4.900us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        27.39%     218.324us        27.39%     218.324us      36.387us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         3.87%      30.870us         4.90%      39.030us       3.252us       0.000us         0.00%       0.000us       0.000us            12  
                                       aten::as_strided         1.02%       8.160us         1.02%       8.160us       0.680us       0.000us         0.00%       0.000us       0.000us            12  
                                       cudaLaunchKernel         5.11%      40.741us         5.11%      40.741us       6.790us       0.000us         0.00%       0.000us       0.000us             6  
                                  cudaDeviceSynchronize         0.62%       4.920us         0.62%       4.920us       4.920us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 797.093us
Self CUDA time total: 139.489us



======================================================================
PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H32_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                      hf_kernels_rotary        12.16%     146.839us        70.39%     850.223us     850.223us       0.000us         0.00%     769.020us     769.020us             1  
                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     710.970us       101.11%     710.970us     710.970us             1  
                                            aten::clone         3.16%      38.139us        48.21%     582.349us      97.058us       0.000us         0.00%     567.581us      94.597us             6  
                                            aten::copy_         2.68%      32.339us        42.57%     514.168us      85.695us     501.725us        71.35%     567.581us      94.597us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     501.725us        71.35%     501.725us      83.621us             6  
                          _rotary_dba7d1e::apply_rotary         3.42%      41.301us         6.76%      81.602us      13.600us     201.439us        28.65%     201.439us      33.573us             6  
void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us     201.439us        28.65%     201.439us      33.573us             6  
                                Activity Buffer Request        21.65%     261.455us        21.65%     261.455us     261.455us      65.856us         9.37%      65.856us      65.856us             1  
                                    aten::empty_strided         2.49%      30.042us         2.49%      30.042us       5.007us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        18.25%     220.374us        18.25%     220.374us      36.729us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         2.57%      30.999us         3.26%      39.433us       3.286us       0.000us         0.00%       0.000us       0.000us            12  
                                       aten::as_strided         0.70%       8.434us         0.70%       8.434us       0.703us       0.000us         0.00%       0.000us       0.000us            12  
                                       cudaLaunchKernel         3.34%      40.301us         3.34%      40.301us       6.717us       0.000us         0.00%       0.000us       0.000us             6  
                                  cudaDeviceSynchronize        29.61%     357.615us        29.61%     357.615us     357.615us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 1.208ms
Self CUDA time total: 703.164us



======================================================================
PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H8_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     317.658us      1193.13%     317.658us     317.658us             1  
                                      hf_kernels_rotary        17.50%     146.563us        99.40%     832.443us     832.443us       0.000us         0.00%      27.968us      27.968us             1  
                          _rotary_dba7d1e::apply_rotary         4.88%      40.860us         9.91%      82.980us      13.830us      18.751us        70.43%      18.751us       3.125us             6  
void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      18.751us        70.43%      18.751us       3.125us             6  
                                            aten::clone         2.35%      19.662us        67.40%     564.410us      94.068us       0.000us         0.00%       9.217us       1.536us             6  
                                            aten::copy_         4.08%      34.181us        61.36%     513.878us      85.646us       7.873us        29.57%       9.217us       1.536us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.873us        29.57%       7.873us       1.312us             6  
                                Activity Buffer Request        31.10%     260.474us        31.10%     260.474us     260.474us       1.344us         5.05%       1.344us       1.344us             1  
                                    aten::empty_strided         3.69%      30.870us         3.69%      30.870us       5.145us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        26.18%     219.223us        26.18%     219.223us      36.537us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         3.65%      30.540us         4.60%      38.490us       3.208us       0.000us         0.00%       0.000us       0.000us            12  
                                       aten::as_strided         0.95%       7.950us         0.95%       7.950us       0.663us       0.000us         0.00%       0.000us       0.000us            12  
                                       cudaLaunchKernel         5.03%      42.120us         5.03%      42.120us       7.020us       0.000us         0.00%       0.000us       0.000us             6  
                                  cudaDeviceSynchronize         0.60%       5.020us         0.60%       5.020us       5.020us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 837.463us
Self CUDA time total: 26.624us



======================================================================
PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H8_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     308.930us      1157.56%     308.930us     308.930us             1  
                                      hf_kernels_rotary        17.42%     145.421us        99.38%     829.543us     829.543us       0.000us         0.00%      28.000us      28.000us             1  
                          _rotary_dba7d1e::apply_rotary         5.67%      47.303us        10.60%      88.453us      14.742us      18.944us        70.98%      18.944us       3.157us             6  
void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      18.944us        70.98%      18.944us       3.157us             6  
                                            aten::clone         2.28%      19.028us        66.78%     557.449us      92.908us       0.000us         0.00%       9.056us       1.509us             6  
                                            aten::copy_         3.82%      31.872us        61.07%     509.750us      84.958us       7.744us        29.02%       9.056us       1.509us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.744us        29.02%       7.744us       1.291us             6  
                                Activity Buffer Request        31.42%     262.245us        31.42%     262.245us     262.245us       1.312us         4.92%       1.312us       1.312us             1  
                                    aten::empty_strided         3.43%      28.671us         3.43%      28.671us       4.778us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        25.83%     215.633us        25.83%     215.633us      35.939us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         3.59%      29.990us         4.58%      38.220us       3.185us       0.000us         0.00%       0.000us       0.000us            12  
                                       aten::as_strided         0.99%       8.230us         0.99%       8.230us       0.686us       0.000us         0.00%       0.000us       0.000us            12  
                                       cudaLaunchKernel         4.93%      41.150us         4.93%      41.150us       6.858us       0.000us         0.00%       0.000us       0.000us             6  
                                  cudaDeviceSynchronize         0.62%       5.211us         0.62%       5.211us       5.211us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 834.754us
Self CUDA time total: 26.688us



======================================================================
PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H32_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     326.626us      1069.89%     326.626us     326.626us             1  
                                      hf_kernels_rotary         6.56%     153.551us        99.79%       2.337ms       2.337ms       0.000us         0.00%      32.289us      32.289us             1  
                          _rotary_dba7d1e::apply_rotary         1.75%      41.061us         3.50%      81.981us      13.664us      20.161us        66.04%      20.161us       3.360us             6  
void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      20.161us        66.04%      20.161us       3.360us             6  
                                            aten::clone         0.92%      21.442us        88.00%       2.061ms     343.521us       0.000us         0.00%      12.128us       2.021us             6  
                                            aten::copy_         1.46%      34.178us        85.80%       2.010ms     334.939us      10.368us        33.96%      12.128us       2.021us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.368us        33.96%      10.368us       1.728us             6  
                                Activity Buffer Request        75.10%       1.759ms        75.10%       1.759ms       1.759ms       1.760us         5.77%       1.760us       1.760us             1  
                                    aten::empty_strided         1.28%      30.051us         1.28%      30.051us       5.009us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync         9.25%     216.555us         9.25%     216.555us      36.092us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         1.37%      32.001us         1.73%      40.601us       3.383us       0.000us         0.00%       0.000us       0.000us            12  
                                       aten::as_strided         0.37%       8.600us         0.37%       8.600us       0.717us       0.000us         0.00%       0.000us       0.000us            12  
                                       cudaLaunchKernel         1.75%      40.920us         1.75%      40.920us       6.820us       0.000us         0.00%       0.000us       0.000us             6  
                                  cudaDeviceSynchronize         0.21%       4.951us         0.21%       4.951us       4.951us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.342ms
Self CUDA time total: 30.529us



======================================================================
PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H32_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     323.551us       758.49%     323.551us     323.551us             1  
                                      hf_kernels_rotary         6.57%     149.229us        99.78%       2.268ms       2.268ms       0.000us         0.00%      45.505us      45.505us             1  
                          _rotary_dba7d1e::apply_rotary         1.78%      40.461us         3.63%      82.552us      13.759us      25.601us        60.02%      25.601us       4.267us             6  
void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      25.601us        60.02%      25.601us       4.267us             6  
                                            aten::clone         0.92%      20.920us        87.81%       1.996ms     332.671us       0.000us         0.00%      19.904us       3.317us             6  
                                            aten::copy_         1.45%      33.001us        85.55%       1.945ms     324.092us      17.056us        39.98%      19.904us       3.317us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.056us        39.98%      17.056us       2.843us             6  
                                Activity Buffer Request        74.47%       1.693ms        74.47%       1.693ms       1.693ms       2.848us         6.68%       2.848us       2.848us             1  
                                    aten::empty_strided         1.34%      30.551us         1.34%      30.551us       5.092us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync         9.62%     218.764us         9.62%     218.764us      36.461us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         1.40%      31.822us         1.77%      40.192us       3.349us       0.000us         0.00%       0.000us       0.000us            12  
                                       aten::as_strided         0.37%       8.370us         0.37%       8.370us       0.697us       0.000us         0.00%       0.000us       0.000us            12  
                                       cudaLaunchKernel         1.85%      42.091us         1.85%      42.091us       7.015us       0.000us         0.00%       0.000us       0.000us             6  
                                  cudaDeviceSynchronize         0.22%       5.030us         0.22%       5.030us       5.030us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.273ms
Self CUDA time total: 42.657us



======================================================================
PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H8_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     313.528us      1024.90%     313.528us     313.528us             1  
                                      hf_kernels_rotary        17.21%     144.859us        99.41%     836.674us     836.674us       0.000us         0.00%      32.319us      32.319us             1  
                          _rotary_dba7d1e::apply_rotary         4.77%      40.179us         9.63%      81.081us      13.513us      20.224us        66.11%      20.224us       3.371us             6  
void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      20.224us        66.11%      20.224us       3.371us             6  
                                            aten::clone         2.34%      19.722us        68.09%     573.072us      95.512us       0.000us         0.00%      12.095us       2.016us             6  
                                            aten::copy_         3.82%      32.139us        62.12%     522.779us      87.130us      10.367us        33.89%      12.095us       2.016us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.367us        33.89%      10.367us       1.728us             6  
                                Activity Buffer Request        32.59%     274.305us        32.59%     274.305us     274.305us       1.728us         5.65%       1.728us       1.728us             1  
                                    aten::empty_strided         3.63%      30.571us         3.63%      30.571us       5.095us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        25.70%     216.335us        25.70%     216.335us      36.056us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         3.51%      29.563us         4.47%      37.662us       3.139us       0.000us         0.00%       0.000us       0.000us            12  
                                       aten::as_strided         0.96%       8.099us         0.96%       8.099us       0.675us       0.000us         0.00%       0.000us       0.000us            12  
                                       cudaLaunchKernel         4.86%      40.902us         4.86%      40.902us       6.817us       0.000us         0.00%       0.000us       0.000us             6  
                                  cudaDeviceSynchronize         0.59%       4.950us         0.59%       4.950us       4.950us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 841.624us
Self CUDA time total: 30.591us



======================================================================
PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H8_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     315.863us       736.07%     315.863us     315.863us             1  
                                      hf_kernels_rotary        17.12%     141.480us        99.37%     821.243us     821.243us       0.000us         0.00%      45.760us      45.760us             1  
                          _rotary_dba7d1e::apply_rotary         4.88%      40.322us         9.94%      82.183us      13.697us      25.824us        60.18%      25.824us       4.304us             6  
void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      25.824us        60.18%      25.824us       4.304us             6  
                                            aten::clone         2.37%      19.588us        67.68%     559.279us      93.213us       0.000us         0.00%      19.936us       3.323us             6  
                                            aten::copy_         4.15%      34.293us        61.65%     509.500us      84.917us      17.088us        39.82%      19.936us       3.323us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.088us        39.82%      17.088us       2.848us             6  
                                Activity Buffer Request        31.27%     258.454us        31.27%     258.454us     258.454us       2.848us         6.64%       2.848us       2.848us             1  
                                    aten::empty_strided         3.65%      30.191us         3.65%      30.191us       5.032us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        26.23%     216.753us        26.23%     216.753us      36.126us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         3.65%      30.130us         4.63%      38.301us       3.192us       0.000us         0.00%       0.000us       0.000us            12  
                                       aten::as_strided         0.99%       8.171us         0.99%       8.171us       0.681us       0.000us         0.00%       0.000us       0.000us            12  
                                       cudaLaunchKernel         5.07%      41.861us         5.07%      41.861us       6.977us       0.000us         0.00%       0.000us       0.000us             6  
                                  cudaDeviceSynchronize         0.63%       5.170us         0.63%       5.170us       5.170us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 826.413us
Self CUDA time total: 42.912us



======================================================================
PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H32_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     311.420us       333.98%     311.420us     311.420us             1  
                                      hf_kernels_rotary        17.30%     140.163us        99.43%     805.353us     805.353us       0.000us         0.00%     108.190us     108.190us             1  
                                            aten::clone         2.32%      18.760us        67.64%     547.879us      91.313us       0.000us         0.00%      66.399us      11.066us             6  
                                            aten::copy_         4.25%      34.400us        61.74%     500.059us      83.343us      51.455us        55.18%      66.399us      11.066us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      51.455us        55.18%      51.455us       8.576us             6  
                          _rotary_dba7d1e::apply_rotary         4.76%      38.581us         9.84%      79.710us      13.285us      41.791us        44.82%      41.791us       6.965us             6  
void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      41.791us        44.82%      41.791us       6.965us             6  
                                Activity Buffer Request        31.09%     251.814us        31.09%     251.814us     251.814us      14.944us        16.03%      14.944us      14.944us             1  
                                    aten::empty_strided         3.59%      29.060us         3.59%      29.060us       4.843us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        26.40%     213.845us        26.40%     213.845us      35.641us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         3.70%      29.950us         4.64%      37.601us       3.133us       0.000us         0.00%       0.000us       0.000us            12  
                                       aten::as_strided         0.94%       7.651us         0.94%       7.651us       0.638us       0.000us         0.00%       0.000us       0.000us            12  
                                       cudaLaunchKernel         5.08%      41.129us         5.08%      41.129us       6.855us       0.000us         0.00%       0.000us       0.000us             6  
                                  cudaDeviceSynchronize         0.57%       4.640us         0.57%       4.640us       4.640us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 809.993us
Self CUDA time total: 93.246us



======================================================================
PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H32_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     314.038us       216.45%     314.038us     314.038us             1  
                                      hf_kernels_rotary        17.88%     139.471us        99.38%     775.432us     775.432us       0.000us         0.00%     168.795us     168.795us             1  
                                            aten::clone         2.37%      18.470us        66.41%     518.157us      86.360us       0.000us         0.00%     104.989us      17.498us             6  
                                            aten::copy_         4.24%      33.100us        60.11%     469.037us      78.173us      81.277us        56.02%     104.989us      17.498us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      81.277us        56.02%      81.277us      13.546us             6  
                          _rotary_dba7d1e::apply_rotary         5.14%      40.111us        10.39%      81.071us      13.512us      63.806us        43.98%      63.806us      10.634us             6  
void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      63.806us        43.98%      63.806us      10.634us             6  
                                Activity Buffer Request        28.68%     223.784us        28.68%     223.784us     223.784us      23.712us        16.34%      23.712us      23.712us             1  
                                    aten::empty_strided         3.93%      30.650us         3.93%      30.650us       5.108us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        27.19%     212.153us        27.19%     212.153us      35.359us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         3.73%      29.133us         4.71%      36.733us       3.061us       0.000us         0.00%       0.000us       0.000us            12  
                                       aten::as_strided         0.97%       7.600us         0.97%       7.600us       0.633us       0.000us         0.00%       0.000us       0.000us            12  
                                       cudaLaunchKernel         5.25%      40.960us         5.25%      40.960us       6.827us       0.000us         0.00%       0.000us       0.000us             6  
                                  cudaDeviceSynchronize         0.62%       4.811us         0.62%       4.811us       4.811us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 780.243us
Self CUDA time total: 145.083us



======================================================================
PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H8_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     319.458us       399.31%     319.458us     319.458us             1  
                                      hf_kernels_rotary        17.47%     143.453us        99.38%     816.213us     816.213us       0.000us         0.00%      90.147us      90.147us             1  
                                            aten::clone         2.34%      19.239us        67.36%     553.208us      92.201us       0.000us         0.00%      47.811us       7.969us             6  
                                            aten::copy_         3.98%      32.663us        61.19%     502.549us      83.758us      37.666us        47.08%      47.811us       7.969us             6  
                          _rotary_dba7d1e::apply_rotary         4.95%      40.641us         9.85%      80.901us      13.484us      42.336us        52.92%      42.336us       7.056us             6  
void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      42.336us        52.92%      42.336us       7.056us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      37.666us        47.08%      37.666us       6.278us             6  
                                Activity Buffer Request        31.07%     255.204us        31.07%     255.204us     255.204us      10.145us        12.68%      10.145us      10.145us             1  
                                    aten::empty_strided         3.83%      31.420us         3.83%      31.420us       5.237us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        26.14%     214.682us        26.14%     214.682us      35.780us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         3.75%      30.830us         4.71%      38.651us       3.221us       0.000us         0.00%       0.000us       0.000us            12  
                                       aten::as_strided         0.95%       7.821us         0.95%       7.821us       0.652us       0.000us         0.00%       0.000us       0.000us            12  
                                       cudaLaunchKernel         4.90%      40.260us         4.90%      40.260us       6.710us       0.000us         0.00%       0.000us       0.000us             6  
                                  cudaDeviceSynchronize         0.62%       5.090us         0.62%       5.090us       5.090us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 821.303us
Self CUDA time total: 80.002us



======================================================================
PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H8_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     325.402us       221.65%     325.402us     325.402us             1  
                                      hf_kernels_rotary        17.64%     145.390us        99.37%     819.093us     819.093us       0.000us         0.00%     170.492us     170.492us             1  
                                            aten::clone         2.33%      19.171us        66.92%     551.610us      91.935us       0.000us         0.00%     105.757us      17.626us             6  
                                            aten::copy_         4.29%      35.392us        61.01%     502.899us      83.816us      82.077us        55.91%     105.757us      17.626us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      82.077us        55.91%      82.077us      13.679us             6  
                          _rotary_dba7d1e::apply_rotary         4.98%      41.061us        10.11%      83.363us      13.894us      64.735us        44.09%      64.735us      10.789us             6  
void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      64.735us        44.09%      64.735us      10.789us             6  
                                Activity Buffer Request        31.05%     255.945us        31.05%     255.945us     255.945us      23.680us        16.13%      23.680us      23.680us             1  
                                    aten::empty_strided         3.58%      29.540us         3.58%      29.540us       4.923us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        25.67%     211.562us        25.67%     211.562us      35.260us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         3.74%      30.830us         4.70%      38.730us       3.228us       0.000us         0.00%       0.000us       0.000us            12  
                                       aten::as_strided         0.96%       7.900us         0.96%       7.900us       0.658us       0.000us         0.00%       0.000us       0.000us            12  
                                       cudaLaunchKernel         5.13%      42.302us         5.13%      42.302us       7.050us       0.000us         0.00%       0.000us       0.000us             6  
                                  cudaDeviceSynchronize         0.63%       5.170us         0.63%       5.170us       5.170us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 824.263us
Self CUDA time total: 146.812us



======================================================================
PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H32_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                      hf_kernels_rotary        12.27%     141.491us        70.82%     816.704us     816.704us       0.000us         0.00%     741.813us     741.813us             1  
                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     683.383us       101.18%     683.383us     683.383us             1  
                                            aten::clone         1.70%      19.590us        48.29%     556.839us      92.806us       0.000us         0.00%     558.329us      93.055us             6  
                                            aten::copy_         3.05%      35.121us        43.99%     507.258us      84.543us     491.962us        72.84%     558.329us      93.055us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     491.962us        72.84%     491.962us      81.994us             6  
                          _rotary_dba7d1e::apply_rotary         3.41%      39.351us         6.95%      80.182us      13.364us     183.484us        27.16%     183.484us      30.581us             6  
void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us     183.484us        27.16%     183.484us      30.581us             6  
                                Activity Buffer Request        22.79%     262.814us        22.79%     262.814us     262.814us      66.367us         9.83%      66.367us      66.367us             1  
                                    aten::empty_strided         2.60%      29.991us         2.60%      29.991us       4.998us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        18.15%     209.323us        18.15%     209.323us      34.887us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         2.59%      29.922us         3.31%      38.192us       3.183us       0.000us         0.00%       0.000us       0.000us            12  
                                       aten::as_strided         0.72%       8.270us         0.72%       8.270us       0.689us       0.000us         0.00%       0.000us       0.000us            12  
                                       cudaLaunchKernel         3.54%      40.831us         3.54%      40.831us       6.805us       0.000us         0.00%       0.000us       0.000us             6  
                                  cudaDeviceSynchronize        29.18%     336.495us        29.18%     336.495us     336.495us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 1.153ms
Self CUDA time total: 675.446us



======================================================================
PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H32_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                      hf_kernels_rotary         3.40%     150.944us        53.25%       2.364ms       2.364ms       0.000us         0.00%       2.619ms       2.619ms             1  
                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us       2.449ms       100.31%       2.449ms       2.449ms             1  
                                            aten::clone         0.54%      23.821us        47.05%       2.089ms     348.124us       0.000us         0.00%       1.393ms     232.202us             6  
                                            aten::copy_         0.77%      34.330us        45.79%       2.033ms     338.776us       1.215ms        49.78%       1.393ms     232.202us             6  
                          _rotary_dba7d1e::apply_rotary         0.95%      42.082us         1.92%      85.331us      14.222us       1.226ms        50.22%       1.226ms     204.346us             6  
void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us       1.226ms        50.22%       1.226ms     204.346us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       1.215ms        49.78%       1.215ms     202.575us             6  
                                Activity Buffer Request        40.08%       1.779ms        40.08%       1.779ms       1.779ms     177.759us         7.28%     177.759us     177.759us             1  
                                    aten::empty_strided         0.73%      32.270us         0.73%      32.270us       5.378us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync         4.93%     218.903us         4.93%     218.903us      36.484us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         0.70%      31.129us         0.88%      39.000us       3.250us       0.000us         0.00%       0.000us       0.000us            12  
                                       aten::as_strided         0.18%       7.871us         0.18%       7.871us       0.656us       0.000us         0.00%       0.000us       0.000us            12  
                                       cudaLaunchKernel         0.97%      43.249us         0.97%      43.249us       7.208us       0.000us         0.00%       0.000us       0.000us             6  
                                  cudaDeviceSynchronize        46.75%       2.075ms        46.75%       2.075ms       2.075ms       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 4.439ms
Self CUDA time total: 2.442ms


impl                     wl                  p50(ms)  ok
hf_kernels_rotary        cuda_B1_S128_H32_D128_R64     0.09  True
hf_kernels_rotary        cuda_B1_S128_H32_D64_R32     0.09  True
hf_kernels_rotary        cuda_B1_S128_H8_D128_R64     0.09  True
hf_kernels_rotary        cuda_B1_S128_H8_D64_R32     0.08  True
hf_kernels_rotary        cuda_B1_S2048_H32_D128_R64     0.26  True
hf_kernels_rotary        cuda_B1_S2048_H32_D64_R32     0.09  True
hf_kernels_rotary        cuda_B1_S2048_H8_D128_R64     0.09  True
hf_kernels_rotary        cuda_B1_S2048_H8_D64_R32     0.09  True
hf_kernels_rotary        cuda_B1_S512_H32_D128_R64     0.09  True
hf_kernels_rotary        cuda_B1_S512_H32_D64_R32     0.09  True
hf_kernels_rotary        cuda_B1_S512_H8_D128_R64     0.09  True
hf_kernels_rotary        cuda_B1_S512_H8_D64_R32     0.09  True
hf_kernels_rotary        cuda_B2_S128_H32_D128_R64     0.09  True
hf_kernels_rotary        cuda_B2_S128_H32_D64_R32     0.09  True
hf_kernels_rotary        cuda_B2_S128_H8_D128_R64     0.09  True
hf_kernels_rotary        cuda_B2_S128_H8_D64_R32     0.09  True
hf_kernels_rotary        cuda_B2_S2048_H32_D128_R64     0.85  True
hf_kernels_rotary        cuda_B2_S2048_H32_D64_R32     0.26  True
hf_kernels_rotary        cuda_B2_S2048_H8_D128_R64     0.09  True
hf_kernels_rotary        cuda_B2_S2048_H8_D64_R32     0.09  True
hf_kernels_rotary        cuda_B2_S512_H32_D128_R64     0.09  True
hf_kernels_rotary        cuda_B2_S512_H32_D64_R32     0.09  True
hf_kernels_rotary        cuda_B2_S512_H8_D128_R64     0.09  True
hf_kernels_rotary        cuda_B2_S512_H8_D64_R32     0.09  True
▶ UV Install Logs
Fetching 5 files: 0%| | 0/5 [00:00<?, ?it/s] Fetching 5 files: 40%|████ | 2/5 [00:00<00:00, 19.18it/s]Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads. Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 9.18it/s] Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 9.78it/s]

Artifacts:

rotary.jsonl