diff --git "a/rotary/impls/hf_kernels_rotary.html" "b/rotary/impls/hf_kernels_rotary.html" --- "a/rotary/impls/hf_kernels_rotary.html" +++ "b/rotary/impls/hf_kernels_rotary.html" @@ -3905,7 +3905,7 @@ Cell: nv | 0.25s
-
Fri Dec 19 19:54:55 2025       
+
Fri Dec 19 23:00:45 2025       
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 580.105.08             Driver Version: 580.105.08     CUDA Version: 13.0     |
 +-----------------------------------------+------------------------+----------------------+
@@ -3914,7 +3914,7 @@ Cell: nv | 0.25s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   36C    P0             90W /  350W |       0MiB /  46068MiB |     17%      Default |
+| N/A   40C    P0             85W /  350W |       0MiB /  46068MiB |     24%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -3938,7 +3938,7 @@ Cell: nv | 0.25s
 ▼ output
  ▶ uv-logs
  | 
-Cell: benchmark | 6.21s
+Cell: benchmark | 4.78s
  | 
 
 Raw
@@ -4009,23 +4009,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     397.918us      1712.80%     397.918us     397.918us             1  
-                                      hf_kernels_rotary        10.09%     234.566us        99.43%       2.310ms       2.310ms       0.000us         0.00%      24.512us      24.512us             1  
-                          _rotary_dba7d1e::apply_rotary         2.29%      53.121us         4.37%     101.432us      16.905us      16.192us        69.70%      16.192us       2.699us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      16.192us        69.70%      16.192us       2.699us             6  
-                                            aten::clone         1.67%      38.889us        82.80%       1.924ms     320.673us       0.000us         0.00%       8.320us       1.387us             6  
-                                            aten::copy_         1.71%      39.649us        78.85%       1.832ms     305.366us       7.040us        30.30%       8.320us       1.387us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.040us        30.30%       7.040us       1.173us             6  
-                                Activity Buffer Request        74.05%       1.721ms        74.05%       1.721ms       1.721ms       1.280us         5.51%       1.280us       1.280us             1  
-                                    aten::empty_strided         2.28%      52.952us         2.28%      52.952us       8.825us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         3.09%      71.834us         3.09%      71.834us      11.972us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.69%      39.231us         2.17%      50.432us       4.203us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.48%      11.201us         0.48%      11.201us       0.933us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         2.08%      48.311us         2.08%      48.311us       8.052us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.57%      13.240us         0.57%      13.240us      13.240us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     384.577us      1653.10%     384.577us     384.577us             1  
+                                      hf_kernels_rotary         9.70%     223.937us        99.35%       2.294ms       2.294ms       0.000us         0.00%      24.544us      24.544us             1  
+                          _rotary_dba7d1e::apply_rotary         2.33%      53.870us         4.38%     101.111us      16.852us      16.128us        69.33%      16.128us       2.688us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      16.128us        69.33%      16.128us       2.688us             6  
+                                            aten::clone         1.57%      36.153us        83.10%       1.919ms     319.785us       0.000us         0.00%       8.416us       1.403us             6  
+                                            aten::copy_         1.64%      37.980us        79.42%       1.834ms     305.621us       7.136us        30.67%       8.416us       1.403us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.136us        30.67%       7.136us       1.189us             6  
+                                Activity Buffer Request        74.69%       1.725ms        74.69%       1.725ms       1.725ms       1.280us         5.50%       1.280us       1.280us             1  
+                                    aten::empty_strided         2.11%      48.830us         2.11%      48.830us       8.138us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         3.08%      71.171us         3.08%      71.171us      11.862us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.71%      39.468us         2.17%      50.120us       4.177us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.46%      10.652us         0.46%      10.652us       0.888us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         2.05%      47.241us         2.05%      47.241us       7.873us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.65%      15.020us         0.65%      15.020us      15.020us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.324ms
-Self CUDA time total: 23.232us
+Self CPU time total: 2.309ms
+Self CUDA time total: 23.264us
 
 
 
@@ -4035,23 +4035,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     351.006us      1460.58%     351.006us     351.006us             1  
-                                      hf_kernels_rotary         8.50%     188.914us        99.67%       2.215ms       2.215ms       0.000us         0.00%      25.344us      25.344us             1  
-                          _rotary_dba7d1e::apply_rotary         1.92%      42.653us         3.86%      85.804us      14.301us      16.160us        67.24%      16.160us       2.693us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      16.160us        67.24%      16.160us       2.693us             6  
-                                            aten::clone         1.34%      29.789us        85.47%       1.899ms     316.566us       0.000us         0.00%       9.184us       1.531us             6  
-                                            aten::copy_         1.59%      35.393us        82.63%       1.836ms     306.029us       7.872us        32.76%       9.184us       1.531us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.872us        32.76%       7.872us       1.312us             6  
-                                Activity Buffer Request        78.48%       1.744ms        78.48%       1.744ms       1.744ms       1.312us         5.46%       1.312us       1.312us             1  
-                                    aten::empty_strided         1.50%      33.432us         1.50%      33.432us       5.572us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         2.55%      56.710us         2.55%      56.710us       9.452us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.45%      32.280us         1.84%      40.820us       3.402us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.38%       8.540us         0.38%       8.540us       0.712us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         1.94%      43.151us         1.94%      43.151us       7.192us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.33%       7.360us         0.33%       7.360us       7.360us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     323.742us      1343.66%     323.742us     323.742us             1  
+                                      hf_kernels_rotary         8.02%     171.501us        99.72%       2.131ms       2.131ms       0.000us         0.00%      25.406us      25.406us             1  
+                          _rotary_dba7d1e::apply_rotary         1.84%      39.272us         3.69%      78.893us      13.149us      16.224us        67.34%      16.224us       2.704us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      16.224us        67.34%      16.224us       2.704us             6  
+                                            aten::clone         1.20%      25.714us        86.18%       1.842ms     306.982us       0.000us         0.00%       9.182us       1.530us             6  
+                                            aten::copy_         1.70%      36.249us        83.58%       1.786ms     297.695us       7.870us        32.66%       9.182us       1.530us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.870us        32.66%       7.870us       1.312us             6  
+                                Activity Buffer Request        79.35%       1.696ms        79.35%       1.696ms       1.696ms       1.312us         5.45%       1.312us       1.312us             1  
+                                    aten::empty_strided         1.40%      30.010us         1.40%      30.010us       5.002us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         2.53%      54.011us         2.53%      54.011us       9.002us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.43%      30.532us         1.82%      38.871us       3.239us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.39%       8.339us         0.39%       8.339us       0.695us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         1.85%      39.621us         1.85%      39.621us       6.603us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.28%       6.040us         0.28%       6.040us       6.040us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.222ms
-Self CUDA time total: 24.032us
+Self CPU time total: 2.137ms
+Self CUDA time total: 24.094us
 
 
 
@@ -4061,23 +4061,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     328.861us      1359.32%     328.861us     328.861us             1  
-                                      hf_kernels_rotary         7.82%     169.265us        99.74%       2.160ms       2.160ms       0.000us         0.00%      25.505us      25.505us             1  
-                          _rotary_dba7d1e::apply_rotary         1.90%      41.240us         3.83%      83.032us      13.839us      16.449us        67.99%      16.449us       2.742us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      16.449us        67.99%      16.449us       2.742us             6  
-                                            aten::clone         1.22%      26.522us        86.28%       1.868ms     311.384us       0.000us         0.00%       9.056us       1.509us             6  
-                                            aten::copy_         1.60%      34.652us        83.63%       1.811ms     301.836us       7.744us        32.01%       9.056us       1.509us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.744us        32.01%       7.744us       1.291us             6  
-                                Activity Buffer Request        79.55%       1.723ms        79.55%       1.723ms       1.723ms       1.312us         5.42%       1.312us       1.312us             1  
-                                    aten::empty_strided         1.42%      30.770us         1.42%      30.770us       5.128us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         2.49%      53.840us         2.49%      53.840us       8.973us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.40%      30.337us         1.81%      39.289us       3.274us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.41%       8.952us         0.41%       8.952us       0.746us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         1.93%      41.792us         1.93%      41.792us       6.965us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.26%       5.540us         0.26%       5.540us       5.540us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     324.959us      1341.36%     324.959us     324.959us             1  
+                                      hf_kernels_rotary         7.79%     167.945us        99.74%       2.150ms       2.150ms       0.000us         0.00%      25.506us      25.506us             1  
+                          _rotary_dba7d1e::apply_rotary         1.90%      41.040us         3.78%      81.541us      13.590us      16.482us        68.03%      16.482us       2.747us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      16.482us        68.03%      16.482us       2.747us             6  
+                                            aten::clone         1.17%      25.161us        86.39%       1.862ms     310.280us       0.000us         0.00%       9.024us       1.504us             6  
+                                            aten::copy_         1.63%      35.122us        83.86%       1.807ms     301.213us       7.744us        31.97%       9.024us       1.504us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.744us        31.97%       7.744us       1.291us             6  
+                                Activity Buffer Request        79.59%       1.715ms        79.59%       1.715ms       1.715ms       1.280us         5.28%       1.280us       1.280us             1  
+                                    aten::empty_strided         1.36%      29.241us         1.36%      29.241us       4.874us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         2.64%      56.970us         2.64%      56.970us       9.495us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.41%      30.477us         1.78%      38.419us       3.202us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.37%       7.942us         0.37%       7.942us       0.662us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         1.88%      40.501us         1.88%      40.501us       6.750us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.26%       5.501us         0.26%       5.501us       5.501us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.165ms
-Self CUDA time total: 24.193us
+Self CPU time total: 2.155ms
+Self CUDA time total: 24.226us
 
 
 
@@ -4087,23 +4087,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     332.000us      1184.36%     332.000us     332.000us             1  
-                                      hf_kernels_rotary         7.19%     171.403us        99.79%       2.378ms       2.378ms       0.000us         0.00%      29.792us      29.792us             1  
-                          _rotary_dba7d1e::apply_rotary         1.72%      40.922us         3.47%      82.793us      13.799us      17.632us        62.90%      17.632us       2.939us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      17.632us        62.90%      17.632us       2.939us             6  
-                                            aten::clone         1.16%      27.640us        87.43%       2.084ms     347.303us       0.000us         0.00%      12.160us       2.027us             6  
-                                            aten::copy_         1.42%      33.951us        84.95%       2.025ms     337.488us      10.400us        37.10%      12.160us       2.027us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.400us        37.10%      10.400us       1.733us             6  
-                                Activity Buffer Request        73.90%       1.761ms        73.90%       1.761ms       1.761ms       1.760us         6.28%       1.760us       1.760us             1  
-                                    aten::empty_strided         1.31%      31.250us         1.31%      31.250us       5.208us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         9.63%     229.586us         9.63%     229.586us      38.264us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.31%      31.193us         1.70%      40.482us       3.373us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.39%       9.289us         0.39%       9.289us       0.774us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         1.76%      41.871us         1.76%      41.871us       6.979us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.21%       5.050us         0.21%       5.050us       5.050us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     364.384us      1292.46%     364.384us     364.384us             1  
+                                      hf_kernels_rotary         8.38%     203.415us        99.78%       2.422ms       2.422ms       0.000us         0.00%      29.986us      29.986us             1  
+                          _rotary_dba7d1e::apply_rotary         1.78%      43.242us         3.59%      87.183us      14.530us      17.664us        62.65%      17.664us       2.944us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      17.664us        62.65%      17.664us       2.944us             6  
+                                            aten::clone         1.06%      25.710us        86.14%       2.091ms     348.456us       0.000us         0.00%      12.322us       2.054us             6  
+                                            aten::copy_         1.40%      33.961us        83.85%       2.035ms     339.187us      10.529us        37.35%      12.322us       2.054us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.529us        37.35%      10.529us       1.755us             6  
+                                Activity Buffer Request        73.02%       1.772ms        73.02%       1.772ms       1.772ms       1.793us         6.36%       1.793us       1.793us             1  
+                                    aten::empty_strided         1.23%      29.901us         1.23%      29.901us       4.983us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         9.43%     228.855us         9.43%     228.855us      38.143us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.32%      32.031us         1.67%      40.620us       3.385us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.35%       8.589us         0.35%       8.589us       0.716us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         1.81%      43.941us         1.81%      43.941us       7.323us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.22%       5.260us         0.22%       5.260us       5.260us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.384ms
-Self CUDA time total: 28.032us
+Self CPU time total: 2.427ms
+Self CUDA time total: 28.193us
 
 
 
@@ -4113,23 +4113,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     335.418us      1384.59%     335.418us     335.418us             1  
-                                      hf_kernels_rotary        20.26%     170.294us        99.34%     834.940us     834.940us       0.000us         0.00%      25.537us      25.537us             1  
-                          _rotary_dba7d1e::apply_rotary         4.83%      40.562us         9.92%      83.412us      13.902us      16.513us        68.17%      16.513us       2.752us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      16.513us        68.17%      16.513us       2.752us             6  
-                                            aten::clone         2.64%      22.222us        64.45%     541.674us      90.279us       0.000us         0.00%       9.024us       1.504us             6  
-                                            aten::copy_         4.18%      35.111us        57.94%     486.972us      81.162us       7.712us        31.83%       9.024us       1.504us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.712us        31.83%       7.712us       1.285us             6  
-                                Activity Buffer Request        27.90%     234.506us        27.90%     234.506us     234.506us       1.312us         5.42%       1.312us       1.312us             1  
-                                    aten::empty_strided         3.86%      32.480us         3.86%      32.480us       5.413us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        25.86%     217.355us        25.86%     217.355us      36.226us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         3.66%      30.769us         4.71%      39.560us       3.297us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         1.05%       8.791us         1.05%       8.791us       0.733us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         5.10%      42.850us         5.10%      42.850us       7.142us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.66%       5.560us         0.66%       5.560us       5.560us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     323.517us      1337.29%     323.517us     323.517us             1  
+                                      hf_kernels_rotary        20.05%     169.673us        99.33%     840.688us     840.688us       0.000us         0.00%      25.472us      25.472us             1  
+                          _rotary_dba7d1e::apply_rotary         4.98%      42.181us         9.92%      83.942us      13.990us      16.480us        68.12%      16.480us       2.747us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      16.480us        68.12%      16.480us       2.747us             6  
+                                            aten::clone         2.34%      19.800us        64.76%     548.062us      91.344us       0.000us         0.00%       8.992us       1.499us             6  
+                                            aten::copy_         3.95%      33.463us        59.03%     499.612us      83.269us       7.712us        31.88%       8.992us       1.499us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.712us        31.88%       7.712us       1.285us             6  
+                                Activity Buffer Request        28.63%     242.305us        28.63%     242.305us     242.305us       1.280us         5.29%       1.280us       1.280us             1  
+                                    aten::empty_strided         3.39%      28.650us         3.39%      28.650us       4.775us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        26.45%     223.844us        26.45%     223.844us      37.307us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.63%      30.762us         4.61%      39.011us       3.251us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.97%       8.249us         0.97%       8.249us       0.687us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         4.93%      41.761us         4.93%      41.761us       6.960us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.67%       5.650us         0.67%       5.650us       5.650us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 840.500us
-Self CUDA time total: 24.225us
+Self CPU time total: 846.338us
+Self CUDA time total: 24.192us
 
 
 
@@ -4139,23 +4139,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     335.419us      1187.11%     335.419us     335.419us             1  
-                                      hf_kernels_rotary        22.11%     159.484us        99.14%     715.038us     715.038us       0.000us         0.00%      30.047us      30.047us             1  
-                          _rotary_dba7d1e::apply_rotary         6.01%      43.310us        12.22%      88.120us      14.687us      17.727us        62.74%      17.727us       2.954us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      17.727us        62.74%      17.727us       2.954us             6  
-                                            aten::clone         2.86%      20.618us        59.23%     427.160us      71.193us       0.000us         0.00%      12.320us       2.053us             6  
-                                            aten::copy_         4.70%      33.871us        52.15%     376.129us      62.688us      10.528us        37.26%      12.320us       2.053us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.528us        37.26%      10.528us       1.755us             6  
-                                Activity Buffer Request        17.96%     129.563us        17.96%     129.563us     129.563us       1.792us         6.34%       1.792us       1.792us             1  
-                                    aten::empty_strided         4.22%      30.413us         4.22%      30.413us       5.069us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        29.49%     212.695us        29.49%     212.695us      35.449us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.36%      31.443us         5.58%      40.274us       3.356us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         1.22%       8.831us         1.22%       8.831us       0.736us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         6.21%      44.810us         6.21%      44.810us       7.468us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.86%       6.181us         0.86%       6.181us       6.181us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     311.513us      1102.47%     311.513us     311.513us             1  
+                                      hf_kernels_rotary        20.61%     164.155us        99.30%     791.098us     791.098us       0.000us         0.00%      30.048us      30.048us             1  
+                          _rotary_dba7d1e::apply_rotary         4.94%      39.349us         9.84%      78.361us      13.060us      17.696us        62.63%      17.696us       2.949us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      17.696us        62.63%      17.696us       2.949us             6  
+                                            aten::clone         2.47%      19.670us        64.03%     510.081us      85.014us       0.000us         0.00%      12.352us       2.059us             6  
+                                            aten::copy_         4.07%      32.461us        57.94%     461.581us      76.930us      10.560us        37.37%      12.352us       2.059us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.560us        37.37%      10.560us       1.760us             6  
+                                Activity Buffer Request        27.37%     218.044us        27.37%     218.044us     218.044us       1.792us         6.34%       1.792us       1.792us             1  
+                                    aten::empty_strided         3.62%      28.830us         3.62%      28.830us       4.805us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        26.50%     211.076us        26.50%     211.076us      35.179us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.85%      30.651us         4.83%      38.501us       3.208us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.99%       7.850us         0.99%       7.850us       0.654us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         4.90%      39.012us         4.90%      39.012us       6.502us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.70%       5.550us         0.70%       5.550us       5.550us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 721.219us
-Self CUDA time total: 28.255us
+Self CPU time total: 796.648us
+Self CUDA time total: 28.256us
 
 
 
@@ -4165,23 +4165,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     339.104us       833.12%     339.104us     339.104us             1  
-                                      hf_kernels_rotary         7.67%     177.694us        99.76%       2.310ms       2.310ms       0.000us         0.00%      43.583us      43.583us             1  
-                          _rotary_dba7d1e::apply_rotary         1.80%      41.651us         3.62%      83.803us      13.967us      23.520us        57.78%      23.520us       3.920us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      23.520us        57.78%      23.520us       3.920us             6  
-                                            aten::clone         1.20%      27.761us        86.70%       2.008ms     334.588us       0.000us         0.00%      20.063us       3.344us             6  
-                                            aten::copy_         1.49%      34.550us        84.17%       1.949ms     324.808us      17.183us        42.22%      20.063us       3.344us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.183us        42.22%      17.183us       2.864us             6  
-                                Activity Buffer Request        73.48%       1.701ms        73.48%       1.701ms       1.701ms       2.880us         7.08%       2.880us       2.880us             1  
-                                    aten::empty_strided         1.34%      30.920us         1.34%      30.920us       5.153us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         9.20%     212.976us         9.20%     212.976us      35.496us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.39%      32.120us         1.76%      40.721us       3.393us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.37%       8.601us         0.37%       8.601us       0.717us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         1.82%      42.152us         1.82%      42.152us       7.025us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.24%       5.660us         0.24%       5.660us       5.660us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     323.712us       798.42%     323.712us     323.712us             1  
+                                      hf_kernels_rotary         7.57%     170.665us        99.78%       2.250ms       2.250ms       0.000us         0.00%      43.392us      43.392us             1  
+                          _rotary_dba7d1e::apply_rotary         1.73%      38.911us         3.48%      78.541us      13.090us      23.456us        57.85%      23.456us       3.909us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      23.456us        57.85%      23.456us       3.909us             6  
+                                            aten::clone         1.16%      26.090us        87.02%       1.962ms     327.007us       0.000us         0.00%      19.936us       3.323us             6  
+                                            aten::copy_         1.56%      35.131us        84.49%       1.905ms     317.482us      17.088us        42.15%      19.936us       3.323us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.088us        42.15%      17.088us       2.848us             6  
+                                Activity Buffer Request        74.36%       1.676ms        74.36%       1.676ms       1.676ms       2.848us         7.02%       2.848us       2.848us             1  
+                                    aten::empty_strided         1.38%      31.061us         1.38%      31.061us       5.177us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         8.57%     193.284us         8.57%     193.284us      32.214us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.34%      30.268us         1.71%      38.460us       3.205us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.36%       8.192us         0.36%       8.192us       0.683us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         1.76%      39.630us         1.76%      39.630us       6.605us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.22%       4.960us         0.22%       4.960us       4.960us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.315ms
-Self CUDA time total: 40.703us
+Self CPU time total: 2.255ms
+Self CUDA time total: 40.544us
 
 
 
@@ -4191,23 +4191,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     342.239us       459.79%     342.239us     342.239us             1  
-                                      hf_kernels_rotary         7.38%     176.732us        99.78%       2.390ms       2.390ms       0.000us         0.00%      82.913us      82.913us             1  
-                                            aten::clone         1.17%      28.130us        87.22%       2.089ms     348.177us       0.000us         0.00%      43.777us       7.296us             6  
-                                            aten::copy_         1.46%      34.971us        84.72%       2.029ms     338.203us      35.297us        47.42%      43.777us       7.296us             6  
-                          _rotary_dba7d1e::apply_rotary         1.71%      40.931us         3.50%      83.864us      13.977us      39.136us        52.58%      39.136us       6.523us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      39.136us        52.58%      39.136us       6.523us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      35.297us        47.42%      35.297us       5.883us             6  
-                                Activity Buffer Request        74.20%       1.777ms        74.20%       1.777ms       1.777ms       8.480us        11.39%       8.480us       8.480us             1  
-                                    aten::empty_strided         1.32%      31.711us         1.32%      31.711us       5.285us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         9.06%     217.015us         9.06%     217.015us      36.169us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.30%      31.251us         1.69%      40.431us       3.369us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.38%       9.180us         0.38%       9.180us       0.765us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         1.79%      42.933us         1.79%      42.933us       7.155us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.22%       5.191us         0.22%       5.191us       5.191us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     325.696us       437.95%     325.696us     325.696us             1  
+                                      hf_kernels_rotary         7.25%     170.323us        99.77%       2.344ms       2.344ms       0.000us         0.00%      82.880us      82.880us             1  
+                                            aten::clone         1.13%      26.590us        87.43%       2.054ms     342.324us       0.000us         0.00%      44.000us       7.333us             6  
+                                            aten::copy_         1.50%      35.240us        85.03%       1.997ms     332.900us      35.488us        47.72%      44.000us       7.333us             6  
+                          _rotary_dba7d1e::apply_rotary         1.74%      40.881us         3.46%      81.232us      13.539us      38.880us        52.28%      38.880us       6.480us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      38.880us        52.28%      38.880us       6.480us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      35.488us        47.72%      35.488us       5.915us             6  
+                                Activity Buffer Request        75.65%       1.777ms        75.65%       1.777ms       1.777ms       8.512us        11.45%       8.512us       8.512us             1  
+                                    aten::empty_strided         1.27%      29.951us         1.27%      29.951us       4.992us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         7.87%     184.984us         7.87%     184.984us      30.831us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.28%      30.151us         1.62%      38.112us       3.176us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.34%       7.961us         0.34%       7.961us       0.663us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         1.72%      40.351us         1.72%      40.351us       6.725us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.23%       5.500us         0.23%       5.500us       5.500us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.395ms
-Self CUDA time total: 74.433us
+Self CPU time total: 2.349ms
+Self CUDA time total: 74.368us
 
 
 
@@ -4217,23 +4217,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     332.929us       819.86%     332.929us     332.929us             1  
-                                      hf_kernels_rotary         7.07%     166.848us        99.78%       2.353ms       2.353ms       0.000us         0.00%      43.488us      43.488us             1  
-                          _rotary_dba7d1e::apply_rotary         1.68%      39.560us         3.51%      82.681us      13.780us      23.488us        57.84%      23.488us       3.915us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      23.488us        57.84%      23.488us       3.915us             6  
-                                            aten::clone         1.16%      27.340us        87.49%       2.064ms     343.923us       0.000us         0.00%      20.000us       3.333us             6  
-                                            aten::copy_         1.51%      35.519us        85.00%       2.005ms     334.108us      17.120us        42.16%      20.000us       3.333us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.120us        42.16%      17.120us       2.853us             6  
-                                Activity Buffer Request        74.50%       1.757ms        74.50%       1.757ms       1.757ms       2.880us         7.09%       2.880us       2.880us             1  
-                                    aten::empty_strided         1.34%      31.550us         1.34%      31.550us       5.258us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         8.99%     212.096us         8.99%     212.096us      35.349us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.35%      31.900us         1.71%      40.350us       3.362us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.36%       8.450us         0.36%       8.450us       0.704us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         1.83%      43.121us         1.83%      43.121us       7.187us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.22%       5.120us         0.22%       5.120us       5.120us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     322.973us       797.86%     322.973us     322.973us             1  
+                                      hf_kernels_rotary         8.42%     188.754us        99.76%       2.236ms       2.236ms       0.000us         0.00%      43.328us      43.328us             1  
+                          _rotary_dba7d1e::apply_rotary         1.78%      39.922us         3.52%      78.932us      13.155us      23.456us        57.94%      23.456us       3.909us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      23.456us        57.94%      23.456us       3.909us             6  
+                                            aten::clone         1.19%      26.589us        86.07%       1.929ms     321.553us       0.000us         0.00%      19.872us       3.312us             6  
+                                            aten::copy_         1.47%      33.001us        83.54%       1.873ms     312.092us      17.024us        42.06%      19.872us       3.312us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.024us        42.06%      17.024us       2.837us             6  
+                                Activity Buffer Request        73.86%       1.656ms        73.86%       1.656ms       1.656ms       2.848us         7.04%       2.848us       2.848us             1  
+                                    aten::empty_strided         1.35%      30.180us         1.35%      30.180us       5.030us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         8.21%     183.975us         8.21%     183.975us      30.662us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.39%      31.060us         1.75%      39.122us       3.260us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.36%       8.062us         0.36%       8.062us       0.672us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         1.74%      39.010us         1.74%      39.010us       6.502us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.24%       5.491us         0.24%       5.491us       5.491us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.359ms
-Self CUDA time total: 40.608us
+Self CPU time total: 2.242ms
+Self CUDA time total: 40.480us
 
 
 
@@ -4243,23 +4243,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     338.142us       439.39%     338.142us     338.142us             1  
-                                      hf_kernels_rotary         7.05%     174.064us        99.80%       2.465ms       2.465ms       0.000us         0.00%      86.270us      86.270us             1  
-                                            aten::clone         1.20%      29.650us        87.84%       2.170ms     361.584us       0.000us         0.00%      47.071us       7.845us             6  
-                                            aten::copy_         1.42%      34.959us        85.36%       2.108ms     351.395us      37.759us        49.06%      47.071us       7.845us             6  
-                          _rotary_dba7d1e::apply_rotary         1.66%      41.022us         3.32%      82.043us      13.674us      39.199us        50.94%      39.199us       6.533us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      39.199us        50.94%      39.199us       6.533us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      37.759us        49.06%      37.759us       6.293us             6  
-                                Activity Buffer Request        75.49%       1.864ms        75.49%       1.864ms       1.864ms       9.312us        12.10%       9.312us       9.312us             1  
-                                    aten::empty_strided         1.27%      31.482us         1.27%      31.482us       5.247us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         8.46%     208.956us         8.46%     208.956us      34.826us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.25%      30.771us         1.59%      39.290us       3.274us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.34%       8.519us         0.34%       8.519us       0.710us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         1.66%      41.021us         1.66%      41.021us       6.837us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.20%       5.010us         0.20%       5.010us       5.010us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     323.101us       423.90%     323.101us     323.101us             1  
+                                      hf_kernels_rotary         7.30%     168.335us        99.78%       2.301ms       2.301ms       0.000us         0.00%      85.500us      85.500us             1  
+                                            aten::clone         1.11%      25.632us        87.37%       2.015ms     335.877us       0.000us         0.00%      46.364us       7.727us             6  
+                                            aten::copy_         1.44%      33.260us        84.98%       1.960ms     326.665us      37.085us        48.65%      46.364us       7.727us             6  
+                          _rotary_dba7d1e::apply_rotary         1.70%      39.159us         3.44%      79.421us      13.237us      39.136us        51.35%      39.136us       6.523us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      39.136us        51.35%      39.136us       6.523us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      37.085us        48.65%      37.085us       6.181us             6  
+                                Activity Buffer Request        75.65%       1.745ms        75.65%       1.745ms       1.745ms       9.279us        12.17%       9.279us       9.279us             1  
+                                    aten::empty_strided         1.29%      29.640us         1.29%      29.640us       4.940us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         7.89%     181.984us         7.89%     181.984us      30.331us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.33%      30.620us         1.67%      38.470us       3.206us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.34%       7.850us         0.34%       7.850us       0.654us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         1.75%      40.262us         1.75%      40.262us       6.710us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.22%       5.000us         0.22%       5.000us       5.000us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.470ms
-Self CUDA time total: 76.958us
+Self CPU time total: 2.306ms
+Self CUDA time total: 76.221us
 
 
 
@@ -4269,23 +4269,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     343.679us       247.41%     343.679us     343.679us             1  
-                                      hf_kernels_rotary         7.47%     171.684us        99.76%       2.294ms       2.294ms       0.000us         0.00%     162.559us     162.559us             1  
-                                            aten::clone         1.25%      28.681us        86.89%       1.998ms     333.015us       0.000us         0.00%     102.592us      17.099us             6  
-                                            aten::copy_         1.49%      34.332us        84.20%       1.936ms     322.703us      78.944us        56.83%     102.592us      17.099us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      78.944us        56.83%      78.944us      13.157us             6  
-                          _rotary_dba7d1e::apply_rotary         1.83%      42.151us         3.66%      84.183us      14.030us      59.967us        43.17%      59.967us       9.995us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      59.967us        43.17%      59.967us       9.995us             6  
-                                Activity Buffer Request        73.81%       1.697ms        73.81%       1.697ms       1.697ms      23.648us        17.02%      23.648us      23.648us             1  
-                                    aten::empty_strided         1.44%      33.190us         1.44%      33.190us       5.532us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         8.89%     204.504us         8.89%     204.504us      34.084us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.37%      31.511us         1.74%      40.120us       3.343us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.37%       8.609us         0.37%       8.609us       0.717us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         1.83%      42.032us         1.83%      42.032us       7.005us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.24%       5.461us         0.24%       5.461us       5.461us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     328.643us       235.88%     328.643us     328.643us             1  
+                                      hf_kernels_rotary         7.36%     166.641us        99.76%       2.258ms       2.258ms       0.000us         0.00%     163.042us     163.042us             1  
+                                            aten::clone         1.33%      30.021us        87.18%       1.973ms     328.842us       0.000us         0.00%     102.882us      17.147us             6  
+                                            aten::copy_         1.51%      34.171us        84.51%       1.913ms     318.757us      79.169us        56.82%     102.882us      17.147us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      79.169us        56.82%      79.169us      13.195us             6  
+                          _rotary_dba7d1e::apply_rotary         1.75%      39.570us         3.52%      79.562us      13.260us      60.160us        43.18%      60.160us      10.027us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      60.160us        43.18%      60.160us      10.027us             6  
+                                Activity Buffer Request        74.82%       1.693ms        74.82%       1.693ms       1.693ms      23.713us        17.02%      23.713us      23.713us             1  
+                                    aten::empty_strided         1.35%      30.491us         1.35%      30.491us       5.082us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         8.17%     185.005us         8.17%     185.005us      30.834us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.35%      30.551us         1.70%      38.372us       3.198us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.35%       7.821us         0.35%       7.821us       0.652us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         1.77%      39.992us         1.77%      39.992us       6.665us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.24%       5.501us         0.24%       5.501us       5.501us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.300ms
-Self CUDA time total: 138.911us
+Self CPU time total: 2.263ms
+Self CUDA time total: 139.329us
 
 
 
@@ -4295,23 +4295,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         6.47%     175.111us        88.03%       2.384ms       2.384ms       0.000us         0.00%     770.847us     770.847us             1  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     713.695us       101.13%     713.695us     713.695us             1  
-                                            aten::clone         1.03%      27.812us        76.48%       2.071ms     345.215us       0.000us         0.00%     568.671us      94.778us             6  
-                                            aten::copy_         1.35%      36.632us        74.30%       2.012ms     335.367us     503.551us        71.35%     568.671us      94.778us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     503.551us        71.35%     503.551us      83.925us             6  
-                          _rotary_dba7d1e::apply_rotary         1.88%      50.972us         3.57%      96.775us      16.129us     202.176us        28.65%     202.176us      33.696us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us     202.176us        28.65%     202.176us      33.696us             6  
-                                Activity Buffer Request        65.43%       1.772ms        65.43%       1.772ms       1.772ms      65.120us         9.23%      65.120us      65.120us             1  
-                                    aten::empty_strided         1.15%      31.280us         1.15%      31.280us       5.213us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         7.52%     203.605us         7.52%     203.605us      33.934us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.18%      32.050us         1.51%      41.000us       3.417us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.33%       8.950us         0.33%       8.950us       0.746us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         1.69%      45.803us         1.69%      45.803us       7.634us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        11.97%     324.077us        11.97%     324.077us     324.077us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         6.40%     173.445us        87.62%       2.374ms       2.374ms       0.000us         0.00%     766.307us     766.307us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     707.554us       101.09%     707.554us     707.554us             1  
+                                            aten::clone         0.97%      26.320us        76.46%       2.071ms     345.229us       0.000us         0.00%     566.210us      94.368us             6  
+                                            aten::copy_         1.24%      33.572us        74.40%       2.016ms     335.956us     499.842us        71.41%     566.210us      94.368us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     499.842us        71.41%     499.842us      83.307us             6  
+                          _rotary_dba7d1e::apply_rotary         1.84%      49.730us         3.34%      90.461us      15.077us     200.097us        28.59%     200.097us      33.350us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us     200.097us        28.59%     200.097us      33.350us             6  
+                                Activity Buffer Request        66.43%       1.800ms        66.43%       1.800ms       1.800ms      66.368us         9.48%      66.368us      66.368us             1  
+                                    aten::empty_strided         1.08%      29.321us         1.08%      29.321us       4.887us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         6.73%     182.313us         6.73%     182.313us      30.385us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.12%      30.440us         1.42%      38.441us       3.203us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.30%       8.001us         0.30%       8.001us       0.667us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         1.50%      40.731us         1.50%      40.731us       6.788us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        12.38%     335.537us        12.38%     335.537us     335.537us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.708ms
-Self CUDA time total: 705.727us
+Self CPU time total: 2.709ms
+Self CUDA time total: 699.939us
 
 
 
@@ -4321,23 +4321,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     336.477us      1265.33%     336.477us     336.477us             1  
-                                      hf_kernels_rotary         7.39%     171.995us        99.77%       2.322ms       2.322ms       0.000us         0.00%      27.904us      27.904us             1  
-                          _rotary_dba7d1e::apply_rotary         1.76%      41.061us         3.57%      83.061us      13.844us      18.816us        70.76%      18.816us       3.136us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      18.816us        70.76%      18.816us       3.136us             6  
-                                            aten::clone         1.24%      28.803us        87.11%       2.027ms     337.843us       0.000us         0.00%       9.088us       1.515us             6  
-                                            aten::copy_         1.64%      38.082us        84.50%       1.966ms     327.735us       7.776us        29.24%       9.088us       1.515us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.776us        29.24%       7.776us       1.296us             6  
-                                Activity Buffer Request        72.85%       1.695ms        72.85%       1.695ms       1.695ms       1.312us         4.93%       1.312us       1.312us             1  
-                                    aten::empty_strided         1.37%      31.849us         1.37%      31.849us       5.308us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        10.01%     232.925us        10.01%     232.925us      38.821us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.33%      30.872us         1.71%      39.700us       3.308us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.38%       8.828us         0.38%       8.828us       0.736us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         1.80%      42.000us         1.80%      42.000us       7.000us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.23%       5.320us         0.23%       5.320us       5.320us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     326.335us      1221.36%     326.335us     326.335us             1  
+                                      hf_kernels_rotary         7.45%     166.364us        99.75%       2.229ms       2.229ms       0.000us         0.00%      28.063us      28.063us             1  
+                          _rotary_dba7d1e::apply_rotary         1.91%      42.732us         3.76%      83.973us      13.996us      18.815us        70.42%      18.815us       3.136us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      18.815us        70.42%      18.815us       3.136us             6  
+                                            aten::clone         1.19%      26.589us        86.78%       1.939ms     323.190us       0.000us         0.00%       9.248us       1.541us             6  
+                                            aten::copy_         1.45%      32.469us        84.19%       1.881ms     313.518us       7.904us        29.58%       9.248us       1.541us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.904us        29.58%       7.904us       1.317us             6  
+                                Activity Buffer Request        74.65%       1.668ms        74.65%       1.668ms       1.668ms       1.344us         5.03%       1.344us       1.344us             1  
+                                    aten::empty_strided         1.41%      31.442us         1.41%      31.442us       5.240us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         8.08%     180.586us         8.08%     180.586us      30.098us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.40%      31.310us         1.76%      39.389us       3.282us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.36%       8.079us         0.36%       8.079us       0.673us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         1.85%      41.241us         1.85%      41.241us       6.873us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.25%       5.600us         0.25%       5.600us       5.600us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.327ms
-Self CUDA time total: 26.592us
+Self CPU time total: 2.234ms
+Self CUDA time total: 26.719us
 
 
 
@@ -4347,23 +4347,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     361.593us      1356.47%     361.593us     361.593us             1  
-                                      hf_kernels_rotary        20.01%     156.574us        99.29%     776.889us     776.889us       0.000us         0.00%      27.969us      27.969us             1  
-                          _rotary_dba7d1e::apply_rotary         6.08%      47.572us        11.90%      93.114us      15.519us      18.881us        70.83%      18.881us       3.147us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      18.881us        70.83%      18.881us       3.147us             6  
-                                            aten::clone         2.86%      22.401us        61.73%     483.012us      80.502us       0.000us         0.00%       9.088us       1.515us             6  
-                                            aten::copy_         5.02%      39.248us        54.77%     428.540us      71.423us       7.776us        29.17%       9.088us       1.515us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.776us        29.17%       7.776us       1.296us             6  
-                                Activity Buffer Request        23.43%     183.305us        23.43%     183.305us     183.305us       1.312us         4.92%       1.312us       1.312us             1  
-                                    aten::empty_strided         4.10%      32.071us         4.10%      32.071us       5.345us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        26.33%     205.987us        26.33%     205.987us      34.331us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.44%      34.709us         5.65%      44.189us       3.682us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         1.21%       9.480us         1.21%       9.480us       0.790us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         5.82%      45.542us         5.82%      45.542us       7.590us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.71%       5.560us         0.71%       5.560us       5.560us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     321.948us      1206.29%     321.948us     321.948us             1  
+                                      hf_kernels_rotary        18.26%     148.741us        99.36%     809.528us     809.528us       0.000us         0.00%      28.001us      28.001us             1  
+                          _rotary_dba7d1e::apply_rotary         5.08%      41.412us        10.07%      82.013us      13.669us      18.880us        70.74%      18.880us       3.147us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      18.880us        70.74%      18.880us       3.147us             6  
+                                            aten::clone         2.30%      18.723us        66.05%     538.143us      89.691us       0.000us         0.00%       9.121us       1.520us             6  
+                                            aten::copy_         4.35%      35.479us        60.21%     490.570us      81.762us       7.809us        29.26%       9.121us       1.520us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.809us        29.26%       7.809us       1.301us             6  
+                                Activity Buffer Request        33.00%     268.886us        33.00%     268.886us     268.886us       1.312us         4.92%       1.312us       1.312us             1  
+                                    aten::empty_strided         3.54%      28.850us         3.54%      28.850us       4.808us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        22.85%     186.205us        22.85%     186.205us      31.034us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.00%      32.551us         4.99%      40.631us       3.386us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.99%       8.080us         0.99%       8.080us       0.673us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         4.98%      40.601us         4.98%      40.601us       6.767us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.64%       5.220us         0.64%       5.220us       5.220us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 782.449us
-Self CUDA time total: 26.657us
+Self CPU time total: 814.748us
+Self CUDA time total: 26.689us
 
 
 
@@ -4373,23 +4373,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     329.530us      1073.84%     329.530us     329.530us             1  
-                                      hf_kernels_rotary        18.69%     150.673us        99.29%     800.610us     800.610us       0.000us         0.00%      32.447us      32.447us             1  
-                          _rotary_dba7d1e::apply_rotary         5.09%      41.061us        10.37%      83.622us      13.937us      20.159us        65.69%      20.159us       3.360us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      20.159us        65.69%      20.159us       3.360us             6  
-                                            aten::clone         2.45%      19.762us        65.24%     526.013us      87.669us       0.000us         0.00%      12.288us       2.048us             6  
-                                            aten::copy_         4.31%      34.749us        58.96%     475.401us      79.234us      10.528us        34.31%      12.288us       2.048us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.528us        34.31%      10.528us       1.755us             6  
-                                Activity Buffer Request        29.87%     240.876us        29.87%     240.876us     240.876us       1.760us         5.74%       1.760us       1.760us             1  
-                                    aten::empty_strided         3.83%      30.850us         3.83%      30.850us       5.142us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        24.78%     199.776us        24.78%     199.776us      33.296us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         3.92%      31.582us         5.00%      40.302us       3.358us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         1.08%       8.720us         1.08%       8.720us       0.727us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         5.28%      42.561us         5.28%      42.561us       7.094us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.71%       5.701us         0.71%       5.701us       5.701us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     317.754us      1035.40%     317.754us     317.754us             1  
+                                      hf_kernels_rotary        20.08%     146.926us        99.34%     726.766us     726.766us       0.000us         0.00%      32.481us      32.481us             1  
+                          _rotary_dba7d1e::apply_rotary         5.67%      41.461us        11.01%      80.552us      13.425us      20.192us        65.80%      20.192us       3.365us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      20.192us        65.80%      20.192us       3.365us             6  
+                                            aten::clone         2.73%      19.937us        62.86%     459.858us      76.643us       0.000us         0.00%      12.289us       2.048us             6  
+                                            aten::copy_         4.61%      33.700us        56.15%     410.789us      68.465us      10.497us        34.20%      12.289us       2.048us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.497us        34.20%      10.497us       1.750us             6  
+                                Activity Buffer Request        26.92%     196.965us        26.92%     196.965us     196.965us       1.792us         5.84%       1.792us       1.792us             1  
+                                    aten::empty_strided         3.98%      29.132us         3.98%      29.132us       4.855us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        24.62%     180.124us        24.62%     180.124us      30.021us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.28%      31.330us         5.39%      39.430us       3.286us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.11%       8.100us         1.11%       8.100us       0.675us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         5.34%      39.091us         5.34%      39.091us       6.515us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.66%       4.850us         0.66%       4.850us       4.850us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 806.311us
-Self CUDA time total: 30.687us
+Self CPU time total: 731.616us
+Self CUDA time total: 30.689us
 
 
 
@@ -4399,23 +4399,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     329.435us       772.31%     329.435us     329.435us             1  
-                                      hf_kernels_rotary        20.10%     145.342us        99.21%     717.528us     717.528us       0.000us         0.00%      45.504us      45.504us             1  
-                          _rotary_dba7d1e::apply_rotary         5.72%      41.400us        11.55%      83.552us      13.925us      25.568us        59.94%      25.568us       4.261us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      25.568us        59.94%      25.568us       4.261us             6  
-                                            aten::clone         2.77%      20.032us        61.99%     448.302us      74.717us       0.000us         0.00%      19.936us       3.323us             6  
-                                            aten::copy_         4.93%      35.690us        55.04%     398.089us      66.348us      17.088us        40.06%      19.936us       3.323us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.088us        40.06%      17.088us       2.848us             6  
-                                Activity Buffer Request        22.43%     162.214us        22.43%     162.214us     162.214us       2.848us         6.68%       2.848us       2.848us             1  
-                                    aten::empty_strided         4.17%      30.181us         4.17%      30.181us       5.030us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        27.68%     200.185us        27.68%     200.185us      33.364us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.37%      31.593us         5.58%      40.332us       3.361us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         1.21%       8.739us         1.21%       8.739us       0.728us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         5.83%      42.152us         5.83%      42.152us       7.025us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.79%       5.700us         0.79%       5.700us       5.700us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     319.429us       749.90%     319.429us     319.429us             1  
+                                      hf_kernels_rotary        19.19%     146.865us        99.27%     759.737us     759.737us       0.000us         0.00%      45.476us      45.476us             1  
+                          _rotary_dba7d1e::apply_rotary         5.51%      42.192us        10.66%      81.562us      13.594us      25.698us        60.33%      25.698us       4.283us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      25.698us        60.33%      25.698us       4.283us             6  
+                                            aten::clone         2.51%      19.231us        64.35%     492.451us      82.075us       0.000us         0.00%      19.778us       3.296us             6  
+                                            aten::copy_         4.10%      31.370us        57.94%     443.419us      73.903us      16.898us        39.67%      19.778us       3.296us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.898us        39.67%      16.898us       2.816us             6  
+                                Activity Buffer Request        30.11%     230.405us        30.11%     230.405us     230.405us       2.880us         6.76%       2.880us       2.880us             1  
+                                    aten::empty_strided         3.89%      29.801us         3.89%      29.801us       4.967us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        23.73%     181.644us        23.73%     181.644us      30.274us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.99%      30.570us         5.08%      38.859us       3.238us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.08%       8.289us         1.08%       8.289us       0.691us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         5.14%      39.370us         5.14%      39.370us       6.562us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.73%       5.590us         0.73%       5.590us       5.590us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 723.228us
-Self CUDA time total: 42.656us
+Self CPU time total: 765.327us
+Self CUDA time total: 42.596us
 
 
 
@@ -4425,23 +4425,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     330.651us      1075.32%     330.651us     330.651us             1  
-                                      hf_kernels_rotary        18.72%     150.931us        99.33%     800.790us     800.790us       0.000us         0.00%      32.508us      32.508us             1  
-                          _rotary_dba7d1e::apply_rotary         5.21%      41.984us        10.24%      82.545us      13.758us      20.318us        66.08%      20.318us       3.386us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      20.318us        66.08%      20.318us       3.386us             6  
-                                            aten::clone         2.51%      20.253us        65.40%     527.224us      87.871us       0.000us         0.00%      12.190us       2.032us             6  
-                                            aten::copy_         4.39%      35.371us        59.06%     476.161us      79.360us      10.431us        33.92%      12.190us       2.032us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.431us        33.92%      10.431us       1.738us             6  
-                                Activity Buffer Request        30.20%     243.496us        30.20%     243.496us     243.496us       1.759us         5.72%       1.759us       1.759us             1  
-                                    aten::empty_strided         3.82%      30.810us         3.82%      30.810us       5.135us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        24.47%     197.294us        24.47%     197.294us      32.882us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         3.87%      31.220us         4.97%      40.090us       3.341us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         1.10%       8.870us         1.10%       8.870us       0.739us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         5.03%      40.561us         5.03%      40.561us       6.760us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.67%       5.380us         0.67%       5.380us       5.380us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     322.049us      1047.28%     322.049us     322.049us             1  
+                                      hf_kernels_rotary        18.29%     149.504us        99.36%     812.348us     812.348us       0.000us         0.00%      32.511us      32.511us             1  
+                          _rotary_dba7d1e::apply_rotary         5.08%      41.522us        10.02%      81.922us      13.654us      20.224us        65.77%      20.224us       3.371us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      20.224us        65.77%      20.224us       3.371us             6  
+                                            aten::clone         2.45%      20.060us        66.16%     540.872us      90.145us       0.000us         0.00%      12.287us       2.048us             6  
+                                            aten::copy_         4.16%      33.972us        60.11%     491.422us      81.904us      10.527us        34.23%      12.287us       2.048us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.527us        34.23%      10.527us       1.754us             6  
+                                Activity Buffer Request        32.85%     268.566us        32.85%     268.566us     268.566us       1.760us         5.72%       1.760us       1.760us             1  
+                                    aten::empty_strided         3.59%      29.390us         3.59%      29.390us       4.898us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        23.10%     188.884us        23.10%     188.884us      31.481us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.91%      31.930us         4.90%      40.050us       3.337us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.99%       8.120us         0.99%       8.120us       0.677us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         4.94%      40.400us         4.94%      40.400us       6.733us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.64%       5.230us         0.64%       5.230us       5.230us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 806.170us
-Self CUDA time total: 30.749us
+Self CPU time total: 817.578us
+Self CUDA time total: 30.751us
 
 
 
@@ -4451,23 +4451,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     330.616us       770.43%     330.616us     330.616us             1  
-                                      hf_kernels_rotary        17.99%     148.834us        99.35%     822.020us     822.020us       0.000us         0.00%      45.761us      45.761us             1  
-                          _rotary_dba7d1e::apply_rotary         5.00%      41.361us        10.40%      86.012us      14.335us      25.857us        60.25%      25.857us       4.310us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      25.857us        60.25%      25.857us       4.310us             6  
-                                            aten::clone         2.40%      19.879us        66.22%     547.932us      91.322us       0.000us         0.00%      19.904us       3.317us             6  
-                                            aten::copy_         4.29%      35.520us        60.10%     497.262us      82.877us      17.056us        39.75%      19.904us       3.317us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.056us        39.75%      17.056us       2.843us             6  
-                                Activity Buffer Request        32.00%     264.767us        32.00%     264.767us     264.767us       2.848us         6.64%       2.848us       2.848us             1  
-                                    aten::empty_strided         3.72%      30.791us         3.72%      30.791us       5.132us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        23.81%     196.975us        23.81%     196.975us      32.829us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         3.76%      31.140us         4.74%      39.242us       3.270us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.98%       8.102us         0.98%       8.102us       0.675us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         5.40%      44.651us         5.40%      44.651us       7.442us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.65%       5.369us         0.65%       5.369us       5.369us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     316.538us       739.30%     316.538us     316.538us             1  
+                                      hf_kernels_rotary        19.03%     145.425us        99.27%     758.787us     758.787us       0.000us         0.00%      45.664us      45.664us             1  
+                          _rotary_dba7d1e::apply_rotary         5.13%      39.179us        10.36%      79.161us      13.194us      25.792us        60.24%      25.792us       4.299us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      25.792us        60.24%      25.792us       4.299us             6  
+                                            aten::clone         2.60%      19.840us        64.72%     494.691us      82.449us       0.000us         0.00%      19.872us       3.312us             6  
+                                            aten::copy_         4.32%      33.011us        58.33%     445.830us      74.305us      17.024us        39.76%      19.872us       3.312us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.024us        39.76%      17.024us       2.837us             6  
+                                Activity Buffer Request        30.09%     230.025us        30.09%     230.025us     230.025us       2.848us         6.65%       2.848us       2.848us             1  
+                                    aten::empty_strided         3.80%      29.021us         3.80%      29.021us       4.837us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        23.92%     182.794us        23.92%     182.794us      30.466us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.11%      31.419us         5.17%      39.510us       3.293us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.06%       8.091us         1.06%       8.091us       0.674us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         5.23%      39.982us         5.23%      39.982us       6.664us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.73%       5.560us         0.73%       5.560us       5.560us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 827.389us
-Self CUDA time total: 42.913us
+Self CPU time total: 764.347us
+Self CUDA time total: 42.816us
 
 
 
@@ -4477,23 +4477,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     334.653us       358.15%     334.653us     334.653us             1  
-                                      hf_kernels_rotary        18.09%     151.114us        99.35%     829.780us     829.780us       0.000us         0.00%     109.215us     109.215us             1  
-                                            aten::clone         3.33%      27.839us        66.37%     554.323us      92.387us       0.000us         0.00%      68.064us      11.344us             6  
-                                            aten::copy_         4.18%      34.911us        59.27%     495.081us      82.513us      52.288us        55.96%      68.064us      11.344us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      52.288us        55.96%      52.288us       8.715us             6  
-                          _rotary_dba7d1e::apply_rotary         4.95%      41.342us         9.97%      83.303us      13.884us      41.151us        44.04%      41.151us       6.858us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      41.151us        44.04%      41.151us       6.858us             6  
-                                Activity Buffer Request        31.64%     264.256us        31.64%     264.256us     264.256us      15.776us        16.88%      15.776us      15.776us             1  
-                                    aten::empty_strided         3.76%      31.403us         3.76%      31.403us       5.234us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        23.46%     195.914us        23.46%     195.914us      32.652us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         3.87%      32.310us         4.91%      41.040us       3.420us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         1.05%       8.730us         1.05%       8.730us       0.728us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         5.02%      41.961us         5.02%      41.961us       6.994us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.65%       5.470us         0.65%       5.470us       5.470us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     320.924us       345.95%     320.924us     320.924us             1  
+                                      hf_kernels_rotary        18.21%     146.024us        99.34%     796.707us     796.707us       0.000us         0.00%     108.351us     108.351us             1  
+                                            aten::clone         2.45%      19.620us        65.99%     529.232us      88.205us       0.000us         0.00%      67.071us      11.179us             6  
+                                            aten::copy_         4.16%      33.361us        60.01%     481.261us      80.210us      51.487us        55.50%      67.071us      11.179us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      51.487us        55.50%      51.487us       8.581us             6  
+                          _rotary_dba7d1e::apply_rotary         5.00%      40.060us        10.26%      82.291us      13.715us      41.280us        44.50%      41.280us       6.880us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      41.280us        44.50%      41.280us       6.880us             6  
+                                Activity Buffer Request        33.32%     267.235us        33.32%     267.235us     267.235us      15.584us        16.80%      15.584us      15.584us             1  
+                                    aten::empty_strided         3.54%      28.351us         3.54%      28.351us       4.725us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        22.53%     180.665us        22.53%     180.665us      30.111us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.88%      31.108us         4.88%      39.160us       3.263us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.00%       8.052us         1.00%       8.052us       0.671us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         5.27%      42.231us         5.27%      42.231us       7.038us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.66%       5.290us         0.66%       5.290us       5.290us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 835.250us
-Self CUDA time total: 93.439us
+Self CPU time total: 801.997us
+Self CUDA time total: 92.767us
 
 
 
@@ -4503,23 +4503,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     402.293us       278.14%     402.293us     402.293us             1  
-                                      hf_kernels_rotary        18.23%     162.247us        99.42%     884.932us     884.932us       0.000us         0.00%     168.347us     168.347us             1  
-                                            aten::clone         2.56%      22.809us        61.35%     546.082us      91.014us       0.000us         0.00%     105.212us      17.535us             6  
-                                            aten::copy_         3.90%      34.711us        55.16%     490.942us      81.824us      81.501us        56.35%     105.212us      17.535us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      81.501us        56.35%      81.501us      13.584us             6  
-                          _rotary_dba7d1e::apply_rotary         4.99%      44.421us        15.06%     134.003us      22.334us      63.135us        43.65%      63.135us      10.522us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      63.135us        43.65%      63.135us      10.522us             6  
-                                Activity Buffer Request        29.54%     262.907us        29.54%     262.907us     262.907us      23.711us        16.39%      23.711us      23.711us             1  
-                                    aten::empty_strided         3.63%      32.331us         3.63%      32.331us       5.388us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        21.72%     193.324us        21.72%     193.324us      32.221us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         3.75%      33.378us         4.79%      42.600us       3.550us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         1.04%       9.222us         1.04%       9.222us       0.769us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel        10.06%      89.582us        10.06%      89.582us      14.930us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.58%       5.120us         0.58%       5.120us       5.120us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     328.056us       226.11%     328.056us     328.056us             1  
+                                      hf_kernels_rotary        19.08%     156.414us        99.34%     814.518us     814.518us       0.000us         0.00%     168.799us     168.799us             1  
+                                            aten::clone         3.19%      26.153us        65.07%     533.532us      88.922us       0.000us         0.00%     105.343us      17.557us             6  
+                                            aten::copy_         3.98%      32.659us        58.36%     478.499us      79.750us      81.631us        56.26%     105.343us      17.557us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      81.631us        56.26%      81.631us      13.605us             6  
+                          _rotary_dba7d1e::apply_rotary         5.18%      42.511us        10.36%      84.962us      14.160us      63.456us        43.74%      63.456us      10.576us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      63.456us        43.74%      63.456us      10.576us             6  
+                                Activity Buffer Request        32.74%     268.426us        32.74%     268.426us     268.426us      23.712us        16.34%      23.712us      23.712us             1  
+                                    aten::empty_strided         3.52%      28.880us         3.52%      28.880us       4.813us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        21.64%     177.414us        21.64%     177.414us      29.569us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.88%      31.780us         4.83%      39.610us       3.301us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.95%       7.830us         0.95%       7.830us       0.652us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         5.18%      42.451us         5.18%      42.451us       7.075us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.66%       5.440us         0.66%       5.440us       5.440us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 890.052us
-Self CUDA time total: 144.636us
+Self CPU time total: 819.958us
+Self CUDA time total: 145.087us
 
 
 
@@ -4529,23 +4529,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     335.030us       423.20%     335.030us     335.030us             1  
-                                      hf_kernels_rotary        17.60%     149.462us        99.35%     843.781us     843.781us       0.000us         0.00%      89.342us      89.342us             1  
-                                            aten::clone         2.76%      23.471us        67.14%     570.174us      95.029us       0.000us         0.00%      47.328us       7.888us             6  
-                                            aten::copy_         4.78%      40.570us        60.71%     515.622us      85.937us      37.152us        46.93%      47.328us       7.888us             6  
-                          _rotary_dba7d1e::apply_rotary         4.86%      41.293us         9.90%      84.043us      14.007us      42.014us        53.07%      42.014us       7.002us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      42.014us        53.07%      42.014us       7.002us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      37.152us        46.93%      37.152us       6.192us             6  
-                                Activity Buffer Request        33.04%     280.637us        33.04%     280.637us     280.637us      10.176us        12.85%      10.176us      10.176us             1  
-                                    aten::empty_strided         3.66%      31.081us         3.66%      31.081us       5.180us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        22.89%     194.415us        22.89%     194.415us      32.403us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         3.65%      30.971us         4.72%      40.102us       3.342us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         1.08%       9.131us         1.08%       9.131us       0.761us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         5.03%      42.750us         5.03%      42.750us       7.125us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.65%       5.500us         0.65%       5.500us       5.500us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     333.210us       419.37%     333.210us     333.210us             1  
+                                      hf_kernels_rotary        17.73%     150.595us        99.35%     844.049us     844.049us       0.000us         0.00%      89.630us      89.630us             1  
+                                            aten::clone         2.52%      21.432us        67.08%     569.832us      94.972us       0.000us         0.00%      47.583us       7.930us             6  
+                                            aten::copy_         3.93%      33.381us        61.05%     518.601us      86.433us      37.407us        47.08%      47.583us       7.930us             6  
+                          _rotary_dba7d1e::apply_rotary         4.77%      40.491us         9.89%      84.021us      14.004us      42.047us        52.92%      42.047us       7.008us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      42.047us        52.92%      42.047us       7.008us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      37.407us        47.08%      37.407us       6.235us             6  
+                                Activity Buffer Request        36.09%     306.637us        36.09%     306.637us     306.637us      10.176us        12.81%      10.176us      10.176us             1  
+                                    aten::empty_strided         3.51%      29.799us         3.51%      29.799us       4.966us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        21.02%     178.583us        21.02%     178.583us      29.764us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.71%      31.542us         4.66%      39.601us       3.300us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.95%       8.059us         0.95%       8.059us       0.672us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         5.12%      43.530us         5.12%      43.530us       7.255us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.65%       5.480us         0.65%       5.480us       5.480us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 849.281us
-Self CUDA time total: 79.166us
+Self CPU time total: 849.529us
+Self CUDA time total: 79.454us
 
 
 
@@ -4555,23 +4555,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     335.451us       231.31%     335.451us     335.451us             1  
-                                      hf_kernels_rotary        20.10%     149.473us        99.28%     738.378us     738.378us       0.000us         0.00%     168.637us     168.637us             1  
-                                            aten::clone         2.71%      20.190us        62.55%     465.160us      77.527us       0.000us         0.00%     104.892us      17.482us             6  
-                                            aten::copy_         4.80%      35.671us        55.64%     413.780us      68.963us      81.277us        56.04%     104.892us      17.482us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      81.277us        56.04%      81.277us      13.546us             6  
-                          _rotary_dba7d1e::apply_rotary         5.48%      40.762us        11.33%      84.273us      14.046us      63.745us        43.96%      63.745us      10.624us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      63.745us        43.96%      63.745us      10.624us             6  
-                                Activity Buffer Request        25.22%     187.575us        25.22%     187.575us     187.575us      23.615us        16.28%      23.615us      23.615us             1  
-                                    aten::empty_strided         4.19%      31.190us         4.19%      31.190us       5.198us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        25.62%     190.534us        25.62%     190.534us      31.756us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.11%      30.580us         5.31%      39.472us       3.289us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         1.20%       8.892us         1.20%       8.892us       0.741us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         5.85%      43.511us         5.85%      43.511us       7.252us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.72%       5.340us         0.72%       5.340us       5.340us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     324.316us       222.02%     324.316us     324.316us             1  
+                                      hf_kernels_rotary        18.22%     148.243us        99.33%     808.388us     808.388us       0.000us         0.00%     169.820us     169.820us             1  
+                                            aten::clone         2.32%      18.910us        66.43%     540.572us      90.095us       0.000us         0.00%     105.630us      17.605us             6  
+                                            aten::copy_         4.07%      33.161us        60.40%     491.511us      81.919us      81.886us        56.06%     105.630us      17.605us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      81.886us        56.06%      81.886us      13.648us             6  
+                          _rotary_dba7d1e::apply_rotary         4.96%      40.400us        10.02%      81.542us      13.590us      64.190us        43.94%      64.190us      10.698us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      64.190us        43.94%      64.190us      10.698us             6  
+                                Activity Buffer Request        33.97%     276.476us        33.97%     276.476us     276.476us      23.744us        16.25%      23.744us      23.744us             1  
+                                    aten::empty_strided         3.70%      30.151us         3.70%      30.151us       5.025us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        22.35%     181.874us        22.35%     181.874us      30.312us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.71%      30.201us         4.67%      38.031us       3.169us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.96%       7.830us         0.96%       7.830us       0.652us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         5.06%      41.142us         5.06%      41.142us       6.857us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.67%       5.420us         0.67%       5.420us       5.420us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 743.718us
-Self CUDA time total: 145.022us
+Self CPU time total: 813.808us
+Self CUDA time total: 146.076us
 
 
 
@@ -4581,23 +4581,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary        13.47%     148.469us        71.31%     785.819us     785.819us       0.000us         0.00%     741.458us     741.458us             1  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     683.092us       101.20%     683.092us     683.092us             1  
-                                            aten::clone         1.91%      21.000us        46.53%     512.672us      85.445us       0.000us         0.00%     558.390us      93.065us             6  
-                                            aten::copy_         3.10%      34.171us        41.83%     460.962us      76.827us     491.927us        72.88%     558.390us      93.065us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     491.927us        72.88%     491.927us      81.988us             6  
-                          _rotary_dba7d1e::apply_rotary         3.81%      41.992us         7.74%      85.293us      14.215us     183.068us        27.12%     183.068us      30.511us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us     183.068us        27.12%     183.068us      30.511us             6  
-                                Activity Buffer Request        21.20%     233.636us        21.20%     233.636us     233.636us      66.463us         9.85%      66.463us      66.463us             1  
-                                    aten::empty_strided         2.79%      30.710us         2.79%      30.710us       5.118us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        17.53%     193.155us        17.53%     193.155us      32.192us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.79%      30.724us         3.57%      39.385us       3.282us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.79%       8.661us         0.79%       8.661us       0.722us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         3.93%      43.301us         3.93%      43.301us       7.217us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        28.69%     316.097us        28.69%     316.097us     316.097us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary        13.82%     153.005us        74.75%     827.548us     827.548us       0.000us         0.00%     740.918us     740.918us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     682.102us       101.12%     682.102us     682.102us             1  
+                                            aten::clone         1.78%      19.750us        46.80%     518.112us      86.352us       0.000us         0.00%     558.489us      93.082us             6  
+                                            aten::copy_         3.14%      34.772us        42.33%     468.681us      78.114us     492.121us        72.96%     558.489us      93.082us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     492.121us        72.96%     492.121us      82.020us             6  
+                          _rotary_dba7d1e::apply_rotary         3.85%      42.609us         7.73%      85.591us      14.265us     182.429us        27.04%     182.429us      30.405us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us     182.429us        27.04%     182.429us      30.405us             6  
+                                Activity Buffer Request        22.69%     251.255us        22.69%     251.255us     251.255us      66.368us         9.84%      66.368us      66.368us             1  
+                                    aten::empty_strided         2.68%      29.681us         2.68%      29.681us       4.947us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        16.50%     182.654us        16.50%     182.654us      30.442us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         5.62%      62.189us         6.40%      70.840us       5.903us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.78%       8.651us         0.78%       8.651us       0.721us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         3.88%      42.982us         3.88%      42.982us       7.164us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        25.25%     279.606us        25.25%     279.606us     279.606us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.102ms
-Self CUDA time total: 674.995us
+Self CPU time total: 1.107ms
+Self CUDA time total: 674.550us
 
 
 
@@ -4607,30 +4607,30 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         5.24%     147.368us        25.60%     720.668us     720.668us       0.000us         0.00%       2.633ms       2.633ms             1  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us       2.458ms       100.33%       2.458ms       2.458ms             1  
-                                            aten::clone         0.73%      20.601us        15.98%     449.780us      74.963us       0.000us         0.00%       1.394ms     232.377us             6  
-                                            aten::copy_         1.20%      33.732us        14.16%     398.690us      66.448us       1.211ms        49.43%       1.394ms     232.377us             6  
-                          _rotary_dba7d1e::apply_rotary         1.46%      41.000us         3.02%      85.091us      14.182us       1.239ms        50.57%       1.239ms     206.500us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us       1.239ms        50.57%       1.239ms     206.500us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       1.211ms        49.43%       1.211ms     201.812us             6  
-                                Activity Buffer Request         6.06%     170.604us         6.06%     170.604us     170.604us     183.391us         7.49%     183.391us     183.391us             1  
-                                    aten::empty_strided         1.08%      30.489us         1.08%      30.489us       5.081us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         6.90%     194.354us         6.90%     194.354us      32.392us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.07%      30.231us         1.37%      38.429us       3.202us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.29%       8.198us         0.29%       8.198us       0.683us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         1.57%      44.091us         1.57%      44.091us       7.349us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        74.40%       2.094ms        74.40%       2.094ms       2.094ms       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         5.09%     145.551us        26.40%     755.006us     755.006us       0.000us         0.00%       2.627ms       2.627ms             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us       2.456ms       100.32%       2.456ms       2.456ms             1  
+                                            aten::clone         0.65%      18.699us        17.06%     488.000us      81.333us       0.000us         0.00%       1.401ms     233.479us             6  
+                                            aten::copy_         1.14%      32.589us        15.40%     440.469us      73.412us       1.223ms        49.95%       1.401ms     233.479us             6  
+                          _rotary_dba7d1e::apply_rotary         1.42%      40.751us         2.92%      83.492us      13.915us       1.226ms        50.05%       1.226ms     204.274us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us       1.226ms        50.05%       1.226ms     204.274us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       1.223ms        49.95%       1.223ms     203.836us             6  
+                                Activity Buffer Request         7.97%     227.955us         7.97%     227.955us     227.955us     177.858us         7.26%     177.858us     177.858us             1  
+                                    aten::empty_strided         1.01%      28.832us         1.01%      28.832us       4.805us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         6.29%     179.925us         6.29%     179.925us      29.987us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.05%      30.104us         1.33%      37.963us       3.164us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.27%       7.859us         0.27%       7.859us       0.655us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         1.49%      42.741us         1.49%      42.741us       7.124us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        73.60%       2.105ms        73.60%       2.105ms       2.105ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.815ms
-Self CUDA time total: 2.450ms
+Self CPU time total: 2.860ms
+Self CUDA time total: 2.449ms
 
 
 impl                     wl                  p50(ms)  ok
 hf_kernels_rotary        cuda_B1_S128_H32_D128_R64     0.09  True
 hf_kernels_rotary        cuda_B1_S128_H32_D64_R32     0.09  True
 hf_kernels_rotary        cuda_B1_S128_H8_D128_R64     0.09  True
-hf_kernels_rotary        cuda_B1_S128_H8_D64_R32     0.08  True
+hf_kernels_rotary        cuda_B1_S128_H8_D64_R32     0.07  True
 hf_kernels_rotary        cuda_B1_S2048_H32_D128_R64     0.26  True
 hf_kernels_rotary        cuda_B1_S2048_H32_D64_R32     0.09  True
 hf_kernels_rotary        cuda_B1_S2048_H8_D128_R64     0.09  True
@@ -4655,13 +4655,13 @@ hf_kernels_rotary        cuda_B2_S512_H8_D64_R32     0.09  True
 
▶ UV Install Logs
-
Fetching 5 files: 0%| | 0/5 [00:00<?, ?it/s]Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads. - -Fetching 5 files: 40%|████ | 2/5 [00:01<00:02, 1.08it/s] -Fetching 5 files: 100%|██████████| 5/5 [00:01<00:00, 2.71it/s]
+
Fetching 5 files: 0%| | 0/5 [00:00<?, ?it/s] +Fetching 5 files: 60%|██████ | 3/5 [00:00<00:00, 16.43it/s] +Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 12.18it/s] +Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 12.77it/s]

Artifacts:

rotary.jsonl