diff --git "a/rotary/impls/hf_kernels_rotary.html" "b/rotary/impls/hf_kernels_rotary.html" --- "a/rotary/impls/hf_kernels_rotary.html" +++ "b/rotary/impls/hf_kernels_rotary.html" @@ -809,6 +809,14 @@ .artifact-preview svg { background: transparent; } + /* Invert SVG images in dark mode */ + :root[data-theme="dark"] .artifact-preview img[src$=".svg"] { + filter: invert(0.9) hue-rotate(180deg); + } + /* Keep SVG images readable in monocolor mode */ + :root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] { + filter: none; + } /* CSV table styling */ .artifact-csv { margin-top: 1rem; @@ -3871,7 +3879,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: ▼ output ▶ uv-logs | -Cell: nv | 0.20s +Cell: nv | 0.23s | Raw @@ -3887,7 +3895,7 @@ Cell: nv | 0.20s
-
Wed Oct 29 14:26:51 2025       
+
Wed Oct 29 15:50:24 2025       
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+
@@ -3896,7 +3904,7 @@ Cell: nv | 0.20s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   32C    P0             76W /  350W |       0MiB /  46068MiB |     11%      Default |
+| N/A   29C    P0             88W /  350W |       0MiB /  46068MiB |      0%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -3920,13 +3928,13 @@ Cell: nv | 0.20s
 ▼ output
  ▶ uv-logs
  | 
-Cell: benchmark | 7.90s
+Cell: benchmark | 4.45s
  | 
 
 Raw
 GitHub
 
-
+
# /// script
 # requires-python = ">=3.10"
@@ -3974,6 +3982,7 @@ Cell: benchmark | 7.90s
     impl_name="hf_kernels_rotary",
     impl_tags={"family": "hf-kernels", "backend": "cuda"},
     impl_func=hf_kernels_rotary,
+    dtype="float32",
 )
 
@@ -3989,23 +3998,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 433.056us 1833.74% 433.056us 433.056us 1 - hf_kernels_rotary 12.39% 257.808us 99.67% 2.073ms 2.073ms 0.000us 0.00% 24.832us 24.832us 1 - _rotary_dba7d1e::apply_rotary 2.75% 57.199us 5.11% 106.332us 17.722us 16.960us 71.82% 16.960us 2.827us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 16.960us 71.82% 16.960us 2.827us 6 - aten::clone 2.11% 43.871us 79.26% 1.649ms 274.763us 0.000us 0.00% 7.872us 1.312us 6 - aten::copy_ 2.19% 45.572us 74.13% 1.542ms 256.978us 6.656us 28.18% 7.872us 1.312us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.656us 28.18% 6.656us 1.109us 6 - Activity Buffer Request 68.36% 1.422ms 68.36% 1.422ms 1.422ms 1.216us 5.15% 1.216us 1.216us 1 - aten::empty_strided 3.02% 62.841us 3.02% 62.841us 10.473us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 3.58% 74.452us 3.58% 74.452us 12.409us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.28% 47.469us 2.90% 60.410us 5.034us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.62% 12.941us 0.62% 12.941us 1.078us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 2.36% 49.133us 2.36% 49.133us 8.189us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.33% 6.850us 0.33% 6.850us 6.850us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 415.167us 1791.98% 415.167us 415.167us 1 + hf_kernels_rotary 12.17% 252.218us 99.63% 2.065ms 2.065ms 0.000us 0.00% 24.448us 24.448us 1 + _rotary_dba7d1e::apply_rotary 2.75% 56.920us 5.09% 105.521us 17.587us 16.128us 69.61% 16.128us 2.688us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 16.128us 69.61% 16.128us 2.688us 6 + aten::clone 2.00% 41.539us 79.52% 1.648ms 274.716us 0.000us 0.00% 8.320us 1.387us 6 + aten::copy_ 1.86% 38.603us 74.72% 1.549ms 258.116us 7.040us 30.39% 8.320us 1.387us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.040us 30.39% 7.040us 1.173us 6 + Activity Buffer Request 69.22% 1.435ms 69.22% 1.435ms 1.435ms 1.280us 5.52% 1.280us 1.280us 1 + aten::empty_strided 2.80% 58.062us 2.80% 58.062us 9.677us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 3.64% 75.429us 3.64% 75.429us 12.571us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.13% 44.231us 2.84% 58.952us 4.913us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.71% 14.721us 0.71% 14.721us 1.227us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 2.34% 48.601us 2.34% 48.601us 8.100us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.37% 7.760us 0.37% 7.760us 7.760us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.080ms -Self CUDA time total: 23.616us +Self CPU time total: 2.073ms +Self CUDA time total: 23.168us @@ -4015,23 +4024,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 368.319us 1559.68% 368.319us 368.319us 1 - hf_kernels_rotary 8.92% 167.782us 99.73% 1.876ms 1.876ms 0.000us 0.00% 24.767us 24.767us 1 - _rotary_dba7d1e::apply_rotary 2.34% 44.032us 4.50% 84.553us 14.092us 16.832us 71.28% 16.832us 2.805us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 16.832us 71.28% 16.832us 2.805us 6 - aten::clone 1.16% 21.840us 83.94% 1.579ms 263.113us 0.000us 0.00% 7.935us 1.322us 6 - aten::copy_ 2.86% 53.852us 81.07% 1.525ms 254.111us 6.783us 28.72% 7.935us 1.322us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.783us 28.72% 6.783us 1.130us 6 - Activity Buffer Request 75.10% 1.412ms 75.10% 1.412ms 1.412ms 1.152us 4.88% 1.152us 1.152us 1 - aten::empty_strided 1.71% 32.171us 1.71% 32.171us 5.362us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 3.11% 58.461us 3.11% 58.461us 9.744us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.82% 34.274us 2.37% 44.512us 3.709us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.54% 10.238us 0.54% 10.238us 0.853us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 2.15% 40.521us 2.15% 40.521us 6.753us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.27% 5.140us 0.27% 5.140us 5.140us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 341.917us 1420.81% 341.917us 341.917us 1 + hf_kernels_rotary 9.02% 171.465us 99.73% 1.896ms 1.896ms 0.000us 0.00% 25.377us 25.377us 1 + _rotary_dba7d1e::apply_rotary 2.21% 42.031us 4.55% 86.422us 14.404us 16.192us 67.28% 16.192us 2.699us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 16.192us 67.28% 16.192us 2.699us 6 + aten::clone 1.19% 22.618us 84.01% 1.597ms 266.223us 0.000us 0.00% 9.185us 1.531us 6 + aten::copy_ 2.09% 39.723us 81.08% 1.542ms 256.918us 7.873us 32.72% 9.185us 1.531us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.873us 32.72% 7.873us 1.312us 6 + Activity Buffer Request 76.21% 1.449ms 76.21% 1.449ms 1.449ms 1.312us 5.45% 1.312us 1.312us 1 + aten::empty_strided 1.75% 33.211us 1.75% 33.211us 5.535us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 2.78% 52.791us 2.78% 52.791us 8.798us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.69% 32.169us 2.16% 40.981us 3.415us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.46% 8.812us 0.46% 8.812us 0.734us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 2.33% 44.391us 2.33% 44.391us 7.399us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.27% 5.060us 0.27% 5.060us 5.060us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.881ms -Self CUDA time total: 23.615us +Self CPU time total: 1.901ms +Self CUDA time total: 24.065us @@ -4041,23 +4050,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 346.939us 1384.60% 346.939us 346.939us 1 - hf_kernels_rotary 8.57% 160.653us 99.71% 1.870ms 1.870ms 0.000us 0.00% 26.369us 26.369us 1 - _rotary_dba7d1e::apply_rotary 2.32% 43.421us 4.67% 87.601us 14.600us 17.249us 68.84% 17.249us 2.875us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 17.249us 68.84% 17.249us 2.875us 6 - aten::clone 1.23% 23.032us 84.13% 1.577ms 262.912us 0.000us 0.00% 9.120us 1.520us 6 - aten::copy_ 1.94% 36.311us 81.17% 1.522ms 253.669us 7.808us 31.16% 9.120us 1.520us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.808us 31.16% 7.808us 1.301us 6 - Activity Buffer Request 76.42% 1.433ms 76.42% 1.433ms 1.433ms 1.312us 5.24% 1.312us 1.312us 1 - aten::empty_strided 1.73% 32.420us 1.73% 32.420us 5.403us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 2.81% 52.730us 2.81% 52.730us 8.788us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.83% 34.233us 2.34% 43.964us 3.664us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.52% 9.731us 0.52% 9.731us 0.811us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 2.36% 44.180us 2.36% 44.180us 7.363us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.29% 5.410us 0.29% 5.410us 5.410us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 339.260us 1398.72% 339.260us 339.260us 1 + hf_kernels_rotary 9.18% 174.993us 99.74% 1.901ms 1.901ms 0.000us 0.00% 25.567us 25.567us 1 + _rotary_dba7d1e::apply_rotary 2.30% 43.881us 4.51% 86.021us 14.337us 16.479us 67.94% 16.479us 2.746us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 16.479us 67.94% 16.479us 2.746us 6 + aten::clone 1.43% 27.180us 83.89% 1.599ms 266.516us 0.000us 0.00% 9.088us 1.515us 6 + aten::copy_ 2.04% 38.899us 80.70% 1.538ms 256.369us 7.776us 32.06% 9.088us 1.515us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.776us 32.06% 7.776us 1.296us 6 + Activity Buffer Request 75.82% 1.445ms 75.82% 1.445ms 1.445ms 1.312us 5.41% 1.312us 1.312us 1 + aten::empty_strided 1.77% 33.702us 1.77% 33.702us 5.617us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 2.83% 54.013us 2.83% 54.013us 9.002us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.70% 32.344us 2.15% 41.003us 3.417us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.45% 8.659us 0.45% 8.659us 0.722us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 2.21% 42.140us 2.21% 42.140us 7.023us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.26% 5.011us 0.26% 5.011us 5.011us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.875ms -Self CUDA time total: 25.057us +Self CPU time total: 1.906ms +Self CUDA time total: 24.255us @@ -4067,23 +4076,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 347.904us 1355.61% 347.904us 347.904us 1 - hf_kernels_rotary 7.92% 162.592us 99.76% 2.047ms 2.047ms 0.000us 0.00% 27.009us 27.009us 1 - _rotary_dba7d1e::apply_rotary 2.09% 42.932us 4.15% 85.134us 14.189us 17.951us 69.95% 17.951us 2.992us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 17.951us 69.95% 17.951us 2.992us 6 - aten::clone 1.22% 25.009us 85.61% 1.757ms 292.750us 0.000us 0.00% 9.058us 1.510us 6 - aten::copy_ 1.81% 37.091us 82.80% 1.699ms 283.112us 7.713us 30.05% 9.058us 1.510us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.713us 30.05% 7.713us 1.285us 6 - Activity Buffer Request 69.84% 1.433ms 69.84% 1.433ms 1.433ms 1.345us 5.24% 1.345us 1.345us 1 - aten::empty_strided 1.60% 32.820us 1.60% 32.820us 5.470us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 11.14% 228.627us 11.14% 228.627us 38.104us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.59% 32.701us 2.07% 42.551us 3.546us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.48% 9.850us 0.48% 9.850us 0.821us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 2.06% 42.202us 2.06% 42.202us 7.034us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.24% 4.861us 0.24% 4.861us 4.861us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 337.722us 1204.77% 337.722us 337.722us 1 + hf_kernels_rotary 8.26% 171.103us 99.73% 2.067ms 2.067ms 0.000us 0.00% 29.792us 29.792us 1 + _rotary_dba7d1e::apply_rotary 1.99% 41.331us 4.00% 82.932us 13.822us 17.632us 62.90% 17.632us 2.939us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 17.632us 62.90% 17.632us 2.939us 6 + aten::clone 1.32% 27.454us 85.43% 1.770ms 295.062us 0.000us 0.00% 12.160us 2.027us 6 + aten::copy_ 1.75% 36.211us 82.55% 1.711ms 285.086us 10.400us 37.10% 12.160us 2.027us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.400us 37.10% 10.400us 1.733us 6 + Activity Buffer Request 68.85% 1.427ms 68.85% 1.427ms 1.427ms 1.760us 6.28% 1.760us 1.760us 1 + aten::empty_strided 1.56% 32.399us 1.56% 32.399us 5.400us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 11.95% 247.595us 11.95% 247.595us 41.266us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.60% 33.121us 2.04% 42.171us 3.514us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.44% 9.050us 0.44% 9.050us 0.754us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 2.01% 41.601us 2.01% 41.601us 6.933us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.27% 5.640us 0.27% 5.640us 5.640us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.052ms -Self CUDA time total: 25.664us +Self CPU time total: 2.072ms +Self CUDA time total: 28.032us @@ -4093,23 +4102,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 356.192us 1425.17% 356.192us 356.192us 1 - hf_kernels_rotary 9.03% 181.778us 99.74% 2.009ms 2.009ms 0.000us 0.00% 26.306us 26.306us 1 - _rotary_dba7d1e::apply_rotary 2.18% 43.970us 4.25% 85.660us 14.277us 17.088us 68.37% 17.088us 2.848us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 17.088us 68.37% 17.088us 2.848us 6 - aten::clone 1.16% 23.451us 84.31% 1.698ms 283.035us 0.000us 0.00% 9.218us 1.536us 6 - aten::copy_ 1.79% 36.151us 81.55% 1.643ms 273.753us 7.905us 31.63% 9.218us 1.536us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.905us 31.63% 7.905us 1.318us 6 - Activity Buffer Request 70.14% 1.413ms 70.14% 1.413ms 1.413ms 1.313us 5.25% 1.313us 1.313us 1 - aten::empty_strided 1.60% 32.242us 1.60% 32.242us 5.374us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 9.61% 193.593us 9.61% 193.593us 32.266us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.67% 33.621us 2.15% 43.371us 3.614us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.48% 9.750us 0.48% 9.750us 0.812us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 2.07% 41.690us 2.07% 41.690us 6.948us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.26% 5.140us 0.26% 5.140us 5.140us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 338.845us 1402.50% 338.845us 338.845us 1 + hf_kernels_rotary 8.30% 171.257us 99.77% 2.058ms 2.058ms 0.000us 0.00% 25.440us 25.440us 1 + _rotary_dba7d1e::apply_rotary 2.01% 41.399us 4.04% 83.350us 13.892us 16.448us 68.08% 16.448us 2.741us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 16.448us 68.08% 16.448us 2.741us 6 + aten::clone 1.39% 28.751us 85.42% 1.762ms 293.702us 0.000us 0.00% 8.992us 1.499us 6 + aten::copy_ 1.85% 38.232us 82.52% 1.702ms 283.730us 7.712us 31.92% 8.992us 1.499us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.712us 31.92% 7.712us 1.285us 6 + Activity Buffer Request 69.46% 1.433ms 69.46% 1.433ms 1.433ms 1.280us 5.30% 1.280us 1.280us 1 + aten::empty_strided 1.51% 31.081us 1.51% 31.081us 5.180us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 11.21% 231.173us 11.21% 231.173us 38.529us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.58% 32.650us 2.01% 41.461us 3.455us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.43% 8.811us 0.43% 8.811us 0.734us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 2.03% 41.951us 2.03% 41.951us 6.992us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.23% 4.720us 0.23% 4.720us 4.720us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.014ms -Self CUDA time total: 24.993us +Self CPU time total: 2.063ms +Self CUDA time total: 24.160us @@ -4119,23 +4128,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 345.469us 1341.21% 345.469us 345.469us 1 - hf_kernels_rotary 8.14% 161.605us 99.74% 1.979ms 1.979ms 0.000us 0.00% 27.070us 27.070us 1 - _rotary_dba7d1e::apply_rotary 2.10% 41.690us 4.19% 83.112us 13.852us 17.982us 69.81% 17.982us 2.997us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 17.982us 69.81% 17.982us 2.997us 6 - aten::clone 1.15% 22.842us 85.12% 1.689ms 281.515us 0.000us 0.00% 9.088us 1.515us 6 - aten::copy_ 1.84% 36.466us 82.36% 1.634ms 272.405us 7.776us 30.19% 9.088us 1.515us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.776us 30.19% 7.776us 1.296us 6 - Activity Buffer Request 71.40% 1.417ms 71.40% 1.417ms 1.417ms 1.312us 5.09% 1.312us 1.312us 1 - aten::empty_strided 1.60% 31.821us 1.60% 31.821us 5.303us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 9.12% 181.057us 9.12% 181.057us 30.176us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.80% 35.740us 2.29% 45.520us 3.793us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.49% 9.780us 0.49% 9.780us 0.815us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 2.09% 41.422us 2.09% 41.422us 6.904us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.26% 5.151us 0.26% 5.151us 5.151us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 335.067us 1195.26% 335.067us 335.067us 1 + hf_kernels_rotary 20.13% 167.343us 99.42% 826.269us 826.269us 0.000us 0.00% 29.857us 29.857us 1 + _rotary_dba7d1e::apply_rotary 5.16% 42.850us 10.17% 84.521us 14.087us 17.537us 62.56% 17.537us 2.923us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 17.537us 62.56% 17.537us 2.923us 6 + aten::clone 2.67% 22.181us 64.36% 534.923us 89.154us 0.000us 0.00% 12.320us 2.053us 6 + aten::copy_ 4.47% 37.140us 57.76% 480.051us 80.008us 10.496us 37.44% 12.320us 2.053us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.496us 37.44% 10.496us 1.749us 6 + Activity Buffer Request 26.04% 216.435us 26.04% 216.435us 216.435us 1.824us 6.51% 1.824us 1.824us 1 + aten::empty_strided 3.93% 32.691us 3.93% 32.691us 5.448us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 27.25% 226.476us 27.25% 226.476us 37.746us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.75% 31.143us 4.75% 39.482us 3.290us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 1.00% 8.339us 1.00% 8.339us 0.695us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 5.01% 41.671us 5.01% 41.671us 6.945us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.58% 4.851us 0.58% 4.851us 4.851us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.984ms -Self CUDA time total: 25.758us +Self CPU time total: 831.120us +Self CUDA time total: 28.033us @@ -4145,23 +4154,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 370.847us 1148.52% 370.847us 370.847us 1 - hf_kernels_rotary 8.48% 171.185us 99.77% 2.015ms 2.015ms 0.000us 0.00% 34.081us 34.081us 1 - _rotary_dba7d1e::apply_rotary 2.32% 46.763us 4.49% 90.723us 15.120us 21.793us 67.49% 21.793us 3.632us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 21.793us 67.49% 21.793us 3.632us 6 - aten::clone 1.25% 25.309us 84.59% 1.708ms 284.718us 0.000us 0.00% 12.288us 2.048us 6 - aten::copy_ 1.96% 39.631us 81.62% 1.648ms 274.723us 10.496us 32.51% 12.288us 2.048us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.496us 32.51% 10.496us 1.749us 6 - Activity Buffer Request 70.18% 1.417ms 70.18% 1.417ms 1.417ms 1.792us 5.55% 1.792us 1.792us 1 - aten::empty_strided 1.72% 34.661us 1.72% 34.661us 5.777us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 9.48% 191.424us 9.48% 191.424us 31.904us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.73% 34.932us 2.22% 44.771us 3.731us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.49% 9.839us 0.49% 9.839us 0.820us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 2.18% 43.960us 2.18% 43.960us 7.327us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.23% 4.601us 0.23% 4.601us 4.601us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 332.760us 825.36% 332.760us 332.760us 1 + hf_kernels_rotary 19.35% 167.193us 99.38% 858.880us 858.880us 0.000us 0.00% 43.165us 43.165us 1 + _rotary_dba7d1e::apply_rotary 4.67% 40.341us 9.39% 81.181us 13.530us 23.229us 57.62% 23.229us 3.871us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 23.229us 57.62% 23.229us 3.871us 6 + aten::clone 2.64% 22.801us 65.85% 569.083us 94.847us 0.000us 0.00% 19.936us 3.323us 6 + aten::copy_ 4.30% 37.172us 59.60% 515.092us 85.849us 17.088us 42.38% 19.936us 3.323us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.088us 42.38% 17.088us 2.848us 6 + Activity Buffer Request 29.73% 256.965us 29.73% 256.965us 256.965us 2.848us 7.06% 2.848us 2.848us 1 + aten::empty_strided 3.61% 31.190us 3.61% 31.190us 5.198us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 25.57% 220.955us 25.57% 220.955us 36.826us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.76% 32.492us 4.79% 41.423us 3.452us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 1.03% 8.931us 1.03% 8.931us 0.744us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 4.73% 40.840us 4.73% 40.840us 6.807us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.62% 5.380us 0.62% 5.380us 5.380us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.020ms -Self CUDA time total: 32.289us +Self CPU time total: 864.260us +Self CUDA time total: 40.317us @@ -4171,23 +4180,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 345.533us 668.21% 345.533us 345.533us 1 - hf_kernels_rotary 8.13% 161.677us 99.76% 1.983ms 1.983ms 0.000us 0.00% 54.558us 54.558us 1 - _rotary_dba7d1e::apply_rotary 2.15% 42.810us 4.29% 85.240us 14.207us 34.782us 67.26% 34.782us 5.797us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 34.782us 67.26% 34.782us 5.797us 6 - aten::clone 1.16% 23.089us 85.02% 1.690ms 281.665us 0.000us 0.00% 19.776us 3.296us 6 - aten::copy_ 1.78% 35.482us 82.32% 1.636ms 272.722us 16.928us 32.74% 19.776us 3.296us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.928us 32.74% 16.928us 2.821us 6 - Activity Buffer Request 71.53% 1.422ms 71.53% 1.422ms 1.422ms 2.848us 5.51% 2.848us 2.848us 1 - aten::empty_strided 1.54% 30.571us 1.54% 30.571us 5.095us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 9.00% 178.904us 9.00% 178.904us 29.817us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.84% 36.581us 2.32% 46.051us 3.838us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.48% 9.470us 0.48% 9.470us 0.789us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 2.13% 42.430us 2.13% 42.430us 7.072us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.24% 4.870us 0.24% 4.870us 4.870us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 353.785us 452.38% 353.785us 353.785us 1 + hf_kernels_rotary 18.85% 162.244us 99.38% 855.480us 855.480us 0.000us 0.00% 90.716us 90.716us 1 + aten::clone 2.60% 22.411us 65.93% 567.532us 94.589us 0.000us 0.00% 52.253us 8.709us 6 + aten::copy_ 4.73% 40.709us 58.92% 507.190us 84.532us 39.742us 50.82% 52.253us 8.709us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 39.742us 50.82% 39.742us 6.624us 6 + _rotary_dba7d1e::apply_rotary 4.83% 41.551us 9.72% 83.643us 13.941us 38.463us 49.18% 38.463us 6.410us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 38.463us 49.18% 38.463us 6.410us 6 + Activity Buffer Request 28.85% 248.356us 28.85% 248.356us 248.356us 12.511us 16.00% 12.511us 12.511us 1 + aten::empty_strided 4.41% 37.931us 4.41% 37.931us 6.322us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 25.34% 218.125us 25.34% 218.125us 36.354us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.86% 33.191us 4.89% 42.061us 3.505us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 1.03% 8.870us 1.03% 8.870us 0.739us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 4.89% 42.092us 4.89% 42.092us 7.015us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.62% 5.350us 0.62% 5.350us 5.350us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.988ms -Self CUDA time total: 51.710us +Self CPU time total: 860.830us +Self CUDA time total: 78.205us @@ -4197,23 +4206,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 338.136us 1047.28% 338.136us 338.136us 1 - hf_kernels_rotary 19.11% 157.801us 99.43% 820.869us 820.869us 0.000us 0.00% 34.078us 34.078us 1 - _rotary_dba7d1e::apply_rotary 5.12% 42.269us 10.18% 84.080us 14.013us 21.792us 67.49% 21.792us 3.632us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 21.792us 67.49% 21.792us 3.632us 6 - aten::clone 2.56% 21.133us 65.13% 537.684us 89.614us 0.000us 0.00% 12.286us 2.048us 6 - aten::copy_ 4.56% 37.650us 58.77% 485.172us 80.862us 10.495us 32.51% 12.286us 2.048us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.495us 32.51% 10.495us 1.749us 6 - Activity Buffer Request 32.51% 268.347us 32.51% 268.347us 268.347us 1.791us 5.55% 1.791us 1.791us 1 - aten::empty_strided 3.80% 31.379us 3.80% 31.379us 5.230us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 21.70% 179.175us 21.70% 179.175us 29.862us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 3.93% 32.405us 5.00% 41.304us 3.442us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.08% 8.899us 1.08% 8.899us 0.742us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 5.06% 41.811us 5.06% 41.811us 6.969us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.57% 4.680us 0.57% 4.680us 4.680us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 337.342us 833.35% 337.342us 337.342us 1 + hf_kernels_rotary 8.49% 173.955us 99.77% 2.043ms 2.043ms 0.000us 0.00% 43.328us 43.328us 1 + _rotary_dba7d1e::apply_rotary 2.03% 41.590us 4.02% 82.231us 13.705us 23.487us 58.02% 23.487us 3.915us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 23.487us 58.02% 23.487us 3.915us 6 + aten::clone 1.34% 27.379us 85.23% 1.745ms 290.890us 0.000us 0.00% 19.841us 3.307us 6 + aten::copy_ 1.78% 36.424us 82.41% 1.688ms 281.287us 16.993us 41.98% 19.841us 3.307us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.993us 41.98% 16.993us 2.832us 6 + Activity Buffer Request 70.11% 1.436ms 70.11% 1.436ms 1.436ms 2.848us 7.04% 2.848us 2.848us 1 + aten::empty_strided 1.48% 30.241us 1.48% 30.241us 5.040us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 10.52% 215.434us 10.52% 215.434us 35.906us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.62% 33.159us 2.03% 41.651us 3.471us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.41% 8.492us 0.41% 8.492us 0.708us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 1.98% 40.641us 1.98% 40.641us 6.773us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.23% 4.700us 0.23% 4.700us 4.700us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 825.549us -Self CUDA time total: 32.287us +Self CPU time total: 2.048ms +Self CUDA time total: 40.480us @@ -4223,23 +4232,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 347.832us 672.66% 347.832us 347.832us 1 - hf_kernels_rotary 18.98% 156.996us 99.42% 822.501us 822.501us 0.000us 0.00% 54.558us 54.558us 1 - _rotary_dba7d1e::apply_rotary 5.15% 42.621us 10.22% 84.512us 14.085us 34.783us 67.27% 34.783us 5.797us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 34.783us 67.27% 34.783us 5.797us 6 - aten::clone 2.65% 21.930us 64.92% 537.102us 89.517us 0.000us 0.00% 19.775us 3.296us 6 - aten::copy_ 4.53% 37.450us 58.33% 482.542us 80.424us 16.927us 32.73% 19.775us 3.296us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.927us 32.73% 16.927us 2.821us 6 - Activity Buffer Request 32.06% 265.247us 32.06% 265.247us 265.247us 2.848us 5.51% 2.848us 2.848us 1 - aten::empty_strided 3.94% 32.630us 3.94% 32.630us 5.438us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 21.74% 179.845us 21.74% 179.845us 29.974us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.14% 34.239us 5.31% 43.891us 3.658us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.17% 9.652us 1.17% 9.652us 0.804us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 5.06% 41.891us 5.06% 41.891us 6.982us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.58% 4.770us 0.58% 4.770us 4.770us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 367.387us 482.60% 367.387us 367.387us 1 + hf_kernels_rotary 8.43% 173.690us 99.77% 2.056ms 2.056ms 0.000us 0.00% 86.687us 86.687us 1 + aten::clone 1.25% 25.689us 84.25% 1.736ms 289.338us 0.000us 0.00% 47.648us 7.941us 6 + aten::copy_ 1.77% 36.381us 81.42% 1.678ms 279.615us 37.088us 48.72% 47.648us 7.941us 6 + _rotary_dba7d1e::apply_rotary 2.83% 58.403us 5.00% 103.123us 17.187us 39.039us 51.28% 39.039us 6.506us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 39.039us 51.28% 39.039us 6.506us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 37.088us 48.72% 37.088us 6.181us 6 + Activity Buffer Request 69.27% 1.427ms 69.27% 1.427ms 1.427ms 10.560us 13.87% 10.560us 10.560us 1 + aten::empty_strided 1.58% 32.653us 1.58% 32.653us 5.442us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 10.38% 213.985us 10.38% 213.985us 35.664us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.64% 33.864us 2.09% 42.973us 3.581us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.44% 9.109us 0.44% 9.109us 0.759us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 2.17% 44.720us 2.17% 44.720us 7.453us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.23% 4.800us 0.23% 4.800us 4.800us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 827.271us -Self CUDA time total: 51.710us +Self CPU time total: 2.061ms +Self CUDA time total: 76.127us @@ -4249,23 +4258,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 352.413us 323.34% 352.413us 352.413us 1 - hf_kernels_rotary 18.38% 152.793us 99.44% 826.801us 826.801us 0.000us 0.00% 127.423us 127.423us 1 - aten::clone 2.64% 21.959us 64.91% 539.754us 89.959us 0.000us 0.00% 69.984us 11.664us 6 - aten::copy_ 4.48% 37.251us 58.50% 486.434us 81.072us 51.552us 47.30% 69.984us 11.664us 6 - _rotary_dba7d1e::apply_rotary 5.35% 44.522us 10.55% 87.704us 14.617us 57.439us 52.70% 57.439us 9.573us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 57.439us 52.70% 57.439us 9.573us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 51.552us 47.30% 51.552us 8.592us 6 - Activity Buffer Request 32.52% 270.437us 32.52% 270.437us 270.437us 18.432us 16.91% 18.432us 18.432us 1 - aten::empty_strided 3.77% 31.361us 3.77% 31.361us 5.227us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 21.50% 178.746us 21.50% 178.746us 29.791us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.45% 36.960us 5.60% 46.550us 3.879us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.15% 9.590us 1.15% 9.590us 0.799us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 5.19% 43.182us 5.19% 43.182us 7.197us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.56% 4.690us 0.56% 4.690us 4.690us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 350.717us 252.54% 350.717us 350.717us 1 + hf_kernels_rotary 8.20% 168.514us 99.75% 2.049ms 2.049ms 0.000us 0.00% 162.494us 162.494us 1 + aten::clone 1.38% 28.280us 85.37% 1.754ms 292.317us 0.000us 0.00% 102.494us 17.082us 6 + aten::copy_ 1.89% 38.810us 82.42% 1.693ms 282.225us 78.878us 56.80% 102.494us 17.082us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 78.878us 56.80% 78.878us 13.146us 6 + _rotary_dba7d1e::apply_rotary 2.03% 41.642us 4.17% 85.643us 14.274us 60.000us 43.20% 60.000us 10.000us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 60.000us 43.20% 60.000us 10.000us 6 + Activity Buffer Request 70.32% 1.445ms 70.32% 1.445ms 1.445ms 23.616us 17.00% 23.616us 23.616us 1 + aten::empty_strided 1.57% 32.271us 1.57% 32.271us 5.379us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 10.22% 209.905us 10.22% 209.905us 34.984us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.59% 32.591us 2.01% 41.291us 3.441us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.42% 8.700us 0.42% 8.700us 0.725us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 2.14% 44.001us 2.14% 44.001us 7.333us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.25% 5.120us 0.25% 5.120us 5.120us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 831.491us -Self CUDA time total: 108.991us +Self CPU time total: 2.054ms +Self CUDA time total: 138.878us @@ -4275,23 +4284,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 354.429us 196.77% 354.429us 354.429us 1 - hf_kernels_rotary 18.96% 156.272us 99.48% 819.980us 819.980us 0.000us 0.00% 203.900us 203.900us 1 - aten::clone 2.73% 22.479us 64.84% 534.473us 89.079us 0.000us 0.00% 102.557us 17.093us 6 - aten::copy_ 4.31% 35.551us 58.35% 480.933us 80.156us 78.782us 43.74% 102.557us 17.093us 6 - _rotary_dba7d1e::apply_rotary 5.14% 42.393us 10.35% 85.274us 14.212us 101.343us 56.26% 101.343us 16.890us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 101.343us 56.26% 101.343us 16.890us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 78.782us 43.74% 78.782us 13.130us 6 - Activity Buffer Request 32.52% 268.027us 32.52% 268.027us 268.027us 23.775us 13.20% 23.775us 23.775us 1 - aten::empty_strided 3.77% 31.061us 3.77% 31.061us 5.177us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 21.52% 177.355us 21.52% 177.355us 29.559us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.12% 33.982us 5.33% 43.961us 3.663us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.21% 9.979us 1.21% 9.979us 0.832us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 5.20% 42.881us 5.20% 42.881us 7.147us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.52% 4.300us 0.52% 4.300us 4.300us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 7.33% 173.322us 87.05% 2.058ms 2.058ms 0.000us 0.00% 773.117us 773.117us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 713.886us 101.15% 713.886us 713.886us 1 + aten::clone 1.12% 26.510us 74.03% 1.750ms 291.637us 0.000us 0.00% 574.975us 95.829us 6 + aten::copy_ 1.66% 39.271us 70.83% 1.674ms 279.060us 507.647us 71.93% 574.975us 95.829us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 507.647us 71.93% 507.647us 84.608us 6 + _rotary_dba7d1e::apply_rotary 1.93% 45.683us 3.90% 92.264us 15.377us 198.142us 28.07% 198.142us 33.024us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 198.142us 28.07% 198.142us 33.024us 6 + Activity Buffer Request 60.04% 1.419ms 60.04% 1.419ms 1.419ms 67.328us 9.54% 67.328us 67.328us 1 + aten::empty_strided 2.07% 48.953us 2.07% 48.953us 8.159us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 9.13% 215.804us 9.13% 215.804us 35.967us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.42% 33.620us 1.79% 42.281us 3.523us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.37% 8.661us 0.37% 8.661us 0.722us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 1.97% 46.581us 1.97% 46.581us 7.764us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 12.95% 306.087us 12.95% 306.087us 306.087us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 824.280us -Self CUDA time total: 180.125us +Self CPU time total: 2.364ms +Self CUDA time total: 705.789us @@ -4301,23 +4310,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 338.587us 1293.50% 338.587us 338.587us 1 - hf_kernels_rotary 19.34% 157.366us 99.42% 808.960us 808.960us 0.000us 0.00% 27.296us 27.296us 1 - _rotary_dba7d1e::apply_rotary 5.26% 42.761us 10.55% 85.842us 14.307us 19.392us 74.08% 19.392us 3.232us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 19.392us 74.08% 19.392us 3.232us 6 - aten::clone 2.60% 21.121us 64.41% 524.052us 87.342us 0.000us 0.00% 7.904us 1.317us 6 - aten::copy_ 4.60% 37.442us 58.06% 472.441us 78.740us 6.784us 25.92% 7.904us 1.317us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.784us 25.92% 6.784us 1.131us 6 - Activity Buffer Request 31.61% 257.196us 31.61% 257.196us 257.196us 1.120us 4.28% 1.120us 1.120us 1 - aten::empty_strided 3.75% 30.490us 3.75% 30.490us 5.082us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 21.85% 177.803us 21.85% 177.803us 29.634us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 3.95% 32.140us 5.12% 41.700us 3.475us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.17% 9.560us 1.17% 9.560us 0.797us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 5.29% 43.081us 5.29% 43.081us 7.180us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.58% 4.711us 0.58% 4.711us 4.711us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 345.724us 1295.48% 345.724us 345.724us 1 + hf_kernels_rotary 8.56% 176.117us 99.77% 2.053ms 2.053ms 0.000us 0.00% 27.999us 27.999us 1 + _rotary_dba7d1e::apply_rotary 2.01% 41.279us 4.04% 83.070us 13.845us 18.817us 70.51% 18.817us 3.136us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 18.817us 70.51% 18.817us 3.136us 6 + aten::clone 1.59% 32.672us 85.14% 1.752ms 292.043us 0.000us 0.00% 9.182us 1.530us 6 + aten::copy_ 1.83% 37.700us 81.90% 1.685ms 280.910us 7.870us 29.49% 9.182us 1.530us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.870us 29.49% 7.870us 1.312us 6 + Activity Buffer Request 70.01% 1.441ms 70.01% 1.441ms 1.441ms 1.312us 4.92% 1.312us 1.312us 1 + aten::empty_strided 1.66% 34.130us 1.66% 34.130us 5.688us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 10.06% 206.965us 10.06% 206.965us 34.494us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.59% 32.681us 2.04% 41.911us 3.493us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.45% 9.230us 0.45% 9.230us 0.769us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 2.03% 41.791us 2.03% 41.791us 6.965us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.23% 4.640us 0.23% 4.640us 4.640us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 813.671us -Self CUDA time total: 26.176us +Self CPU time total: 2.058ms +Self CUDA time total: 26.687us @@ -4327,23 +4336,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 349.862us 1278.50% 349.862us 349.862us 1 - hf_kernels_rotary 19.32% 156.134us 99.42% 803.460us 803.460us 0.000us 0.00% 28.709us 28.709us 1 - _rotary_dba7d1e::apply_rotary 5.33% 43.099us 10.84% 87.643us 14.607us 19.428us 71.00% 19.428us 3.238us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 19.428us 71.00% 19.428us 3.238us 6 - aten::clone 2.80% 22.600us 63.71% 514.893us 85.816us 0.000us 0.00% 9.281us 1.547us 6 - aten::copy_ 4.89% 39.481us 56.99% 460.582us 76.764us 7.937us 29.00% 9.281us 1.547us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.937us 29.00% 7.937us 1.323us 6 - Activity Buffer Request 27.85% 225.076us 27.85% 225.076us 225.076us 1.344us 4.91% 1.344us 1.344us 1 - aten::empty_strided 3.92% 31.711us 3.92% 31.711us 5.285us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 24.26% 196.025us 24.26% 196.025us 32.671us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.38% 35.400us 5.54% 44.790us 3.732us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.16% 9.390us 1.16% 9.390us 0.782us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 5.51% 44.544us 5.51% 44.544us 7.424us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.58% 4.720us 0.58% 4.720us 4.720us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 334.455us 1253.30% 334.455us 334.455us 1 + hf_kernels_rotary 18.38% 152.071us 99.44% 822.719us 822.719us 0.000us 0.00% 27.966us 27.966us 1 + _rotary_dba7d1e::apply_rotary 5.29% 43.741us 10.38% 85.902us 14.317us 18.975us 71.10% 18.975us 3.163us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 18.975us 71.10% 18.975us 3.163us 6 + aten::clone 2.47% 20.399us 65.75% 544.023us 90.670us 0.000us 0.00% 8.991us 1.498us 6 + aten::copy_ 4.79% 39.600us 59.70% 493.952us 82.325us 7.711us 28.90% 8.991us 1.498us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.711us 28.90% 7.711us 1.285us 6 + Activity Buffer Request 30.27% 250.456us 30.27% 250.456us 250.456us 1.280us 4.80% 1.280us 1.280us 1 + aten::empty_strided 3.59% 29.672us 3.59% 29.672us 4.945us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 24.64% 203.896us 24.64% 203.896us 33.983us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.84% 31.802us 4.92% 40.723us 3.394us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 1.08% 8.921us 1.08% 8.921us 0.743us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 5.10% 42.161us 5.10% 42.161us 7.027us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.56% 4.640us 0.56% 4.640us 4.640us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 808.180us -Self CUDA time total: 27.365us +Self CPU time total: 827.359us +Self CUDA time total: 26.686us @@ -4353,23 +4362,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 349.981us 1235.85% 349.981us 349.981us 1 - hf_kernels_rotary 8.03% 161.215us 99.76% 2.003ms 2.003ms 0.000us 0.00% 29.663us 29.663us 1 - _rotary_dba7d1e::apply_rotary 2.11% 42.422us 4.23% 84.982us 14.164us 20.544us 72.54% 20.544us 3.424us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 20.544us 72.54% 20.544us 3.424us 6 - aten::clone 1.12% 22.572us 85.29% 1.712ms 285.349us 0.000us 0.00% 9.119us 1.520us 6 - aten::copy_ 1.91% 38.260us 82.54% 1.657ms 276.143us 7.775us 27.46% 9.119us 1.520us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.775us 27.46% 7.775us 1.296us 6 - Activity Buffer Request 71.67% 1.439ms 71.67% 1.439ms 1.439ms 1.344us 4.75% 1.344us 1.344us 1 - aten::empty_strided 1.63% 32.660us 1.63% 32.660us 5.443us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 8.96% 179.936us 8.96% 179.936us 29.989us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.74% 34.910us 2.20% 44.250us 3.688us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.47% 9.340us 0.47% 9.340us 0.778us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 2.12% 42.560us 2.12% 42.560us 7.093us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.24% 4.741us 0.24% 4.741us 4.741us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 335.033us 1090.71% 335.033us 335.033us 1 + hf_kernels_rotary 18.88% 152.276us 99.36% 801.289us 801.289us 0.000us 0.00% 32.445us 32.445us 1 + _rotary_dba7d1e::apply_rotary 5.08% 40.990us 10.38% 83.672us 13.945us 20.127us 65.52% 20.127us 3.354us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 20.127us 65.52% 20.127us 3.354us 6 + aten::clone 2.64% 21.299us 65.14% 525.331us 87.555us 0.000us 0.00% 12.318us 2.053us 6 + aten::copy_ 5.22% 42.109us 58.69% 473.291us 78.882us 10.590us 34.48% 12.318us 2.053us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.590us 34.48% 10.590us 1.765us 6 + Activity Buffer Request 26.39% 212.815us 26.39% 212.815us 212.815us 1.728us 5.63% 1.728us 1.728us 1 + aten::empty_strided 3.81% 30.741us 3.81% 30.741us 5.123us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 27.08% 218.367us 27.08% 218.367us 36.394us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.88% 31.271us 4.96% 40.010us 3.334us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 1.08% 8.739us 1.08% 8.739us 0.728us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 5.29% 42.682us 5.29% 42.682us 7.114us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.64% 5.180us 0.64% 5.180us 5.180us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.007ms -Self CUDA time total: 28.319us +Self CPU time total: 806.469us +Self CUDA time total: 30.717us @@ -4379,23 +4388,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 346.238us 971.27% 346.238us 346.238us 1 - hf_kernels_rotary 8.04% 160.124us 99.76% 1.988ms 1.988ms 0.000us 0.00% 37.440us 37.440us 1 - _rotary_dba7d1e::apply_rotary 2.20% 43.921us 4.24% 84.493us 14.082us 25.216us 70.74% 25.216us 4.203us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 25.216us 70.74% 25.216us 4.203us 6 - aten::clone 1.14% 22.762us 85.30% 1.700ms 283.325us 0.000us 0.00% 12.224us 2.037us 6 - aten::copy_ 1.84% 36.620us 82.53% 1.645ms 274.105us 10.432us 29.26% 12.224us 2.037us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.432us 29.26% 10.432us 1.739us 6 - Activity Buffer Request 71.70% 1.429ms 71.70% 1.429ms 1.429ms 1.792us 5.03% 1.792us 1.792us 1 - aten::empty_strided 1.63% 32.561us 1.63% 32.561us 5.427us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 8.99% 179.114us 8.99% 179.114us 29.852us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.72% 34.250us 2.18% 43.390us 3.616us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.46% 9.140us 0.46% 9.140us 0.762us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 2.04% 40.572us 2.04% 40.572us 6.762us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.24% 4.860us 0.24% 4.860us 4.860us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 336.501us 787.14% 336.501us 336.501us 1 + hf_kernels_rotary 18.58% 152.224us 99.36% 814.019us 814.019us 0.000us 0.00% 45.662us 45.662us 1 + _rotary_dba7d1e::apply_rotary 5.02% 41.151us 10.13% 82.992us 13.832us 25.695us 60.11% 25.695us 4.283us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 25.695us 60.11% 25.695us 4.283us 6 + aten::clone 2.59% 21.259us 65.61% 537.562us 89.594us 0.000us 0.00% 19.967us 3.328us 6 + aten::copy_ 4.69% 38.391us 59.23% 485.282us 80.880us 17.055us 39.89% 19.967us 3.328us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.055us 39.89% 17.055us 2.842us 6 + Activity Buffer Request 29.84% 244.476us 29.84% 244.476us 244.476us 2.912us 6.81% 2.912us 2.912us 1 + aten::empty_strided 3.79% 31.021us 3.79% 31.021us 5.170us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 24.71% 202.415us 24.71% 202.415us 33.736us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.95% 32.360us 5.03% 41.241us 3.437us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 1.08% 8.881us 1.08% 8.881us 0.740us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 5.11% 41.841us 5.11% 41.841us 6.973us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.64% 5.261us 0.64% 5.261us 5.261us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.993ms -Self CUDA time total: 35.648us +Self CPU time total: 819.280us +Self CUDA time total: 42.750us @@ -4405,23 +4414,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 347.675us 1229.10% 347.675us 347.675us 1 - hf_kernels_rotary 8.06% 160.274us 99.76% 1.984ms 1.984ms 0.000us 0.00% 29.631us 29.631us 1 - _rotary_dba7d1e::apply_rotary 2.18% 43.331us 4.28% 85.164us 14.194us 20.511us 72.51% 20.511us 3.418us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 20.511us 72.51% 20.511us 3.418us 6 - aten::clone 1.13% 22.531us 85.26% 1.696ms 282.610us 0.000us 0.00% 9.120us 1.520us 6 - aten::copy_ 1.97% 39.252us 82.52% 1.641ms 273.528us 7.776us 27.49% 9.120us 1.520us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.776us 27.49% 7.776us 1.296us 6 - Activity Buffer Request 71.58% 1.424ms 71.58% 1.424ms 1.424ms 1.344us 4.75% 1.344us 1.344us 1 - aten::empty_strided 1.61% 31.959us 1.61% 31.959us 5.326us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 8.97% 178.354us 8.97% 178.354us 29.726us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.68% 33.430us 2.16% 42.920us 3.577us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.48% 9.490us 0.48% 9.490us 0.791us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 2.10% 41.833us 2.10% 41.833us 6.972us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.24% 4.801us 0.24% 4.801us 4.801us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 330.306us 1080.84% 330.306us 330.306us 1 + hf_kernels_rotary 18.42% 149.681us 99.42% 807.979us 807.979us 0.000us 0.00% 32.321us 32.321us 1 + _rotary_dba7d1e::apply_rotary 5.10% 41.443us 10.19% 82.853us 13.809us 20.128us 65.86% 20.128us 3.355us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 20.128us 65.86% 20.128us 3.355us 6 + aten::clone 2.61% 21.203us 65.84% 535.084us 89.181us 0.000us 0.00% 12.193us 2.032us 6 + aten::copy_ 4.51% 36.639us 59.42% 482.940us 80.490us 10.432us 34.14% 12.193us 2.032us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.432us 34.14% 10.432us 1.739us 6 + Activity Buffer Request 29.60% 240.586us 29.60% 240.586us 240.586us 1.761us 5.76% 1.761us 1.761us 1 + aten::empty_strided 3.81% 30.941us 3.81% 30.941us 5.157us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 25.31% 205.715us 25.31% 205.715us 34.286us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.91% 31.761us 4.97% 40.361us 3.363us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 1.06% 8.600us 1.06% 8.600us 0.717us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 5.10% 41.410us 5.10% 41.410us 6.902us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.58% 4.711us 0.58% 4.711us 4.711us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.989ms -Self CUDA time total: 28.287us +Self CPU time total: 812.690us +Self CUDA time total: 30.560us @@ -4431,23 +4440,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 341.434us 959.52% 341.434us 341.434us 1 - hf_kernels_rotary 20.68% 156.375us 99.37% 751.248us 751.248us 0.000us 0.00% 37.312us 37.312us 1 - _rotary_dba7d1e::apply_rotary 5.66% 42.780us 11.14% 84.232us 14.039us 25.184us 70.77% 25.184us 4.197us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 25.184us 70.77% 25.184us 4.197us 6 - aten::clone 3.01% 22.779us 61.92% 468.081us 78.014us 0.000us 0.00% 12.128us 2.021us 6 - aten::copy_ 4.78% 36.161us 54.65% 413.150us 68.858us 10.400us 29.23% 12.128us 2.021us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.400us 29.23% 10.400us 1.733us 6 - Activity Buffer Request 26.22% 198.225us 26.22% 198.225us 198.225us 1.728us 4.86% 1.728us 1.728us 1 - aten::empty_strided 4.25% 32.152us 4.25% 32.152us 5.359us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 23.65% 178.764us 23.65% 178.764us 29.794us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.40% 33.290us 5.63% 42.560us 3.547us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.23% 9.270us 1.23% 9.270us 0.773us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 5.48% 41.452us 5.48% 41.452us 6.909us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.63% 4.741us 0.63% 4.741us 4.741us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 339.800us 797.80% 339.800us 339.800us 1 + hf_kernels_rotary 14.79% 151.874us 99.55% 1.022ms 1.022ms 0.000us 0.00% 45.440us 45.440us 1 + _rotary_dba7d1e::apply_rotary 4.15% 42.610us 8.29% 85.131us 14.188us 25.536us 59.95% 25.536us 4.256us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 25.536us 59.95% 25.536us 4.256us 6 + aten::clone 2.08% 21.390us 72.46% 743.968us 123.995us 0.000us 0.00% 19.904us 3.317us 6 + aten::copy_ 3.85% 39.501us 67.34% 691.417us 115.236us 17.056us 40.05% 19.904us 3.317us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.056us 40.05% 17.056us 2.843us 6 + Activity Buffer Request 43.74% 449.121us 43.74% 449.121us 449.121us 2.848us 6.69% 2.848us 2.848us 1 + aten::empty_strided 3.03% 31.161us 3.03% 31.161us 5.193us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 19.75% 202.795us 19.75% 202.795us 33.799us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.15% 32.321us 4.01% 41.121us 3.427us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.86% 8.800us 0.86% 8.800us 0.733us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 4.14% 42.521us 4.14% 42.521us 7.087us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.45% 4.640us 0.45% 4.640us 4.640us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 755.989us -Self CUDA time total: 35.584us +Self CPU time total: 1.027ms +Self CUDA time total: 42.592us @@ -4457,23 +4466,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 349.886us 617.06% 349.886us 349.886us 1 - hf_kernels_rotary 15.93% 158.238us 99.46% 988.285us 988.285us 0.000us 0.00% 59.582us 59.582us 1 - _rotary_dba7d1e::apply_rotary 4.43% 44.009us 8.77% 87.171us 14.528us 39.742us 70.09% 39.742us 6.624us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 39.742us 70.09% 39.742us 6.624us 6 - aten::clone 2.20% 21.907us 70.33% 698.845us 116.474us 0.000us 0.00% 19.840us 3.307us 6 - aten::copy_ 3.76% 37.392us 65.02% 646.067us 107.678us 16.960us 29.91% 19.840us 3.307us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.960us 29.91% 16.960us 2.827us 6 - Activity Buffer Request 43.30% 430.221us 43.30% 430.221us 430.221us 2.880us 5.08% 2.880us 2.880us 1 - aten::empty_strided 3.11% 30.871us 3.11% 30.871us 5.145us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 17.96% 178.454us 17.96% 178.454us 29.742us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 3.43% 34.051us 4.43% 44.031us 3.669us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.00% 9.980us 1.00% 9.980us 0.832us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 4.34% 43.162us 4.34% 43.162us 7.194us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.54% 5.320us 0.54% 5.320us 5.320us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 339.355us 384.24% 339.355us 339.355us 1 + hf_kernels_rotary 17.67% 151.144us 99.35% 849.740us 849.740us 0.000us 0.00% 103.359us 103.359us 1 + aten::clone 2.51% 21.431us 67.11% 573.953us 95.659us 0.000us 0.00% 62.527us 10.421us 6 + aten::copy_ 4.55% 38.879us 60.94% 521.222us 86.870us 47.487us 53.77% 62.527us 10.421us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 47.487us 53.77% 47.487us 7.915us 6 + _rotary_dba7d1e::apply_rotary 5.04% 43.111us 9.90% 84.683us 14.114us 40.832us 46.23% 40.832us 6.805us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 40.832us 46.23% 40.832us 6.805us 6 + Activity Buffer Request 33.08% 282.937us 33.08% 282.937us 282.937us 15.040us 17.03% 15.040us 15.040us 1 + aten::empty_strided 3.66% 31.300us 3.66% 31.300us 5.217us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 23.31% 199.406us 23.31% 199.406us 33.234us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.68% 31.469us 4.67% 39.960us 3.330us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.99% 8.491us 0.99% 8.491us 0.708us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 4.86% 41.572us 4.86% 41.572us 6.929us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.65% 5.540us 0.65% 5.540us 5.540us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 993.605us -Self CUDA time total: 56.702us +Self CPU time total: 855.280us +Self CUDA time total: 88.319us @@ -4483,23 +4492,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 352.574us 297.38% 352.574us 352.574us 1 - hf_kernels_rotary 18.56% 157.003us 99.43% 841.041us 841.041us 0.000us 0.00% 135.680us 135.680us 1 - aten::clone 2.59% 21.881us 65.75% 556.174us 92.696us 0.000us 0.00% 69.984us 11.664us 6 - aten::copy_ 4.37% 36.992us 59.34% 501.912us 83.652us 52.864us 44.59% 69.984us 11.664us 6 - _rotary_dba7d1e::apply_rotary 5.11% 43.221us 10.14% 85.754us 14.292us 65.696us 55.41% 65.696us 10.949us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 65.696us 55.41% 65.696us 10.949us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 52.864us 44.59% 52.864us 8.811us 6 - Activity Buffer Request 33.65% 284.597us 33.65% 284.597us 284.597us 17.120us 14.44% 17.120us 17.120us 1 - aten::empty_strided 3.83% 32.381us 3.83% 32.381us 5.397us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 21.32% 180.323us 21.32% 180.323us 30.054us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 3.89% 32.880us 4.98% 42.110us 3.509us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.09% 9.230us 1.09% 9.230us 0.769us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 5.03% 42.533us 5.03% 42.533us 7.089us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.57% 4.810us 0.57% 4.810us 4.810us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 340.473us 232.72% 340.473us 340.473us 1 + hf_kernels_rotary 20.27% 170.703us 99.41% 837.069us 837.069us 0.000us 0.00% 170.267us 170.267us 1 + aten::clone 2.43% 20.451us 64.32% 541.612us 90.269us 0.000us 0.00% 106.876us 17.813us 6 + aten::copy_ 4.52% 38.100us 58.32% 491.079us 81.846us 82.909us 56.67% 106.876us 17.813us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 82.909us 56.67% 82.909us 13.818us 6 + _rotary_dba7d1e::apply_rotary 5.01% 42.193us 10.04% 84.563us 14.094us 63.391us 43.33% 63.391us 10.565us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 63.391us 43.33% 63.391us 10.565us 6 + Activity Buffer Request 30.43% 256.245us 30.43% 256.245us 256.245us 23.967us 16.38% 23.967us 23.967us 1 + aten::empty_strided 3.57% 30.082us 3.57% 30.082us 5.014us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 23.36% 196.734us 23.36% 196.734us 32.789us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.79% 31.880us 4.77% 40.191us 3.349us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.99% 8.311us 0.99% 8.311us 0.693us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 5.03% 42.370us 5.03% 42.370us 7.062us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.59% 4.950us 0.59% 4.950us 4.950us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 845.851us -Self CUDA time total: 118.560us +Self CPU time total: 842.019us +Self CUDA time total: 146.300us @@ -4509,23 +4518,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 341.982us 603.45% 341.982us 341.982us 1 - hf_kernels_rotary 18.98% 155.712us 99.43% 815.710us 815.710us 0.000us 0.00% 59.487us 59.487us 1 - _rotary_dba7d1e::apply_rotary 5.25% 43.112us 10.37% 85.045us 14.174us 39.839us 70.30% 39.839us 6.640us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 39.839us 70.30% 39.839us 6.640us 6 - aten::clone 2.51% 20.600us 64.82% 531.763us 88.627us 0.000us 0.00% 19.648us 3.275us 6 - aten::copy_ 4.52% 37.100us 58.54% 480.262us 80.044us 16.832us 29.70% 19.648us 3.275us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.832us 29.70% 16.832us 2.805us 6 - Activity Buffer Request 32.45% 266.237us 32.45% 266.237us 266.237us 2.816us 4.97% 2.816us 2.816us 1 - aten::empty_strided 3.77% 30.901us 3.77% 30.901us 5.150us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 21.57% 176.925us 21.57% 176.925us 29.488us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.05% 33.240us 5.26% 43.190us 3.599us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.21% 9.950us 1.21% 9.950us 0.829us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 5.11% 41.933us 5.11% 41.933us 6.989us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.57% 4.700us 0.57% 4.700us 4.700us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 372.221us 493.09% 372.221us 372.221us 1 + hf_kernels_rotary 8.48% 174.314us 99.77% 2.050ms 2.050ms 0.000us 0.00% 82.399us 82.399us 1 + _rotary_dba7d1e::apply_rotary 1.96% 40.332us 4.07% 83.672us 13.945us 41.887us 55.49% 41.887us 6.981us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 41.887us 55.49% 41.887us 6.981us 6 + aten::clone 1.40% 28.690us 85.15% 1.749ms 291.567us 0.000us 0.00% 40.512us 6.752us 6 + aten::copy_ 2.00% 41.101us 81.21% 1.669ms 278.085us 33.600us 44.51% 40.512us 6.752us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 33.600us 44.51% 33.600us 5.600us 6 + Activity Buffer Request 69.57% 1.429ms 69.57% 1.429ms 1.429ms 6.912us 9.16% 6.912us 6.912us 1 + aten::empty_strided 2.54% 52.203us 2.54% 52.203us 8.701us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 9.64% 198.094us 9.64% 198.094us 33.016us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.63% 33.409us 2.06% 42.400us 3.533us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.44% 8.991us 0.44% 8.991us 0.749us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 2.11% 43.340us 2.11% 43.340us 7.223us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.23% 4.681us 0.23% 4.681us 4.681us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 820.410us -Self CUDA time total: 56.671us +Self CPU time total: 2.054ms +Self CUDA time total: 75.487us @@ -4535,23 +4544,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 388.726us 325.86% 388.726us 388.726us 1 - hf_kernels_rotary 19.76% 169.936us 99.45% 855.401us 855.401us 0.000us 0.00% 136.923us 136.923us 1 - aten::clone 2.64% 22.710us 63.15% 543.123us 90.521us 0.000us 0.00% 70.877us 11.813us 6 - aten::copy_ 4.46% 38.370us 56.50% 485.931us 80.988us 53.246us 44.64% 70.877us 11.813us 6 - _rotary_dba7d1e::apply_rotary 5.64% 48.490us 10.91% 93.801us 15.634us 66.046us 55.36% 66.046us 11.008us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 66.046us 55.36% 66.046us 11.008us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 53.246us 44.64% 53.246us 8.874us 6 - Activity Buffer Request 30.83% 265.147us 30.83% 265.147us 265.147us 17.631us 14.78% 17.631us 17.631us 1 - aten::empty_strided 4.01% 34.482us 4.01% 34.482us 5.747us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 21.21% 182.414us 21.21% 182.414us 30.402us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.39% 37.781us 5.64% 48.541us 4.045us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.25% 10.760us 1.25% 10.760us 0.897us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 5.27% 45.311us 5.27% 45.311us 7.552us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.55% 4.700us 0.55% 4.700us 4.700us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 352.925us 241.86% 352.925us 352.925us 1 + hf_kernels_rotary 8.45% 172.475us 99.74% 2.036ms 2.036ms 0.000us 0.00% 169.664us 169.664us 1 + aten::clone 1.31% 26.731us 85.08% 1.737ms 289.500us 0.000us 0.00% 105.664us 17.611us 6 + aten::copy_ 1.93% 39.410us 82.15% 1.677ms 279.535us 81.920us 56.14% 105.664us 17.611us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 81.920us 56.14% 81.920us 13.653us 6 + _rotary_dba7d1e::apply_rotary 2.10% 42.891us 4.20% 85.681us 14.280us 64.000us 43.86% 64.000us 10.667us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 64.000us 43.86% 64.000us 10.667us 6 + Activity Buffer Request 70.60% 1.441ms 70.60% 1.441ms 1.441ms 23.744us 16.27% 23.744us 23.744us 1 + aten::empty_strided 1.62% 33.061us 1.62% 33.061us 5.510us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 9.62% 196.364us 9.62% 196.364us 32.727us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.58% 32.223us 2.02% 41.242us 3.437us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.44% 9.019us 0.44% 9.019us 0.752us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 2.10% 42.790us 2.10% 42.790us 7.132us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.26% 5.250us 0.26% 5.250us 5.250us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 860.101us -Self CUDA time total: 119.292us +Self CPU time total: 2.042ms +Self CUDA time total: 145.920us @@ -4561,23 +4570,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 357.115us 181.96% 357.115us 357.115us 1 - hf_kernels_rotary 18.86% 155.885us 99.43% 821.750us 821.750us 0.000us 0.00% 219.904us 219.904us 1 - _rotary_dba7d1e::apply_rotary 5.36% 44.321us 10.59% 87.561us 14.594us 115.808us 59.01% 115.808us 19.301us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 115.808us 59.01% 115.808us 19.301us 6 - aten::clone 2.51% 20.740us 64.81% 535.643us 89.274us 0.000us 0.00% 104.096us 17.349us 6 - aten::copy_ 4.34% 35.891us 58.73% 485.402us 80.900us 80.448us 40.99% 104.096us 17.349us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 80.448us 40.99% 80.448us 13.408us 6 - Activity Buffer Request 32.66% 269.957us 32.66% 269.957us 269.957us 23.648us 12.05% 23.648us 23.648us 1 - aten::empty_strided 3.57% 29.501us 3.57% 29.501us 4.917us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 21.72% 179.554us 21.72% 179.554us 29.926us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 3.97% 32.801us 5.16% 42.661us 3.555us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.19% 9.860us 1.19% 9.860us 0.822us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 5.23% 43.240us 5.23% 43.240us 7.207us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.57% 4.750us 0.57% 4.750us 4.750us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 19.65% 213.486us 76.93% 835.739us 835.739us 0.000us 0.00% 746.680us 746.680us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 687.960us 101.16% 687.960us 687.960us 1 + aten::clone 1.98% 21.549us 45.41% 493.341us 82.224us 0.000us 0.00% 557.242us 92.874us 6 + aten::copy_ 3.46% 37.632us 40.54% 440.431us 73.405us 490.619us 72.14% 557.242us 92.874us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 490.619us 72.14% 490.619us 81.770us 6 + _rotary_dba7d1e::apply_rotary 4.15% 45.032us 8.07% 87.672us 14.612us 189.438us 27.86% 189.438us 31.573us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 189.438us 27.86% 189.438us 31.573us 6 + Activity Buffer Request 19.06% 207.035us 19.06% 207.035us 207.035us 66.623us 9.80% 66.623us 66.623us 1 + aten::empty_strided 2.89% 31.361us 2.89% 31.361us 5.227us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 18.02% 195.764us 18.02% 195.764us 32.627us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.96% 32.192us 3.80% 41.240us 3.437us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.83% 9.048us 0.83% 9.048us 0.754us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 3.93% 42.640us 3.93% 42.640us 7.107us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 23.07% 250.625us 23.07% 250.625us 250.625us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 826.500us -Self CUDA time total: 196.256us +Self CPU time total: 1.086ms +Self CUDA time total: 680.057us @@ -4587,60 +4596,60 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 13.04% 159.984us 66.42% 814.800us 814.800us 0.000us 0.00% 847.705us 847.705us 1 - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 789.466us 101.01% 789.466us 789.466us 1 - aten::clone 1.84% 22.521us 42.98% 527.184us 87.864us 0.000us 0.00% 577.883us 96.314us 6 - aten::copy_ 2.96% 36.311us 38.61% 473.681us 78.947us 511.772us 65.48% 577.883us 96.314us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 511.772us 65.48% 511.772us 85.295us 6 - _rotary_dba7d1e::apply_rotary 3.59% 44.023us 6.92% 84.943us 14.157us 269.822us 34.52% 269.822us 44.970us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 269.822us 34.52% 269.822us 44.970us 6 - Activity Buffer Request 21.07% 258.456us 21.07% 258.456us 258.456us 66.111us 8.46% 66.111us 66.111us 1 - aten::empty_strided 2.53% 30.982us 2.53% 30.982us 5.164us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 14.58% 178.914us 14.58% 178.914us 29.819us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.74% 33.620us 3.48% 42.689us 3.557us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.74% 9.069us 0.74% 9.069us 0.756us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 3.34% 40.920us 3.34% 40.920us 6.820us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 33.58% 411.910us 33.58% 411.910us 411.910us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 5.18% 149.334us 27.72% 799.139us 799.139us 0.000us 0.00% 2.625ms 2.625ms 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 2.454ms 100.33% 2.454ms 2.454ms 1 + aten::clone 0.74% 21.311us 18.12% 522.352us 87.059us 0.000us 0.00% 1.393ms 232.143us 6 + aten::copy_ 1.34% 38.611us 16.30% 469.821us 78.303us 1.214ms 49.62% 1.393ms 232.143us 6 + _rotary_dba7d1e::apply_rotary 1.48% 42.661us 3.03% 87.271us 14.545us 1.232ms 50.38% 1.232ms 205.327us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 1.232ms 50.38% 1.232ms 205.327us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 1.214ms 49.62% 1.214ms 202.255us 6 + Activity Buffer Request 8.25% 237.786us 8.25% 237.786us 237.786us 179.327us 7.33% 179.327us 179.327us 1 + aten::empty_strided 1.08% 31.220us 1.08% 31.220us 5.203us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 6.71% 193.424us 6.71% 193.424us 32.237us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.11% 31.861us 1.39% 40.182us 3.349us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.29% 8.321us 0.29% 8.321us 0.693us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 1.55% 44.610us 1.55% 44.610us 7.435us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 72.28% 2.083ms 72.28% 2.083ms 2.083ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.227ms -Self CUDA time total: 781.594us +Self CPU time total: 2.882ms +Self CUDA time total: 2.445ms impl wl p50(ms) ok -hf_kernels_rotary cuda_B1_S128_H32_D128_R64 0.09 False -hf_kernels_rotary cuda_B1_S128_H32_D64_R32 0.09 False -hf_kernels_rotary cuda_B1_S128_H8_D128_R64 0.10 False -hf_kernels_rotary cuda_B1_S128_H8_D64_R32 0.08 False -hf_kernels_rotary cuda_B1_S2048_H32_D128_R64 0.09 False -hf_kernels_rotary cuda_B1_S2048_H32_D64_R32 0.09 False -hf_kernels_rotary cuda_B1_S2048_H8_D128_R64 0.09 False -hf_kernels_rotary cuda_B1_S2048_H8_D64_R32 0.09 False -hf_kernels_rotary cuda_B1_S512_H32_D128_R64 0.09 False -hf_kernels_rotary cuda_B1_S512_H32_D64_R32 0.09 False -hf_kernels_rotary cuda_B1_S512_H8_D128_R64 0.09 False -hf_kernels_rotary cuda_B1_S512_H8_D64_R32 0.09 False -hf_kernels_rotary cuda_B2_S128_H32_D128_R64 0.09 False -hf_kernels_rotary cuda_B2_S128_H32_D64_R32 0.09 False -hf_kernels_rotary cuda_B2_S128_H8_D128_R64 0.09 False -hf_kernels_rotary cuda_B2_S128_H8_D64_R32 0.09 False -hf_kernels_rotary cuda_B2_S2048_H32_D128_R64 0.28 False -hf_kernels_rotary cuda_B2_S2048_H32_D64_R32 0.10 False -hf_kernels_rotary cuda_B2_S2048_H8_D128_R64 0.09 False -hf_kernels_rotary cuda_B2_S2048_H8_D64_R32 0.09 False -hf_kernels_rotary cuda_B2_S512_H32_D128_R64 0.09 False -hf_kernels_rotary cuda_B2_S512_H32_D64_R32 0.09 False -hf_kernels_rotary cuda_B2_S512_H8_D128_R64 0.09 False -hf_kernels_rotary cuda_B2_S512_H8_D64_R32 0.09 False +hf_kernels_rotary cuda_B1_S128_H32_D128_R64 0.09 True +hf_kernels_rotary cuda_B1_S128_H32_D64_R32 0.09 True +hf_kernels_rotary cuda_B1_S128_H8_D128_R64 0.09 True +hf_kernels_rotary cuda_B1_S128_H8_D64_R32 0.07 True +hf_kernels_rotary cuda_B1_S2048_H32_D128_R64 0.26 True +hf_kernels_rotary cuda_B1_S2048_H32_D64_R32 0.09 True +hf_kernels_rotary cuda_B1_S2048_H8_D128_R64 0.09 True +hf_kernels_rotary cuda_B1_S2048_H8_D64_R32 0.09 True +hf_kernels_rotary cuda_B1_S512_H32_D128_R64 0.09 True +hf_kernels_rotary cuda_B1_S512_H32_D64_R32 0.09 True +hf_kernels_rotary cuda_B1_S512_H8_D128_R64 0.09 True +hf_kernels_rotary cuda_B1_S512_H8_D64_R32 0.09 True +hf_kernels_rotary cuda_B2_S128_H32_D128_R64 0.09 True +hf_kernels_rotary cuda_B2_S128_H32_D64_R32 0.09 True +hf_kernels_rotary cuda_B2_S128_H8_D128_R64 0.09 True +hf_kernels_rotary cuda_B2_S128_H8_D64_R32 0.09 True +hf_kernels_rotary cuda_B2_S2048_H32_D128_R64 0.84 True +hf_kernels_rotary cuda_B2_S2048_H32_D64_R32 0.26 True +hf_kernels_rotary cuda_B2_S2048_H8_D128_R64 0.09 True +hf_kernels_rotary cuda_B2_S2048_H8_D64_R32 0.09 True +hf_kernels_rotary cuda_B2_S512_H32_D128_R64 0.09 True +hf_kernels_rotary cuda_B2_S512_H32_D64_R32 0.09 True +hf_kernels_rotary cuda_B2_S512_H8_D128_R64 0.09 True +hf_kernels_rotary cuda_B2_S512_H8_D64_R32 0.09 True
▶ UV Install Logs
Fetching 5 files: 0%| | 0/5 [00:00<?, ?it/s] -Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 22.14it/s] -Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 22.12it/s]
+Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 15.01it/s] +Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 15.00it/s]

Artifacts:

rotary.jsonl