diff --git "a/rotary/impls/hf_kernels_rotary.html" "b/rotary/impls/hf_kernels_rotary.html" --- "a/rotary/impls/hf_kernels_rotary.html" +++ "b/rotary/impls/hf_kernels_rotary.html" @@ -3905,7 +3905,7 @@ Cell: nv | 0.25s
Fri Dec 19 19:54:55 2025 +Fri Dec 19 23:00:45 2025 +-----------------------------------------------------------------------------------------+ | NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 | +-----------------------------------------+------------------------+----------------------+ @@ -3914,7 +3914,7 @@ Cell: nv | 0.25s | | | MIG M. | |=========================================+========================+======================| | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 | -| N/A 36C P0 90W / 350W | 0MiB / 46068MiB | 17% Default | +| N/A 40C P0 85W / 350W | 0MiB / 46068MiB | 24% Default | | | | N/A | +-----------------------------------------+------------------------+----------------------+ @@ -3938,7 +3938,7 @@ Cell: nv | 0.25s ▼ output ▶ uv-logs | -Cell: benchmark | 6.21s +Cell: benchmark | 4.78s | Raw @@ -4009,23 +4009,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 397.918us 1712.80% 397.918us 397.918us 1 - hf_kernels_rotary 10.09% 234.566us 99.43% 2.310ms 2.310ms 0.000us 0.00% 24.512us 24.512us 1 - _rotary_dba7d1e::apply_rotary 2.29% 53.121us 4.37% 101.432us 16.905us 16.192us 69.70% 16.192us 2.699us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 16.192us 69.70% 16.192us 2.699us 6 - aten::clone 1.67% 38.889us 82.80% 1.924ms 320.673us 0.000us 0.00% 8.320us 1.387us 6 - aten::copy_ 1.71% 39.649us 78.85% 1.832ms 305.366us 7.040us 30.30% 8.320us 1.387us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.040us 30.30% 7.040us 1.173us 6 - Activity Buffer Request 74.05% 1.721ms 74.05% 1.721ms 1.721ms 1.280us 5.51% 1.280us 1.280us 1 - aten::empty_strided 2.28% 52.952us 2.28% 52.952us 8.825us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 3.09% 71.834us 3.09% 71.834us 11.972us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.69% 39.231us 2.17% 50.432us 4.203us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.48% 11.201us 0.48% 11.201us 0.933us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 2.08% 48.311us 2.08% 48.311us 8.052us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.57% 13.240us 0.57% 13.240us 13.240us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 384.577us 1653.10% 384.577us 384.577us 1 + hf_kernels_rotary 9.70% 223.937us 99.35% 2.294ms 2.294ms 0.000us 0.00% 24.544us 24.544us 1 + _rotary_dba7d1e::apply_rotary 2.33% 53.870us 4.38% 101.111us 16.852us 16.128us 69.33% 16.128us 2.688us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 16.128us 69.33% 16.128us 2.688us 6 + aten::clone 1.57% 36.153us 83.10% 1.919ms 319.785us 0.000us 0.00% 8.416us 1.403us 6 + aten::copy_ 1.64% 37.980us 79.42% 1.834ms 305.621us 7.136us 30.67% 8.416us 1.403us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.136us 30.67% 7.136us 1.189us 6 + Activity Buffer Request 74.69% 1.725ms 74.69% 1.725ms 1.725ms 1.280us 5.50% 1.280us 1.280us 1 + aten::empty_strided 2.11% 48.830us 2.11% 48.830us 8.138us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 3.08% 71.171us 3.08% 71.171us 11.862us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.71% 39.468us 2.17% 50.120us 4.177us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.46% 10.652us 0.46% 10.652us 0.888us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 2.05% 47.241us 2.05% 47.241us 7.873us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.65% 15.020us 0.65% 15.020us 15.020us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.324ms -Self CUDA time total: 23.232us +Self CPU time total: 2.309ms +Self CUDA time total: 23.264us @@ -4035,23 +4035,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 351.006us 1460.58% 351.006us 351.006us 1 - hf_kernels_rotary 8.50% 188.914us 99.67% 2.215ms 2.215ms 0.000us 0.00% 25.344us 25.344us 1 - _rotary_dba7d1e::apply_rotary 1.92% 42.653us 3.86% 85.804us 14.301us 16.160us 67.24% 16.160us 2.693us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 16.160us 67.24% 16.160us 2.693us 6 - aten::clone 1.34% 29.789us 85.47% 1.899ms 316.566us 0.000us 0.00% 9.184us 1.531us 6 - aten::copy_ 1.59% 35.393us 82.63% 1.836ms 306.029us 7.872us 32.76% 9.184us 1.531us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.872us 32.76% 7.872us 1.312us 6 - Activity Buffer Request 78.48% 1.744ms 78.48% 1.744ms 1.744ms 1.312us 5.46% 1.312us 1.312us 1 - aten::empty_strided 1.50% 33.432us 1.50% 33.432us 5.572us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 2.55% 56.710us 2.55% 56.710us 9.452us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.45% 32.280us 1.84% 40.820us 3.402us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.38% 8.540us 0.38% 8.540us 0.712us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 1.94% 43.151us 1.94% 43.151us 7.192us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.33% 7.360us 0.33% 7.360us 7.360us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 323.742us 1343.66% 323.742us 323.742us 1 + hf_kernels_rotary 8.02% 171.501us 99.72% 2.131ms 2.131ms 0.000us 0.00% 25.406us 25.406us 1 + _rotary_dba7d1e::apply_rotary 1.84% 39.272us 3.69% 78.893us 13.149us 16.224us 67.34% 16.224us 2.704us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 16.224us 67.34% 16.224us 2.704us 6 + aten::clone 1.20% 25.714us 86.18% 1.842ms 306.982us 0.000us 0.00% 9.182us 1.530us 6 + aten::copy_ 1.70% 36.249us 83.58% 1.786ms 297.695us 7.870us 32.66% 9.182us 1.530us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.870us 32.66% 7.870us 1.312us 6 + Activity Buffer Request 79.35% 1.696ms 79.35% 1.696ms 1.696ms 1.312us 5.45% 1.312us 1.312us 1 + aten::empty_strided 1.40% 30.010us 1.40% 30.010us 5.002us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 2.53% 54.011us 2.53% 54.011us 9.002us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.43% 30.532us 1.82% 38.871us 3.239us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.39% 8.339us 0.39% 8.339us 0.695us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 1.85% 39.621us 1.85% 39.621us 6.603us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.28% 6.040us 0.28% 6.040us 6.040us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.222ms -Self CUDA time total: 24.032us +Self CPU time total: 2.137ms +Self CUDA time total: 24.094us @@ -4061,23 +4061,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 328.861us 1359.32% 328.861us 328.861us 1 - hf_kernels_rotary 7.82% 169.265us 99.74% 2.160ms 2.160ms 0.000us 0.00% 25.505us 25.505us 1 - _rotary_dba7d1e::apply_rotary 1.90% 41.240us 3.83% 83.032us 13.839us 16.449us 67.99% 16.449us 2.742us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 16.449us 67.99% 16.449us 2.742us 6 - aten::clone 1.22% 26.522us 86.28% 1.868ms 311.384us 0.000us 0.00% 9.056us 1.509us 6 - aten::copy_ 1.60% 34.652us 83.63% 1.811ms 301.836us 7.744us 32.01% 9.056us 1.509us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.744us 32.01% 7.744us 1.291us 6 - Activity Buffer Request 79.55% 1.723ms 79.55% 1.723ms 1.723ms 1.312us 5.42% 1.312us 1.312us 1 - aten::empty_strided 1.42% 30.770us 1.42% 30.770us 5.128us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 2.49% 53.840us 2.49% 53.840us 8.973us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.40% 30.337us 1.81% 39.289us 3.274us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.41% 8.952us 0.41% 8.952us 0.746us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 1.93% 41.792us 1.93% 41.792us 6.965us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.26% 5.540us 0.26% 5.540us 5.540us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 324.959us 1341.36% 324.959us 324.959us 1 + hf_kernels_rotary 7.79% 167.945us 99.74% 2.150ms 2.150ms 0.000us 0.00% 25.506us 25.506us 1 + _rotary_dba7d1e::apply_rotary 1.90% 41.040us 3.78% 81.541us 13.590us 16.482us 68.03% 16.482us 2.747us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 16.482us 68.03% 16.482us 2.747us 6 + aten::clone 1.17% 25.161us 86.39% 1.862ms 310.280us 0.000us 0.00% 9.024us 1.504us 6 + aten::copy_ 1.63% 35.122us 83.86% 1.807ms 301.213us 7.744us 31.97% 9.024us 1.504us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.744us 31.97% 7.744us 1.291us 6 + Activity Buffer Request 79.59% 1.715ms 79.59% 1.715ms 1.715ms 1.280us 5.28% 1.280us 1.280us 1 + aten::empty_strided 1.36% 29.241us 1.36% 29.241us 4.874us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 2.64% 56.970us 2.64% 56.970us 9.495us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.41% 30.477us 1.78% 38.419us 3.202us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.37% 7.942us 0.37% 7.942us 0.662us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 1.88% 40.501us 1.88% 40.501us 6.750us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.26% 5.501us 0.26% 5.501us 5.501us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.165ms -Self CUDA time total: 24.193us +Self CPU time total: 2.155ms +Self CUDA time total: 24.226us @@ -4087,23 +4087,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 332.000us 1184.36% 332.000us 332.000us 1 - hf_kernels_rotary 7.19% 171.403us 99.79% 2.378ms 2.378ms 0.000us 0.00% 29.792us 29.792us 1 - _rotary_dba7d1e::apply_rotary 1.72% 40.922us 3.47% 82.793us 13.799us 17.632us 62.90% 17.632us 2.939us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 17.632us 62.90% 17.632us 2.939us 6 - aten::clone 1.16% 27.640us 87.43% 2.084ms 347.303us 0.000us 0.00% 12.160us 2.027us 6 - aten::copy_ 1.42% 33.951us 84.95% 2.025ms 337.488us 10.400us 37.10% 12.160us 2.027us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.400us 37.10% 10.400us 1.733us 6 - Activity Buffer Request 73.90% 1.761ms 73.90% 1.761ms 1.761ms 1.760us 6.28% 1.760us 1.760us 1 - aten::empty_strided 1.31% 31.250us 1.31% 31.250us 5.208us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 9.63% 229.586us 9.63% 229.586us 38.264us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.31% 31.193us 1.70% 40.482us 3.373us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.39% 9.289us 0.39% 9.289us 0.774us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 1.76% 41.871us 1.76% 41.871us 6.979us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.21% 5.050us 0.21% 5.050us 5.050us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 364.384us 1292.46% 364.384us 364.384us 1 + hf_kernels_rotary 8.38% 203.415us 99.78% 2.422ms 2.422ms 0.000us 0.00% 29.986us 29.986us 1 + _rotary_dba7d1e::apply_rotary 1.78% 43.242us 3.59% 87.183us 14.530us 17.664us 62.65% 17.664us 2.944us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 17.664us 62.65% 17.664us 2.944us 6 + aten::clone 1.06% 25.710us 86.14% 2.091ms 348.456us 0.000us 0.00% 12.322us 2.054us 6 + aten::copy_ 1.40% 33.961us 83.85% 2.035ms 339.187us 10.529us 37.35% 12.322us 2.054us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.529us 37.35% 10.529us 1.755us 6 + Activity Buffer Request 73.02% 1.772ms 73.02% 1.772ms 1.772ms 1.793us 6.36% 1.793us 1.793us 1 + aten::empty_strided 1.23% 29.901us 1.23% 29.901us 4.983us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 9.43% 228.855us 9.43% 228.855us 38.143us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.32% 32.031us 1.67% 40.620us 3.385us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.35% 8.589us 0.35% 8.589us 0.716us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 1.81% 43.941us 1.81% 43.941us 7.323us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.22% 5.260us 0.22% 5.260us 5.260us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.384ms -Self CUDA time total: 28.032us +Self CPU time total: 2.427ms +Self CUDA time total: 28.193us @@ -4113,23 +4113,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 335.418us 1384.59% 335.418us 335.418us 1 - hf_kernels_rotary 20.26% 170.294us 99.34% 834.940us 834.940us 0.000us 0.00% 25.537us 25.537us 1 - _rotary_dba7d1e::apply_rotary 4.83% 40.562us 9.92% 83.412us 13.902us 16.513us 68.17% 16.513us 2.752us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 16.513us 68.17% 16.513us 2.752us 6 - aten::clone 2.64% 22.222us 64.45% 541.674us 90.279us 0.000us 0.00% 9.024us 1.504us 6 - aten::copy_ 4.18% 35.111us 57.94% 486.972us 81.162us 7.712us 31.83% 9.024us 1.504us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.712us 31.83% 7.712us 1.285us 6 - Activity Buffer Request 27.90% 234.506us 27.90% 234.506us 234.506us 1.312us 5.42% 1.312us 1.312us 1 - aten::empty_strided 3.86% 32.480us 3.86% 32.480us 5.413us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 25.86% 217.355us 25.86% 217.355us 36.226us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 3.66% 30.769us 4.71% 39.560us 3.297us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.05% 8.791us 1.05% 8.791us 0.733us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 5.10% 42.850us 5.10% 42.850us 7.142us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.66% 5.560us 0.66% 5.560us 5.560us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 323.517us 1337.29% 323.517us 323.517us 1 + hf_kernels_rotary 20.05% 169.673us 99.33% 840.688us 840.688us 0.000us 0.00% 25.472us 25.472us 1 + _rotary_dba7d1e::apply_rotary 4.98% 42.181us 9.92% 83.942us 13.990us 16.480us 68.12% 16.480us 2.747us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 16.480us 68.12% 16.480us 2.747us 6 + aten::clone 2.34% 19.800us 64.76% 548.062us 91.344us 0.000us 0.00% 8.992us 1.499us 6 + aten::copy_ 3.95% 33.463us 59.03% 499.612us 83.269us 7.712us 31.88% 8.992us 1.499us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.712us 31.88% 7.712us 1.285us 6 + Activity Buffer Request 28.63% 242.305us 28.63% 242.305us 242.305us 1.280us 5.29% 1.280us 1.280us 1 + aten::empty_strided 3.39% 28.650us 3.39% 28.650us 4.775us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 26.45% 223.844us 26.45% 223.844us 37.307us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.63% 30.762us 4.61% 39.011us 3.251us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.97% 8.249us 0.97% 8.249us 0.687us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 4.93% 41.761us 4.93% 41.761us 6.960us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.67% 5.650us 0.67% 5.650us 5.650us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 840.500us -Self CUDA time total: 24.225us +Self CPU time total: 846.338us +Self CUDA time total: 24.192us @@ -4139,23 +4139,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 335.419us 1187.11% 335.419us 335.419us 1 - hf_kernels_rotary 22.11% 159.484us 99.14% 715.038us 715.038us 0.000us 0.00% 30.047us 30.047us 1 - _rotary_dba7d1e::apply_rotary 6.01% 43.310us 12.22% 88.120us 14.687us 17.727us 62.74% 17.727us 2.954us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 17.727us 62.74% 17.727us 2.954us 6 - aten::clone 2.86% 20.618us 59.23% 427.160us 71.193us 0.000us 0.00% 12.320us 2.053us 6 - aten::copy_ 4.70% 33.871us 52.15% 376.129us 62.688us 10.528us 37.26% 12.320us 2.053us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.528us 37.26% 10.528us 1.755us 6 - Activity Buffer Request 17.96% 129.563us 17.96% 129.563us 129.563us 1.792us 6.34% 1.792us 1.792us 1 - aten::empty_strided 4.22% 30.413us 4.22% 30.413us 5.069us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 29.49% 212.695us 29.49% 212.695us 35.449us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.36% 31.443us 5.58% 40.274us 3.356us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.22% 8.831us 1.22% 8.831us 0.736us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 6.21% 44.810us 6.21% 44.810us 7.468us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.86% 6.181us 0.86% 6.181us 6.181us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 311.513us 1102.47% 311.513us 311.513us 1 + hf_kernels_rotary 20.61% 164.155us 99.30% 791.098us 791.098us 0.000us 0.00% 30.048us 30.048us 1 + _rotary_dba7d1e::apply_rotary 4.94% 39.349us 9.84% 78.361us 13.060us 17.696us 62.63% 17.696us 2.949us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 17.696us 62.63% 17.696us 2.949us 6 + aten::clone 2.47% 19.670us 64.03% 510.081us 85.014us 0.000us 0.00% 12.352us 2.059us 6 + aten::copy_ 4.07% 32.461us 57.94% 461.581us 76.930us 10.560us 37.37% 12.352us 2.059us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.560us 37.37% 10.560us 1.760us 6 + Activity Buffer Request 27.37% 218.044us 27.37% 218.044us 218.044us 1.792us 6.34% 1.792us 1.792us 1 + aten::empty_strided 3.62% 28.830us 3.62% 28.830us 4.805us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 26.50% 211.076us 26.50% 211.076us 35.179us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.85% 30.651us 4.83% 38.501us 3.208us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.99% 7.850us 0.99% 7.850us 0.654us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 4.90% 39.012us 4.90% 39.012us 6.502us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.70% 5.550us 0.70% 5.550us 5.550us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 721.219us -Self CUDA time total: 28.255us +Self CPU time total: 796.648us +Self CUDA time total: 28.256us @@ -4165,23 +4165,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 339.104us 833.12% 339.104us 339.104us 1 - hf_kernels_rotary 7.67% 177.694us 99.76% 2.310ms 2.310ms 0.000us 0.00% 43.583us 43.583us 1 - _rotary_dba7d1e::apply_rotary 1.80% 41.651us 3.62% 83.803us 13.967us 23.520us 57.78% 23.520us 3.920us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 23.520us 57.78% 23.520us 3.920us 6 - aten::clone 1.20% 27.761us 86.70% 2.008ms 334.588us 0.000us 0.00% 20.063us 3.344us 6 - aten::copy_ 1.49% 34.550us 84.17% 1.949ms 324.808us 17.183us 42.22% 20.063us 3.344us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.183us 42.22% 17.183us 2.864us 6 - Activity Buffer Request 73.48% 1.701ms 73.48% 1.701ms 1.701ms 2.880us 7.08% 2.880us 2.880us 1 - aten::empty_strided 1.34% 30.920us 1.34% 30.920us 5.153us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 9.20% 212.976us 9.20% 212.976us 35.496us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.39% 32.120us 1.76% 40.721us 3.393us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.37% 8.601us 0.37% 8.601us 0.717us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 1.82% 42.152us 1.82% 42.152us 7.025us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.24% 5.660us 0.24% 5.660us 5.660us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 323.712us 798.42% 323.712us 323.712us 1 + hf_kernels_rotary 7.57% 170.665us 99.78% 2.250ms 2.250ms 0.000us 0.00% 43.392us 43.392us 1 + _rotary_dba7d1e::apply_rotary 1.73% 38.911us 3.48% 78.541us 13.090us 23.456us 57.85% 23.456us 3.909us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 23.456us 57.85% 23.456us 3.909us 6 + aten::clone 1.16% 26.090us 87.02% 1.962ms 327.007us 0.000us 0.00% 19.936us 3.323us 6 + aten::copy_ 1.56% 35.131us 84.49% 1.905ms 317.482us 17.088us 42.15% 19.936us 3.323us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.088us 42.15% 17.088us 2.848us 6 + Activity Buffer Request 74.36% 1.676ms 74.36% 1.676ms 1.676ms 2.848us 7.02% 2.848us 2.848us 1 + aten::empty_strided 1.38% 31.061us 1.38% 31.061us 5.177us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 8.57% 193.284us 8.57% 193.284us 32.214us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.34% 30.268us 1.71% 38.460us 3.205us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.36% 8.192us 0.36% 8.192us 0.683us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 1.76% 39.630us 1.76% 39.630us 6.605us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.22% 4.960us 0.22% 4.960us 4.960us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.315ms -Self CUDA time total: 40.703us +Self CPU time total: 2.255ms +Self CUDA time total: 40.544us @@ -4191,23 +4191,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 342.239us 459.79% 342.239us 342.239us 1 - hf_kernels_rotary 7.38% 176.732us 99.78% 2.390ms 2.390ms 0.000us 0.00% 82.913us 82.913us 1 - aten::clone 1.17% 28.130us 87.22% 2.089ms 348.177us 0.000us 0.00% 43.777us 7.296us 6 - aten::copy_ 1.46% 34.971us 84.72% 2.029ms 338.203us 35.297us 47.42% 43.777us 7.296us 6 - _rotary_dba7d1e::apply_rotary 1.71% 40.931us 3.50% 83.864us 13.977us 39.136us 52.58% 39.136us 6.523us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 39.136us 52.58% 39.136us 6.523us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 35.297us 47.42% 35.297us 5.883us 6 - Activity Buffer Request 74.20% 1.777ms 74.20% 1.777ms 1.777ms 8.480us 11.39% 8.480us 8.480us 1 - aten::empty_strided 1.32% 31.711us 1.32% 31.711us 5.285us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 9.06% 217.015us 9.06% 217.015us 36.169us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.30% 31.251us 1.69% 40.431us 3.369us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.38% 9.180us 0.38% 9.180us 0.765us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 1.79% 42.933us 1.79% 42.933us 7.155us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.22% 5.191us 0.22% 5.191us 5.191us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 325.696us 437.95% 325.696us 325.696us 1 + hf_kernels_rotary 7.25% 170.323us 99.77% 2.344ms 2.344ms 0.000us 0.00% 82.880us 82.880us 1 + aten::clone 1.13% 26.590us 87.43% 2.054ms 342.324us 0.000us 0.00% 44.000us 7.333us 6 + aten::copy_ 1.50% 35.240us 85.03% 1.997ms 332.900us 35.488us 47.72% 44.000us 7.333us 6 + _rotary_dba7d1e::apply_rotary 1.74% 40.881us 3.46% 81.232us 13.539us 38.880us 52.28% 38.880us 6.480us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 38.880us 52.28% 38.880us 6.480us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 35.488us 47.72% 35.488us 5.915us 6 + Activity Buffer Request 75.65% 1.777ms 75.65% 1.777ms 1.777ms 8.512us 11.45% 8.512us 8.512us 1 + aten::empty_strided 1.27% 29.951us 1.27% 29.951us 4.992us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 7.87% 184.984us 7.87% 184.984us 30.831us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.28% 30.151us 1.62% 38.112us 3.176us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.34% 7.961us 0.34% 7.961us 0.663us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 1.72% 40.351us 1.72% 40.351us 6.725us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.23% 5.500us 0.23% 5.500us 5.500us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.395ms -Self CUDA time total: 74.433us +Self CPU time total: 2.349ms +Self CUDA time total: 74.368us @@ -4217,23 +4217,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 332.929us 819.86% 332.929us 332.929us 1 - hf_kernels_rotary 7.07% 166.848us 99.78% 2.353ms 2.353ms 0.000us 0.00% 43.488us 43.488us 1 - _rotary_dba7d1e::apply_rotary 1.68% 39.560us 3.51% 82.681us 13.780us 23.488us 57.84% 23.488us 3.915us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 23.488us 57.84% 23.488us 3.915us 6 - aten::clone 1.16% 27.340us 87.49% 2.064ms 343.923us 0.000us 0.00% 20.000us 3.333us 6 - aten::copy_ 1.51% 35.519us 85.00% 2.005ms 334.108us 17.120us 42.16% 20.000us 3.333us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.120us 42.16% 17.120us 2.853us 6 - Activity Buffer Request 74.50% 1.757ms 74.50% 1.757ms 1.757ms 2.880us 7.09% 2.880us 2.880us 1 - aten::empty_strided 1.34% 31.550us 1.34% 31.550us 5.258us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 8.99% 212.096us 8.99% 212.096us 35.349us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.35% 31.900us 1.71% 40.350us 3.362us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.36% 8.450us 0.36% 8.450us 0.704us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 1.83% 43.121us 1.83% 43.121us 7.187us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.22% 5.120us 0.22% 5.120us 5.120us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 322.973us 797.86% 322.973us 322.973us 1 + hf_kernels_rotary 8.42% 188.754us 99.76% 2.236ms 2.236ms 0.000us 0.00% 43.328us 43.328us 1 + _rotary_dba7d1e::apply_rotary 1.78% 39.922us 3.52% 78.932us 13.155us 23.456us 57.94% 23.456us 3.909us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 23.456us 57.94% 23.456us 3.909us 6 + aten::clone 1.19% 26.589us 86.07% 1.929ms 321.553us 0.000us 0.00% 19.872us 3.312us 6 + aten::copy_ 1.47% 33.001us 83.54% 1.873ms 312.092us 17.024us 42.06% 19.872us 3.312us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.024us 42.06% 17.024us 2.837us 6 + Activity Buffer Request 73.86% 1.656ms 73.86% 1.656ms 1.656ms 2.848us 7.04% 2.848us 2.848us 1 + aten::empty_strided 1.35% 30.180us 1.35% 30.180us 5.030us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 8.21% 183.975us 8.21% 183.975us 30.662us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.39% 31.060us 1.75% 39.122us 3.260us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.36% 8.062us 0.36% 8.062us 0.672us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 1.74% 39.010us 1.74% 39.010us 6.502us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.24% 5.491us 0.24% 5.491us 5.491us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.359ms -Self CUDA time total: 40.608us +Self CPU time total: 2.242ms +Self CUDA time total: 40.480us @@ -4243,23 +4243,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 338.142us 439.39% 338.142us 338.142us 1 - hf_kernels_rotary 7.05% 174.064us 99.80% 2.465ms 2.465ms 0.000us 0.00% 86.270us 86.270us 1 - aten::clone 1.20% 29.650us 87.84% 2.170ms 361.584us 0.000us 0.00% 47.071us 7.845us 6 - aten::copy_ 1.42% 34.959us 85.36% 2.108ms 351.395us 37.759us 49.06% 47.071us 7.845us 6 - _rotary_dba7d1e::apply_rotary 1.66% 41.022us 3.32% 82.043us 13.674us 39.199us 50.94% 39.199us 6.533us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 39.199us 50.94% 39.199us 6.533us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 37.759us 49.06% 37.759us 6.293us 6 - Activity Buffer Request 75.49% 1.864ms 75.49% 1.864ms 1.864ms 9.312us 12.10% 9.312us 9.312us 1 - aten::empty_strided 1.27% 31.482us 1.27% 31.482us 5.247us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 8.46% 208.956us 8.46% 208.956us 34.826us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.25% 30.771us 1.59% 39.290us 3.274us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.34% 8.519us 0.34% 8.519us 0.710us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 1.66% 41.021us 1.66% 41.021us 6.837us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.20% 5.010us 0.20% 5.010us 5.010us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 323.101us 423.90% 323.101us 323.101us 1 + hf_kernels_rotary 7.30% 168.335us 99.78% 2.301ms 2.301ms 0.000us 0.00% 85.500us 85.500us 1 + aten::clone 1.11% 25.632us 87.37% 2.015ms 335.877us 0.000us 0.00% 46.364us 7.727us 6 + aten::copy_ 1.44% 33.260us 84.98% 1.960ms 326.665us 37.085us 48.65% 46.364us 7.727us 6 + _rotary_dba7d1e::apply_rotary 1.70% 39.159us 3.44% 79.421us 13.237us 39.136us 51.35% 39.136us 6.523us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 39.136us 51.35% 39.136us 6.523us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 37.085us 48.65% 37.085us 6.181us 6 + Activity Buffer Request 75.65% 1.745ms 75.65% 1.745ms 1.745ms 9.279us 12.17% 9.279us 9.279us 1 + aten::empty_strided 1.29% 29.640us 1.29% 29.640us 4.940us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 7.89% 181.984us 7.89% 181.984us 30.331us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.33% 30.620us 1.67% 38.470us 3.206us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.34% 7.850us 0.34% 7.850us 0.654us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 1.75% 40.262us 1.75% 40.262us 6.710us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.22% 5.000us 0.22% 5.000us 5.000us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.470ms -Self CUDA time total: 76.958us +Self CPU time total: 2.306ms +Self CUDA time total: 76.221us @@ -4269,23 +4269,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 343.679us 247.41% 343.679us 343.679us 1 - hf_kernels_rotary 7.47% 171.684us 99.76% 2.294ms 2.294ms 0.000us 0.00% 162.559us 162.559us 1 - aten::clone 1.25% 28.681us 86.89% 1.998ms 333.015us 0.000us 0.00% 102.592us 17.099us 6 - aten::copy_ 1.49% 34.332us 84.20% 1.936ms 322.703us 78.944us 56.83% 102.592us 17.099us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 78.944us 56.83% 78.944us 13.157us 6 - _rotary_dba7d1e::apply_rotary 1.83% 42.151us 3.66% 84.183us 14.030us 59.967us 43.17% 59.967us 9.995us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 59.967us 43.17% 59.967us 9.995us 6 - Activity Buffer Request 73.81% 1.697ms 73.81% 1.697ms 1.697ms 23.648us 17.02% 23.648us 23.648us 1 - aten::empty_strided 1.44% 33.190us 1.44% 33.190us 5.532us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 8.89% 204.504us 8.89% 204.504us 34.084us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.37% 31.511us 1.74% 40.120us 3.343us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.37% 8.609us 0.37% 8.609us 0.717us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 1.83% 42.032us 1.83% 42.032us 7.005us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.24% 5.461us 0.24% 5.461us 5.461us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 328.643us 235.88% 328.643us 328.643us 1 + hf_kernels_rotary 7.36% 166.641us 99.76% 2.258ms 2.258ms 0.000us 0.00% 163.042us 163.042us 1 + aten::clone 1.33% 30.021us 87.18% 1.973ms 328.842us 0.000us 0.00% 102.882us 17.147us 6 + aten::copy_ 1.51% 34.171us 84.51% 1.913ms 318.757us 79.169us 56.82% 102.882us 17.147us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 79.169us 56.82% 79.169us 13.195us 6 + _rotary_dba7d1e::apply_rotary 1.75% 39.570us 3.52% 79.562us 13.260us 60.160us 43.18% 60.160us 10.027us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 60.160us 43.18% 60.160us 10.027us 6 + Activity Buffer Request 74.82% 1.693ms 74.82% 1.693ms 1.693ms 23.713us 17.02% 23.713us 23.713us 1 + aten::empty_strided 1.35% 30.491us 1.35% 30.491us 5.082us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 8.17% 185.005us 8.17% 185.005us 30.834us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.35% 30.551us 1.70% 38.372us 3.198us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.35% 7.821us 0.35% 7.821us 0.652us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 1.77% 39.992us 1.77% 39.992us 6.665us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.24% 5.501us 0.24% 5.501us 5.501us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.300ms -Self CUDA time total: 138.911us +Self CPU time total: 2.263ms +Self CUDA time total: 139.329us @@ -4295,23 +4295,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 6.47% 175.111us 88.03% 2.384ms 2.384ms 0.000us 0.00% 770.847us 770.847us 1 - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 713.695us 101.13% 713.695us 713.695us 1 - aten::clone 1.03% 27.812us 76.48% 2.071ms 345.215us 0.000us 0.00% 568.671us 94.778us 6 - aten::copy_ 1.35% 36.632us 74.30% 2.012ms 335.367us 503.551us 71.35% 568.671us 94.778us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 503.551us 71.35% 503.551us 83.925us 6 - _rotary_dba7d1e::apply_rotary 1.88% 50.972us 3.57% 96.775us 16.129us 202.176us 28.65% 202.176us 33.696us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 202.176us 28.65% 202.176us 33.696us 6 - Activity Buffer Request 65.43% 1.772ms 65.43% 1.772ms 1.772ms 65.120us 9.23% 65.120us 65.120us 1 - aten::empty_strided 1.15% 31.280us 1.15% 31.280us 5.213us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 7.52% 203.605us 7.52% 203.605us 33.934us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.18% 32.050us 1.51% 41.000us 3.417us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.33% 8.950us 0.33% 8.950us 0.746us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 1.69% 45.803us 1.69% 45.803us 7.634us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 11.97% 324.077us 11.97% 324.077us 324.077us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 6.40% 173.445us 87.62% 2.374ms 2.374ms 0.000us 0.00% 766.307us 766.307us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 707.554us 101.09% 707.554us 707.554us 1 + aten::clone 0.97% 26.320us 76.46% 2.071ms 345.229us 0.000us 0.00% 566.210us 94.368us 6 + aten::copy_ 1.24% 33.572us 74.40% 2.016ms 335.956us 499.842us 71.41% 566.210us 94.368us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 499.842us 71.41% 499.842us 83.307us 6 + _rotary_dba7d1e::apply_rotary 1.84% 49.730us 3.34% 90.461us 15.077us 200.097us 28.59% 200.097us 33.350us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 200.097us 28.59% 200.097us 33.350us 6 + Activity Buffer Request 66.43% 1.800ms 66.43% 1.800ms 1.800ms 66.368us 9.48% 66.368us 66.368us 1 + aten::empty_strided 1.08% 29.321us 1.08% 29.321us 4.887us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 6.73% 182.313us 6.73% 182.313us 30.385us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.12% 30.440us 1.42% 38.441us 3.203us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.30% 8.001us 0.30% 8.001us 0.667us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 1.50% 40.731us 1.50% 40.731us 6.788us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 12.38% 335.537us 12.38% 335.537us 335.537us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.708ms -Self CUDA time total: 705.727us +Self CPU time total: 2.709ms +Self CUDA time total: 699.939us @@ -4321,23 +4321,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 336.477us 1265.33% 336.477us 336.477us 1 - hf_kernels_rotary 7.39% 171.995us 99.77% 2.322ms 2.322ms 0.000us 0.00% 27.904us 27.904us 1 - _rotary_dba7d1e::apply_rotary 1.76% 41.061us 3.57% 83.061us 13.844us 18.816us 70.76% 18.816us 3.136us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 18.816us 70.76% 18.816us 3.136us 6 - aten::clone 1.24% 28.803us 87.11% 2.027ms 337.843us 0.000us 0.00% 9.088us 1.515us 6 - aten::copy_ 1.64% 38.082us 84.50% 1.966ms 327.735us 7.776us 29.24% 9.088us 1.515us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.776us 29.24% 7.776us 1.296us 6 - Activity Buffer Request 72.85% 1.695ms 72.85% 1.695ms 1.695ms 1.312us 4.93% 1.312us 1.312us 1 - aten::empty_strided 1.37% 31.849us 1.37% 31.849us 5.308us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 10.01% 232.925us 10.01% 232.925us 38.821us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.33% 30.872us 1.71% 39.700us 3.308us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.38% 8.828us 0.38% 8.828us 0.736us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 1.80% 42.000us 1.80% 42.000us 7.000us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.23% 5.320us 0.23% 5.320us 5.320us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 326.335us 1221.36% 326.335us 326.335us 1 + hf_kernels_rotary 7.45% 166.364us 99.75% 2.229ms 2.229ms 0.000us 0.00% 28.063us 28.063us 1 + _rotary_dba7d1e::apply_rotary 1.91% 42.732us 3.76% 83.973us 13.996us 18.815us 70.42% 18.815us 3.136us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 18.815us 70.42% 18.815us 3.136us 6 + aten::clone 1.19% 26.589us 86.78% 1.939ms 323.190us 0.000us 0.00% 9.248us 1.541us 6 + aten::copy_ 1.45% 32.469us 84.19% 1.881ms 313.518us 7.904us 29.58% 9.248us 1.541us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.904us 29.58% 7.904us 1.317us 6 + Activity Buffer Request 74.65% 1.668ms 74.65% 1.668ms 1.668ms 1.344us 5.03% 1.344us 1.344us 1 + aten::empty_strided 1.41% 31.442us 1.41% 31.442us 5.240us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 8.08% 180.586us 8.08% 180.586us 30.098us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.40% 31.310us 1.76% 39.389us 3.282us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.36% 8.079us 0.36% 8.079us 0.673us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 1.85% 41.241us 1.85% 41.241us 6.873us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.25% 5.600us 0.25% 5.600us 5.600us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.327ms -Self CUDA time total: 26.592us +Self CPU time total: 2.234ms +Self CUDA time total: 26.719us @@ -4347,23 +4347,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 361.593us 1356.47% 361.593us 361.593us 1 - hf_kernels_rotary 20.01% 156.574us 99.29% 776.889us 776.889us 0.000us 0.00% 27.969us 27.969us 1 - _rotary_dba7d1e::apply_rotary 6.08% 47.572us 11.90% 93.114us 15.519us 18.881us 70.83% 18.881us 3.147us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 18.881us 70.83% 18.881us 3.147us 6 - aten::clone 2.86% 22.401us 61.73% 483.012us 80.502us 0.000us 0.00% 9.088us 1.515us 6 - aten::copy_ 5.02% 39.248us 54.77% 428.540us 71.423us 7.776us 29.17% 9.088us 1.515us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.776us 29.17% 7.776us 1.296us 6 - Activity Buffer Request 23.43% 183.305us 23.43% 183.305us 183.305us 1.312us 4.92% 1.312us 1.312us 1 - aten::empty_strided 4.10% 32.071us 4.10% 32.071us 5.345us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 26.33% 205.987us 26.33% 205.987us 34.331us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.44% 34.709us 5.65% 44.189us 3.682us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.21% 9.480us 1.21% 9.480us 0.790us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 5.82% 45.542us 5.82% 45.542us 7.590us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.71% 5.560us 0.71% 5.560us 5.560us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 321.948us 1206.29% 321.948us 321.948us 1 + hf_kernels_rotary 18.26% 148.741us 99.36% 809.528us 809.528us 0.000us 0.00% 28.001us 28.001us 1 + _rotary_dba7d1e::apply_rotary 5.08% 41.412us 10.07% 82.013us 13.669us 18.880us 70.74% 18.880us 3.147us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 18.880us 70.74% 18.880us 3.147us 6 + aten::clone 2.30% 18.723us 66.05% 538.143us 89.691us 0.000us 0.00% 9.121us 1.520us 6 + aten::copy_ 4.35% 35.479us 60.21% 490.570us 81.762us 7.809us 29.26% 9.121us 1.520us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.809us 29.26% 7.809us 1.301us 6 + Activity Buffer Request 33.00% 268.886us 33.00% 268.886us 268.886us 1.312us 4.92% 1.312us 1.312us 1 + aten::empty_strided 3.54% 28.850us 3.54% 28.850us 4.808us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 22.85% 186.205us 22.85% 186.205us 31.034us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.00% 32.551us 4.99% 40.631us 3.386us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.99% 8.080us 0.99% 8.080us 0.673us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 4.98% 40.601us 4.98% 40.601us 6.767us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.64% 5.220us 0.64% 5.220us 5.220us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 782.449us -Self CUDA time total: 26.657us +Self CPU time total: 814.748us +Self CUDA time total: 26.689us @@ -4373,23 +4373,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 329.530us 1073.84% 329.530us 329.530us 1 - hf_kernels_rotary 18.69% 150.673us 99.29% 800.610us 800.610us 0.000us 0.00% 32.447us 32.447us 1 - _rotary_dba7d1e::apply_rotary 5.09% 41.061us 10.37% 83.622us 13.937us 20.159us 65.69% 20.159us 3.360us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 20.159us 65.69% 20.159us 3.360us 6 - aten::clone 2.45% 19.762us 65.24% 526.013us 87.669us 0.000us 0.00% 12.288us 2.048us 6 - aten::copy_ 4.31% 34.749us 58.96% 475.401us 79.234us 10.528us 34.31% 12.288us 2.048us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.528us 34.31% 10.528us 1.755us 6 - Activity Buffer Request 29.87% 240.876us 29.87% 240.876us 240.876us 1.760us 5.74% 1.760us 1.760us 1 - aten::empty_strided 3.83% 30.850us 3.83% 30.850us 5.142us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 24.78% 199.776us 24.78% 199.776us 33.296us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 3.92% 31.582us 5.00% 40.302us 3.358us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.08% 8.720us 1.08% 8.720us 0.727us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 5.28% 42.561us 5.28% 42.561us 7.094us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.71% 5.701us 0.71% 5.701us 5.701us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 317.754us 1035.40% 317.754us 317.754us 1 + hf_kernels_rotary 20.08% 146.926us 99.34% 726.766us 726.766us 0.000us 0.00% 32.481us 32.481us 1 + _rotary_dba7d1e::apply_rotary 5.67% 41.461us 11.01% 80.552us 13.425us 20.192us 65.80% 20.192us 3.365us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 20.192us 65.80% 20.192us 3.365us 6 + aten::clone 2.73% 19.937us 62.86% 459.858us 76.643us 0.000us 0.00% 12.289us 2.048us 6 + aten::copy_ 4.61% 33.700us 56.15% 410.789us 68.465us 10.497us 34.20% 12.289us 2.048us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.497us 34.20% 10.497us 1.750us 6 + Activity Buffer Request 26.92% 196.965us 26.92% 196.965us 196.965us 1.792us 5.84% 1.792us 1.792us 1 + aten::empty_strided 3.98% 29.132us 3.98% 29.132us 4.855us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 24.62% 180.124us 24.62% 180.124us 30.021us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.28% 31.330us 5.39% 39.430us 3.286us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 1.11% 8.100us 1.11% 8.100us 0.675us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 5.34% 39.091us 5.34% 39.091us 6.515us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.66% 4.850us 0.66% 4.850us 4.850us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 806.311us -Self CUDA time total: 30.687us +Self CPU time total: 731.616us +Self CUDA time total: 30.689us @@ -4399,23 +4399,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 329.435us 772.31% 329.435us 329.435us 1 - hf_kernels_rotary 20.10% 145.342us 99.21% 717.528us 717.528us 0.000us 0.00% 45.504us 45.504us 1 - _rotary_dba7d1e::apply_rotary 5.72% 41.400us 11.55% 83.552us 13.925us 25.568us 59.94% 25.568us 4.261us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 25.568us 59.94% 25.568us 4.261us 6 - aten::clone 2.77% 20.032us 61.99% 448.302us 74.717us 0.000us 0.00% 19.936us 3.323us 6 - aten::copy_ 4.93% 35.690us 55.04% 398.089us 66.348us 17.088us 40.06% 19.936us 3.323us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.088us 40.06% 17.088us 2.848us 6 - Activity Buffer Request 22.43% 162.214us 22.43% 162.214us 162.214us 2.848us 6.68% 2.848us 2.848us 1 - aten::empty_strided 4.17% 30.181us 4.17% 30.181us 5.030us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 27.68% 200.185us 27.68% 200.185us 33.364us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.37% 31.593us 5.58% 40.332us 3.361us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.21% 8.739us 1.21% 8.739us 0.728us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 5.83% 42.152us 5.83% 42.152us 7.025us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.79% 5.700us 0.79% 5.700us 5.700us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 319.429us 749.90% 319.429us 319.429us 1 + hf_kernels_rotary 19.19% 146.865us 99.27% 759.737us 759.737us 0.000us 0.00% 45.476us 45.476us 1 + _rotary_dba7d1e::apply_rotary 5.51% 42.192us 10.66% 81.562us 13.594us 25.698us 60.33% 25.698us 4.283us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 25.698us 60.33% 25.698us 4.283us 6 + aten::clone 2.51% 19.231us 64.35% 492.451us 82.075us 0.000us 0.00% 19.778us 3.296us 6 + aten::copy_ 4.10% 31.370us 57.94% 443.419us 73.903us 16.898us 39.67% 19.778us 3.296us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.898us 39.67% 16.898us 2.816us 6 + Activity Buffer Request 30.11% 230.405us 30.11% 230.405us 230.405us 2.880us 6.76% 2.880us 2.880us 1 + aten::empty_strided 3.89% 29.801us 3.89% 29.801us 4.967us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 23.73% 181.644us 23.73% 181.644us 30.274us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.99% 30.570us 5.08% 38.859us 3.238us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 1.08% 8.289us 1.08% 8.289us 0.691us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 5.14% 39.370us 5.14% 39.370us 6.562us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.73% 5.590us 0.73% 5.590us 5.590us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 723.228us -Self CUDA time total: 42.656us +Self CPU time total: 765.327us +Self CUDA time total: 42.596us @@ -4425,23 +4425,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 330.651us 1075.32% 330.651us 330.651us 1 - hf_kernels_rotary 18.72% 150.931us 99.33% 800.790us 800.790us 0.000us 0.00% 32.508us 32.508us 1 - _rotary_dba7d1e::apply_rotary 5.21% 41.984us 10.24% 82.545us 13.758us 20.318us 66.08% 20.318us 3.386us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 20.318us 66.08% 20.318us 3.386us 6 - aten::clone 2.51% 20.253us 65.40% 527.224us 87.871us 0.000us 0.00% 12.190us 2.032us 6 - aten::copy_ 4.39% 35.371us 59.06% 476.161us 79.360us 10.431us 33.92% 12.190us 2.032us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.431us 33.92% 10.431us 1.738us 6 - Activity Buffer Request 30.20% 243.496us 30.20% 243.496us 243.496us 1.759us 5.72% 1.759us 1.759us 1 - aten::empty_strided 3.82% 30.810us 3.82% 30.810us 5.135us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 24.47% 197.294us 24.47% 197.294us 32.882us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 3.87% 31.220us 4.97% 40.090us 3.341us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.10% 8.870us 1.10% 8.870us 0.739us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 5.03% 40.561us 5.03% 40.561us 6.760us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.67% 5.380us 0.67% 5.380us 5.380us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 322.049us 1047.28% 322.049us 322.049us 1 + hf_kernels_rotary 18.29% 149.504us 99.36% 812.348us 812.348us 0.000us 0.00% 32.511us 32.511us 1 + _rotary_dba7d1e::apply_rotary 5.08% 41.522us 10.02% 81.922us 13.654us 20.224us 65.77% 20.224us 3.371us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 20.224us 65.77% 20.224us 3.371us 6 + aten::clone 2.45% 20.060us 66.16% 540.872us 90.145us 0.000us 0.00% 12.287us 2.048us 6 + aten::copy_ 4.16% 33.972us 60.11% 491.422us 81.904us 10.527us 34.23% 12.287us 2.048us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.527us 34.23% 10.527us 1.754us 6 + Activity Buffer Request 32.85% 268.566us 32.85% 268.566us 268.566us 1.760us 5.72% 1.760us 1.760us 1 + aten::empty_strided 3.59% 29.390us 3.59% 29.390us 4.898us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 23.10% 188.884us 23.10% 188.884us 31.481us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.91% 31.930us 4.90% 40.050us 3.337us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.99% 8.120us 0.99% 8.120us 0.677us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 4.94% 40.400us 4.94% 40.400us 6.733us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.64% 5.230us 0.64% 5.230us 5.230us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 806.170us -Self CUDA time total: 30.749us +Self CPU time total: 817.578us +Self CUDA time total: 30.751us @@ -4451,23 +4451,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 330.616us 770.43% 330.616us 330.616us 1 - hf_kernels_rotary 17.99% 148.834us 99.35% 822.020us 822.020us 0.000us 0.00% 45.761us 45.761us 1 - _rotary_dba7d1e::apply_rotary 5.00% 41.361us 10.40% 86.012us 14.335us 25.857us 60.25% 25.857us 4.310us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 25.857us 60.25% 25.857us 4.310us 6 - aten::clone 2.40% 19.879us 66.22% 547.932us 91.322us 0.000us 0.00% 19.904us 3.317us 6 - aten::copy_ 4.29% 35.520us 60.10% 497.262us 82.877us 17.056us 39.75% 19.904us 3.317us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.056us 39.75% 17.056us 2.843us 6 - Activity Buffer Request 32.00% 264.767us 32.00% 264.767us 264.767us 2.848us 6.64% 2.848us 2.848us 1 - aten::empty_strided 3.72% 30.791us 3.72% 30.791us 5.132us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 23.81% 196.975us 23.81% 196.975us 32.829us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 3.76% 31.140us 4.74% 39.242us 3.270us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.98% 8.102us 0.98% 8.102us 0.675us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 5.40% 44.651us 5.40% 44.651us 7.442us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.65% 5.369us 0.65% 5.369us 5.369us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 316.538us 739.30% 316.538us 316.538us 1 + hf_kernels_rotary 19.03% 145.425us 99.27% 758.787us 758.787us 0.000us 0.00% 45.664us 45.664us 1 + _rotary_dba7d1e::apply_rotary 5.13% 39.179us 10.36% 79.161us 13.194us 25.792us 60.24% 25.792us 4.299us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 25.792us 60.24% 25.792us 4.299us 6 + aten::clone 2.60% 19.840us 64.72% 494.691us 82.449us 0.000us 0.00% 19.872us 3.312us 6 + aten::copy_ 4.32% 33.011us 58.33% 445.830us 74.305us 17.024us 39.76% 19.872us 3.312us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.024us 39.76% 17.024us 2.837us 6 + Activity Buffer Request 30.09% 230.025us 30.09% 230.025us 230.025us 2.848us 6.65% 2.848us 2.848us 1 + aten::empty_strided 3.80% 29.021us 3.80% 29.021us 4.837us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 23.92% 182.794us 23.92% 182.794us 30.466us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.11% 31.419us 5.17% 39.510us 3.293us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 1.06% 8.091us 1.06% 8.091us 0.674us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 5.23% 39.982us 5.23% 39.982us 6.664us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.73% 5.560us 0.73% 5.560us 5.560us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 827.389us -Self CUDA time total: 42.913us +Self CPU time total: 764.347us +Self CUDA time total: 42.816us @@ -4477,23 +4477,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 334.653us 358.15% 334.653us 334.653us 1 - hf_kernels_rotary 18.09% 151.114us 99.35% 829.780us 829.780us 0.000us 0.00% 109.215us 109.215us 1 - aten::clone 3.33% 27.839us 66.37% 554.323us 92.387us 0.000us 0.00% 68.064us 11.344us 6 - aten::copy_ 4.18% 34.911us 59.27% 495.081us 82.513us 52.288us 55.96% 68.064us 11.344us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 52.288us 55.96% 52.288us 8.715us 6 - _rotary_dba7d1e::apply_rotary 4.95% 41.342us 9.97% 83.303us 13.884us 41.151us 44.04% 41.151us 6.858us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 41.151us 44.04% 41.151us 6.858us 6 - Activity Buffer Request 31.64% 264.256us 31.64% 264.256us 264.256us 15.776us 16.88% 15.776us 15.776us 1 - aten::empty_strided 3.76% 31.403us 3.76% 31.403us 5.234us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 23.46% 195.914us 23.46% 195.914us 32.652us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 3.87% 32.310us 4.91% 41.040us 3.420us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.05% 8.730us 1.05% 8.730us 0.728us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 5.02% 41.961us 5.02% 41.961us 6.994us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.65% 5.470us 0.65% 5.470us 5.470us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 320.924us 345.95% 320.924us 320.924us 1 + hf_kernels_rotary 18.21% 146.024us 99.34% 796.707us 796.707us 0.000us 0.00% 108.351us 108.351us 1 + aten::clone 2.45% 19.620us 65.99% 529.232us 88.205us 0.000us 0.00% 67.071us 11.179us 6 + aten::copy_ 4.16% 33.361us 60.01% 481.261us 80.210us 51.487us 55.50% 67.071us 11.179us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 51.487us 55.50% 51.487us 8.581us 6 + _rotary_dba7d1e::apply_rotary 5.00% 40.060us 10.26% 82.291us 13.715us 41.280us 44.50% 41.280us 6.880us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 41.280us 44.50% 41.280us 6.880us 6 + Activity Buffer Request 33.32% 267.235us 33.32% 267.235us 267.235us 15.584us 16.80% 15.584us 15.584us 1 + aten::empty_strided 3.54% 28.351us 3.54% 28.351us 4.725us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 22.53% 180.665us 22.53% 180.665us 30.111us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.88% 31.108us 4.88% 39.160us 3.263us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 1.00% 8.052us 1.00% 8.052us 0.671us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 5.27% 42.231us 5.27% 42.231us 7.038us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.66% 5.290us 0.66% 5.290us 5.290us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 835.250us -Self CUDA time total: 93.439us +Self CPU time total: 801.997us +Self CUDA time total: 92.767us @@ -4503,23 +4503,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 402.293us 278.14% 402.293us 402.293us 1 - hf_kernels_rotary 18.23% 162.247us 99.42% 884.932us 884.932us 0.000us 0.00% 168.347us 168.347us 1 - aten::clone 2.56% 22.809us 61.35% 546.082us 91.014us 0.000us 0.00% 105.212us 17.535us 6 - aten::copy_ 3.90% 34.711us 55.16% 490.942us 81.824us 81.501us 56.35% 105.212us 17.535us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 81.501us 56.35% 81.501us 13.584us 6 - _rotary_dba7d1e::apply_rotary 4.99% 44.421us 15.06% 134.003us 22.334us 63.135us 43.65% 63.135us 10.522us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 63.135us 43.65% 63.135us 10.522us 6 - Activity Buffer Request 29.54% 262.907us 29.54% 262.907us 262.907us 23.711us 16.39% 23.711us 23.711us 1 - aten::empty_strided 3.63% 32.331us 3.63% 32.331us 5.388us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 21.72% 193.324us 21.72% 193.324us 32.221us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 3.75% 33.378us 4.79% 42.600us 3.550us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.04% 9.222us 1.04% 9.222us 0.769us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 10.06% 89.582us 10.06% 89.582us 14.930us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.58% 5.120us 0.58% 5.120us 5.120us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 328.056us 226.11% 328.056us 328.056us 1 + hf_kernels_rotary 19.08% 156.414us 99.34% 814.518us 814.518us 0.000us 0.00% 168.799us 168.799us 1 + aten::clone 3.19% 26.153us 65.07% 533.532us 88.922us 0.000us 0.00% 105.343us 17.557us 6 + aten::copy_ 3.98% 32.659us 58.36% 478.499us 79.750us 81.631us 56.26% 105.343us 17.557us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 81.631us 56.26% 81.631us 13.605us 6 + _rotary_dba7d1e::apply_rotary 5.18% 42.511us 10.36% 84.962us 14.160us 63.456us 43.74% 63.456us 10.576us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 63.456us 43.74% 63.456us 10.576us 6 + Activity Buffer Request 32.74% 268.426us 32.74% 268.426us 268.426us 23.712us 16.34% 23.712us 23.712us 1 + aten::empty_strided 3.52% 28.880us 3.52% 28.880us 4.813us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 21.64% 177.414us 21.64% 177.414us 29.569us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.88% 31.780us 4.83% 39.610us 3.301us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.95% 7.830us 0.95% 7.830us 0.652us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 5.18% 42.451us 5.18% 42.451us 7.075us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.66% 5.440us 0.66% 5.440us 5.440us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 890.052us -Self CUDA time total: 144.636us +Self CPU time total: 819.958us +Self CUDA time total: 145.087us @@ -4529,23 +4529,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 335.030us 423.20% 335.030us 335.030us 1 - hf_kernels_rotary 17.60% 149.462us 99.35% 843.781us 843.781us 0.000us 0.00% 89.342us 89.342us 1 - aten::clone 2.76% 23.471us 67.14% 570.174us 95.029us 0.000us 0.00% 47.328us 7.888us 6 - aten::copy_ 4.78% 40.570us 60.71% 515.622us 85.937us 37.152us 46.93% 47.328us 7.888us 6 - _rotary_dba7d1e::apply_rotary 4.86% 41.293us 9.90% 84.043us 14.007us 42.014us 53.07% 42.014us 7.002us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 42.014us 53.07% 42.014us 7.002us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 37.152us 46.93% 37.152us 6.192us 6 - Activity Buffer Request 33.04% 280.637us 33.04% 280.637us 280.637us 10.176us 12.85% 10.176us 10.176us 1 - aten::empty_strided 3.66% 31.081us 3.66% 31.081us 5.180us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 22.89% 194.415us 22.89% 194.415us 32.403us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 3.65% 30.971us 4.72% 40.102us 3.342us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.08% 9.131us 1.08% 9.131us 0.761us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 5.03% 42.750us 5.03% 42.750us 7.125us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.65% 5.500us 0.65% 5.500us 5.500us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 333.210us 419.37% 333.210us 333.210us 1 + hf_kernels_rotary 17.73% 150.595us 99.35% 844.049us 844.049us 0.000us 0.00% 89.630us 89.630us 1 + aten::clone 2.52% 21.432us 67.08% 569.832us 94.972us 0.000us 0.00% 47.583us 7.930us 6 + aten::copy_ 3.93% 33.381us 61.05% 518.601us 86.433us 37.407us 47.08% 47.583us 7.930us 6 + _rotary_dba7d1e::apply_rotary 4.77% 40.491us 9.89% 84.021us 14.004us 42.047us 52.92% 42.047us 7.008us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 42.047us 52.92% 42.047us 7.008us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 37.407us 47.08% 37.407us 6.235us 6 + Activity Buffer Request 36.09% 306.637us 36.09% 306.637us 306.637us 10.176us 12.81% 10.176us 10.176us 1 + aten::empty_strided 3.51% 29.799us 3.51% 29.799us 4.966us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 21.02% 178.583us 21.02% 178.583us 29.764us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.71% 31.542us 4.66% 39.601us 3.300us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.95% 8.059us 0.95% 8.059us 0.672us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 5.12% 43.530us 5.12% 43.530us 7.255us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.65% 5.480us 0.65% 5.480us 5.480us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 849.281us -Self CUDA time total: 79.166us +Self CPU time total: 849.529us +Self CUDA time total: 79.454us @@ -4555,23 +4555,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 335.451us 231.31% 335.451us 335.451us 1 - hf_kernels_rotary 20.10% 149.473us 99.28% 738.378us 738.378us 0.000us 0.00% 168.637us 168.637us 1 - aten::clone 2.71% 20.190us 62.55% 465.160us 77.527us 0.000us 0.00% 104.892us 17.482us 6 - aten::copy_ 4.80% 35.671us 55.64% 413.780us 68.963us 81.277us 56.04% 104.892us 17.482us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 81.277us 56.04% 81.277us 13.546us 6 - _rotary_dba7d1e::apply_rotary 5.48% 40.762us 11.33% 84.273us 14.046us 63.745us 43.96% 63.745us 10.624us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 63.745us 43.96% 63.745us 10.624us 6 - Activity Buffer Request 25.22% 187.575us 25.22% 187.575us 187.575us 23.615us 16.28% 23.615us 23.615us 1 - aten::empty_strided 4.19% 31.190us 4.19% 31.190us 5.198us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 25.62% 190.534us 25.62% 190.534us 31.756us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.11% 30.580us 5.31% 39.472us 3.289us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.20% 8.892us 1.20% 8.892us 0.741us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 5.85% 43.511us 5.85% 43.511us 7.252us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.72% 5.340us 0.72% 5.340us 5.340us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 324.316us 222.02% 324.316us 324.316us 1 + hf_kernels_rotary 18.22% 148.243us 99.33% 808.388us 808.388us 0.000us 0.00% 169.820us 169.820us 1 + aten::clone 2.32% 18.910us 66.43% 540.572us 90.095us 0.000us 0.00% 105.630us 17.605us 6 + aten::copy_ 4.07% 33.161us 60.40% 491.511us 81.919us 81.886us 56.06% 105.630us 17.605us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 81.886us 56.06% 81.886us 13.648us 6 + _rotary_dba7d1e::apply_rotary 4.96% 40.400us 10.02% 81.542us 13.590us 64.190us 43.94% 64.190us 10.698us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 64.190us 43.94% 64.190us 10.698us 6 + Activity Buffer Request 33.97% 276.476us 33.97% 276.476us 276.476us 23.744us 16.25% 23.744us 23.744us 1 + aten::empty_strided 3.70% 30.151us 3.70% 30.151us 5.025us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 22.35% 181.874us 22.35% 181.874us 30.312us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.71% 30.201us 4.67% 38.031us 3.169us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.96% 7.830us 0.96% 7.830us 0.652us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 5.06% 41.142us 5.06% 41.142us 6.857us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.67% 5.420us 0.67% 5.420us 5.420us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 743.718us -Self CUDA time total: 145.022us +Self CPU time total: 813.808us +Self CUDA time total: 146.076us @@ -4581,23 +4581,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 13.47% 148.469us 71.31% 785.819us 785.819us 0.000us 0.00% 741.458us 741.458us 1 - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 683.092us 101.20% 683.092us 683.092us 1 - aten::clone 1.91% 21.000us 46.53% 512.672us 85.445us 0.000us 0.00% 558.390us 93.065us 6 - aten::copy_ 3.10% 34.171us 41.83% 460.962us 76.827us 491.927us 72.88% 558.390us 93.065us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 491.927us 72.88% 491.927us 81.988us 6 - _rotary_dba7d1e::apply_rotary 3.81% 41.992us 7.74% 85.293us 14.215us 183.068us 27.12% 183.068us 30.511us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 183.068us 27.12% 183.068us 30.511us 6 - Activity Buffer Request 21.20% 233.636us 21.20% 233.636us 233.636us 66.463us 9.85% 66.463us 66.463us 1 - aten::empty_strided 2.79% 30.710us 2.79% 30.710us 5.118us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 17.53% 193.155us 17.53% 193.155us 32.192us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.79% 30.724us 3.57% 39.385us 3.282us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.79% 8.661us 0.79% 8.661us 0.722us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 3.93% 43.301us 3.93% 43.301us 7.217us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 28.69% 316.097us 28.69% 316.097us 316.097us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 13.82% 153.005us 74.75% 827.548us 827.548us 0.000us 0.00% 740.918us 740.918us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 682.102us 101.12% 682.102us 682.102us 1 + aten::clone 1.78% 19.750us 46.80% 518.112us 86.352us 0.000us 0.00% 558.489us 93.082us 6 + aten::copy_ 3.14% 34.772us 42.33% 468.681us 78.114us 492.121us 72.96% 558.489us 93.082us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 492.121us 72.96% 492.121us 82.020us 6 + _rotary_dba7d1e::apply_rotary 3.85% 42.609us 7.73% 85.591us 14.265us 182.429us 27.04% 182.429us 30.405us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 182.429us 27.04% 182.429us 30.405us 6 + Activity Buffer Request 22.69% 251.255us 22.69% 251.255us 251.255us 66.368us 9.84% 66.368us 66.368us 1 + aten::empty_strided 2.68% 29.681us 2.68% 29.681us 4.947us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 16.50% 182.654us 16.50% 182.654us 30.442us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 5.62% 62.189us 6.40% 70.840us 5.903us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.78% 8.651us 0.78% 8.651us 0.721us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 3.88% 42.982us 3.88% 42.982us 7.164us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 25.25% 279.606us 25.25% 279.606us 279.606us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.102ms -Self CUDA time total: 674.995us +Self CPU time total: 1.107ms +Self CUDA time total: 674.550us @@ -4607,30 +4607,30 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 5.24% 147.368us 25.60% 720.668us 720.668us 0.000us 0.00% 2.633ms 2.633ms 1 - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 2.458ms 100.33% 2.458ms 2.458ms 1 - aten::clone 0.73% 20.601us 15.98% 449.780us 74.963us 0.000us 0.00% 1.394ms 232.377us 6 - aten::copy_ 1.20% 33.732us 14.16% 398.690us 66.448us 1.211ms 49.43% 1.394ms 232.377us 6 - _rotary_dba7d1e::apply_rotary 1.46% 41.000us 3.02% 85.091us 14.182us 1.239ms 50.57% 1.239ms 206.500us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 1.239ms 50.57% 1.239ms 206.500us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 1.211ms 49.43% 1.211ms 201.812us 6 - Activity Buffer Request 6.06% 170.604us 6.06% 170.604us 170.604us 183.391us 7.49% 183.391us 183.391us 1 - aten::empty_strided 1.08% 30.489us 1.08% 30.489us 5.081us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 6.90% 194.354us 6.90% 194.354us 32.392us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.07% 30.231us 1.37% 38.429us 3.202us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.29% 8.198us 0.29% 8.198us 0.683us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 1.57% 44.091us 1.57% 44.091us 7.349us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 74.40% 2.094ms 74.40% 2.094ms 2.094ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 5.09% 145.551us 26.40% 755.006us 755.006us 0.000us 0.00% 2.627ms 2.627ms 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 2.456ms 100.32% 2.456ms 2.456ms 1 + aten::clone 0.65% 18.699us 17.06% 488.000us 81.333us 0.000us 0.00% 1.401ms 233.479us 6 + aten::copy_ 1.14% 32.589us 15.40% 440.469us 73.412us 1.223ms 49.95% 1.401ms 233.479us 6 + _rotary_dba7d1e::apply_rotary 1.42% 40.751us 2.92% 83.492us 13.915us 1.226ms 50.05% 1.226ms 204.274us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 1.226ms 50.05% 1.226ms 204.274us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 1.223ms 49.95% 1.223ms 203.836us 6 + Activity Buffer Request 7.97% 227.955us 7.97% 227.955us 227.955us 177.858us 7.26% 177.858us 177.858us 1 + aten::empty_strided 1.01% 28.832us 1.01% 28.832us 4.805us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 6.29% 179.925us 6.29% 179.925us 29.987us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.05% 30.104us 1.33% 37.963us 3.164us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.27% 7.859us 0.27% 7.859us 0.655us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 1.49% 42.741us 1.49% 42.741us 7.124us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 73.60% 2.105ms 73.60% 2.105ms 2.105ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.815ms -Self CUDA time total: 2.450ms +Self CPU time total: 2.860ms +Self CUDA time total: 2.449ms impl wl p50(ms) ok hf_kernels_rotary cuda_B1_S128_H32_D128_R64 0.09 True hf_kernels_rotary cuda_B1_S128_H32_D64_R32 0.09 True hf_kernels_rotary cuda_B1_S128_H8_D128_R64 0.09 True -hf_kernels_rotary cuda_B1_S128_H8_D64_R32 0.08 True +hf_kernels_rotary cuda_B1_S128_H8_D64_R32 0.07 True hf_kernels_rotary cuda_B1_S2048_H32_D128_R64 0.26 True hf_kernels_rotary cuda_B1_S2048_H32_D64_R32 0.09 True hf_kernels_rotary cuda_B1_S2048_H8_D128_R64 0.09 True @@ -4655,13 +4655,13 @@ hf_kernels_rotary cuda_B2_S512_H8_D64_R32 0.09 True-▶ UV Install LogsFetching 5 files: 0%| | 0/5 [00:00<?, ?it/s]Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads. - -Fetching 5 files: 40%|████ | 2/5 [00:01<00:02, 1.08it/s] -Fetching 5 files: 100%|██████████| 5/5 [00:01<00:00, 2.71it/s]+Fetching 5 files: 0%| | 0/5 [00:00<?, ?it/s] +Fetching 5 files: 60%|██████ | 3/5 [00:00<00:00, 16.43it/s] +Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 12.18it/s] +Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 12.77it/s]Artifacts:
rotary.jsonl