diff --git "a/rotary/impls/hf_kernels_rotary.html" "b/rotary/impls/hf_kernels_rotary.html" --- "a/rotary/impls/hf_kernels_rotary.html" +++ "b/rotary/impls/hf_kernels_rotary.html" @@ -3905,7 +3905,7 @@ Cell: nv | 0.25s
Fri Dec 19 18:55:27 2025 +Fri Dec 19 19:54:55 2025 +-----------------------------------------------------------------------------------------+ | NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 | +-----------------------------------------+------------------------+----------------------+ @@ -3914,7 +3914,7 @@ Cell: nv | 0.25s | | | MIG M. | |=========================================+========================+======================| | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 | -| N/A 28C P0 85W / 350W | 0MiB / 46068MiB | 34% Default | +| N/A 36C P0 90W / 350W | 0MiB / 46068MiB | 17% Default | | | | N/A | +-----------------------------------------+------------------------+----------------------+ @@ -3938,7 +3938,7 @@ Cell: nv | 0.25s ▼ output ▶ uv-logs | -Cell: benchmark | 34.01s +Cell: benchmark | 6.21s | Raw @@ -4009,23 +4009,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 443.904us 1908.20% 443.904us 443.904us 1 - hf_kernels_rotary 5.70% 250.202us 82.78% 3.636ms 3.636ms 0.000us 0.00% 24.543us 24.543us 1 - _rotary_dba7d1e::apply_rotary 1.26% 55.422us 2.46% 108.103us 18.017us 16.158us 69.46% 16.158us 2.693us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 16.158us 69.46% 16.158us 2.693us 6 - aten::clone 1.27% 55.831us 73.22% 3.217ms 536.122us 0.000us 0.00% 8.385us 1.398us 6 - aten::copy_ 1.34% 58.721us 52.83% 2.321ms 386.805us 7.105us 30.54% 8.385us 1.398us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.105us 30.54% 7.105us 1.184us 6 - Activity Buffer Request 49.67% 2.182ms 49.67% 2.182ms 2.182ms 1.280us 5.50% 1.280us 1.280us 1 - aten::empty_strided 19.12% 840.074us 19.12% 840.074us 140.012us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 1.82% 80.012us 1.82% 80.012us 13.335us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.10% 48.201us 1.40% 61.410us 5.118us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.30% 13.209us 0.30% 13.209us 1.101us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 1.20% 52.681us 1.20% 52.681us 8.780us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 17.22% 756.622us 17.22% 756.622us 756.622us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 397.918us 1712.80% 397.918us 397.918us 1 + hf_kernels_rotary 10.09% 234.566us 99.43% 2.310ms 2.310ms 0.000us 0.00% 24.512us 24.512us 1 + _rotary_dba7d1e::apply_rotary 2.29% 53.121us 4.37% 101.432us 16.905us 16.192us 69.70% 16.192us 2.699us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 16.192us 69.70% 16.192us 2.699us 6 + aten::clone 1.67% 38.889us 82.80% 1.924ms 320.673us 0.000us 0.00% 8.320us 1.387us 6 + aten::copy_ 1.71% 39.649us 78.85% 1.832ms 305.366us 7.040us 30.30% 8.320us 1.387us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.040us 30.30% 7.040us 1.173us 6 + Activity Buffer Request 74.05% 1.721ms 74.05% 1.721ms 1.721ms 1.280us 5.51% 1.280us 1.280us 1 + aten::empty_strided 2.28% 52.952us 2.28% 52.952us 8.825us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 3.09% 71.834us 3.09% 71.834us 11.972us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.69% 39.231us 2.17% 50.432us 4.203us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.48% 11.201us 0.48% 11.201us 0.933us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 2.08% 48.311us 2.08% 48.311us 8.052us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.57% 13.240us 0.57% 13.240us 13.240us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 4.393ms -Self CUDA time total: 23.263us +Self CPU time total: 2.324ms +Self CUDA time total: 23.232us @@ -4035,23 +4035,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 319.740us 1332.31% 319.740us 319.740us 1 - hf_kernels_rotary 7.66% 161.733us 99.74% 2.106ms 2.106ms 0.000us 0.00% 25.311us 25.311us 1 - _rotary_dba7d1e::apply_rotary 1.93% 40.672us 3.87% 81.612us 13.602us 16.127us 67.20% 16.127us 2.688us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 16.127us 67.20% 16.127us 2.688us 6 - aten::clone 0.96% 20.317us 86.36% 1.823ms 303.883us 0.000us 0.00% 9.184us 1.531us 6 - aten::copy_ 1.65% 34.881us 83.94% 1.772ms 295.377us 7.872us 32.80% 9.184us 1.531us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.872us 32.80% 7.872us 1.312us 6 - Activity Buffer Request 79.76% 1.684ms 79.76% 1.684ms 1.684ms 1.312us 5.47% 1.312us 1.312us 1 - aten::empty_strided 1.46% 30.722us 1.46% 30.722us 5.120us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 2.53% 53.351us 2.53% 53.351us 8.892us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.45% 30.570us 1.85% 39.091us 3.258us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.40% 8.521us 0.40% 8.521us 0.710us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 1.94% 40.940us 1.94% 40.940us 6.823us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.26% 5.581us 0.26% 5.581us 5.581us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 351.006us 1460.58% 351.006us 351.006us 1 + hf_kernels_rotary 8.50% 188.914us 99.67% 2.215ms 2.215ms 0.000us 0.00% 25.344us 25.344us 1 + _rotary_dba7d1e::apply_rotary 1.92% 42.653us 3.86% 85.804us 14.301us 16.160us 67.24% 16.160us 2.693us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 16.160us 67.24% 16.160us 2.693us 6 + aten::clone 1.34% 29.789us 85.47% 1.899ms 316.566us 0.000us 0.00% 9.184us 1.531us 6 + aten::copy_ 1.59% 35.393us 82.63% 1.836ms 306.029us 7.872us 32.76% 9.184us 1.531us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.872us 32.76% 7.872us 1.312us 6 + Activity Buffer Request 78.48% 1.744ms 78.48% 1.744ms 1.744ms 1.312us 5.46% 1.312us 1.312us 1 + aten::empty_strided 1.50% 33.432us 1.50% 33.432us 5.572us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 2.55% 56.710us 2.55% 56.710us 9.452us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.45% 32.280us 1.84% 40.820us 3.402us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.38% 8.540us 0.38% 8.540us 0.712us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 1.94% 43.151us 1.94% 43.151us 7.192us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.33% 7.360us 0.33% 7.360us 7.360us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.111ms -Self CUDA time total: 23.999us +Self CPU time total: 2.222ms +Self CUDA time total: 24.032us @@ -4061,23 +4061,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 318.846us 1316.19% 318.846us 318.846us 1 - hf_kernels_rotary 7.42% 158.343us 99.75% 2.128ms 2.128ms 0.000us 0.00% 25.505us 25.505us 1 - _rotary_dba7d1e::apply_rotary 1.98% 42.310us 3.93% 83.780us 13.963us 16.544us 68.29% 16.544us 2.757us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 16.544us 68.29% 16.544us 2.757us 6 - aten::clone 0.91% 19.412us 86.52% 1.845ms 307.560us 0.000us 0.00% 8.961us 1.493us 6 - aten::copy_ 1.63% 34.769us 84.21% 1.796ms 299.358us 7.681us 31.71% 8.961us 1.493us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.681us 31.71% 7.681us 1.280us 6 - Activity Buffer Request 80.16% 1.710ms 80.16% 1.710ms 1.710ms 1.280us 5.28% 1.280us 1.280us 1 - aten::empty_strided 1.40% 29.800us 1.40% 29.800us 4.967us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 2.42% 51.552us 2.42% 51.552us 8.592us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.48% 31.501us 1.89% 40.211us 3.351us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.41% 8.710us 0.41% 8.710us 0.726us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 1.94% 41.470us 1.94% 41.470us 6.912us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.25% 5.300us 0.25% 5.300us 5.300us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 328.861us 1359.32% 328.861us 328.861us 1 + hf_kernels_rotary 7.82% 169.265us 99.74% 2.160ms 2.160ms 0.000us 0.00% 25.505us 25.505us 1 + _rotary_dba7d1e::apply_rotary 1.90% 41.240us 3.83% 83.032us 13.839us 16.449us 67.99% 16.449us 2.742us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 16.449us 67.99% 16.449us 2.742us 6 + aten::clone 1.22% 26.522us 86.28% 1.868ms 311.384us 0.000us 0.00% 9.056us 1.509us 6 + aten::copy_ 1.60% 34.652us 83.63% 1.811ms 301.836us 7.744us 32.01% 9.056us 1.509us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.744us 32.01% 7.744us 1.291us 6 + Activity Buffer Request 79.55% 1.723ms 79.55% 1.723ms 1.723ms 1.312us 5.42% 1.312us 1.312us 1 + aten::empty_strided 1.42% 30.770us 1.42% 30.770us 5.128us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 2.49% 53.840us 2.49% 53.840us 8.973us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.40% 30.337us 1.81% 39.289us 3.274us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.41% 8.952us 0.41% 8.952us 0.746us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 1.93% 41.792us 1.93% 41.792us 6.965us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.26% 5.540us 0.26% 5.540us 5.540us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.133ms -Self CUDA time total: 24.225us +Self CPU time total: 2.165ms +Self CUDA time total: 24.193us @@ -4087,23 +4087,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 321.313us 1144.93% 321.313us 321.313us 1 - hf_kernels_rotary 6.51% 155.543us 99.79% 2.386ms 2.386ms 0.000us 0.00% 29.792us 29.792us 1 - _rotary_dba7d1e::apply_rotary 1.75% 41.730us 3.43% 82.081us 13.680us 17.695us 63.05% 17.695us 2.949us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 17.695us 63.05% 17.695us 2.949us 6 - aten::clone 0.86% 20.620us 88.15% 2.107ms 351.189us 0.000us 0.00% 12.097us 2.016us 6 - aten::copy_ 1.42% 33.971us 86.01% 2.056ms 342.697us 10.369us 36.95% 12.097us 2.016us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.369us 36.95% 10.369us 1.728us 6 - Activity Buffer Request 74.19% 1.773ms 74.19% 1.773ms 1.773ms 1.728us 6.16% 1.728us 1.728us 1 - aten::empty_strided 1.27% 30.330us 1.27% 30.330us 5.055us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 10.41% 248.804us 10.41% 248.804us 41.467us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.33% 31.699us 1.70% 40.751us 3.396us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.38% 9.052us 0.38% 9.052us 0.754us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 1.69% 40.351us 1.69% 40.351us 6.725us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.21% 5.000us 0.21% 5.000us 5.000us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 332.000us 1184.36% 332.000us 332.000us 1 + hf_kernels_rotary 7.19% 171.403us 99.79% 2.378ms 2.378ms 0.000us 0.00% 29.792us 29.792us 1 + _rotary_dba7d1e::apply_rotary 1.72% 40.922us 3.47% 82.793us 13.799us 17.632us 62.90% 17.632us 2.939us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 17.632us 62.90% 17.632us 2.939us 6 + aten::clone 1.16% 27.640us 87.43% 2.084ms 347.303us 0.000us 0.00% 12.160us 2.027us 6 + aten::copy_ 1.42% 33.951us 84.95% 2.025ms 337.488us 10.400us 37.10% 12.160us 2.027us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.400us 37.10% 10.400us 1.733us 6 + Activity Buffer Request 73.90% 1.761ms 73.90% 1.761ms 1.761ms 1.760us 6.28% 1.760us 1.760us 1 + aten::empty_strided 1.31% 31.250us 1.31% 31.250us 5.208us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 9.63% 229.586us 9.63% 229.586us 38.264us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.31% 31.193us 1.70% 40.482us 3.373us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.39% 9.289us 0.39% 9.289us 0.774us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 1.76% 41.871us 1.76% 41.871us 6.979us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.21% 5.050us 0.21% 5.050us 5.050us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.391ms -Self CUDA time total: 28.064us +Self CPU time total: 2.384ms +Self CUDA time total: 28.032us @@ -4113,23 +4113,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 325.341us 1337.81% 325.341us 325.341us 1 - hf_kernels_rotary 6.82% 156.303us 99.76% 2.286ms 2.286ms 0.000us 0.00% 25.599us 25.599us 1 - _rotary_dba7d1e::apply_rotary 1.86% 42.600us 3.70% 84.711us 14.118us 16.512us 67.90% 16.512us 2.752us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 16.512us 67.90% 16.512us 2.752us 6 - aten::clone 0.89% 20.410us 87.49% 2.005ms 334.180us 0.000us 0.00% 9.087us 1.514us 6 - aten::copy_ 1.54% 35.239us 85.28% 1.954ms 325.734us 7.807us 32.10% 9.087us 1.514us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.807us 32.10% 7.807us 1.301us 6 - Activity Buffer Request 73.15% 1.676ms 73.15% 1.676ms 1.676ms 1.280us 5.26% 1.280us 1.280us 1 - aten::empty_strided 1.32% 30.271us 1.32% 30.271us 5.045us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 10.60% 242.835us 10.60% 242.835us 40.472us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.38% 31.561us 1.75% 40.080us 3.340us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.37% 8.519us 0.37% 8.519us 0.710us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 1.84% 42.111us 1.84% 42.111us 7.018us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.24% 5.490us 0.24% 5.490us 5.490us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 335.418us 1384.59% 335.418us 335.418us 1 + hf_kernels_rotary 20.26% 170.294us 99.34% 834.940us 834.940us 0.000us 0.00% 25.537us 25.537us 1 + _rotary_dba7d1e::apply_rotary 4.83% 40.562us 9.92% 83.412us 13.902us 16.513us 68.17% 16.513us 2.752us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 16.513us 68.17% 16.513us 2.752us 6 + aten::clone 2.64% 22.222us 64.45% 541.674us 90.279us 0.000us 0.00% 9.024us 1.504us 6 + aten::copy_ 4.18% 35.111us 57.94% 486.972us 81.162us 7.712us 31.83% 9.024us 1.504us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.712us 31.83% 7.712us 1.285us 6 + Activity Buffer Request 27.90% 234.506us 27.90% 234.506us 234.506us 1.312us 5.42% 1.312us 1.312us 1 + aten::empty_strided 3.86% 32.480us 3.86% 32.480us 5.413us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 25.86% 217.355us 25.86% 217.355us 36.226us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.66% 30.769us 4.71% 39.560us 3.297us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 1.05% 8.791us 1.05% 8.791us 0.733us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 5.10% 42.850us 5.10% 42.850us 7.142us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.66% 5.560us 0.66% 5.560us 5.560us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.292ms -Self CUDA time total: 24.319us +Self CPU time total: 840.500us +Self CUDA time total: 24.225us @@ -4139,23 +4139,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 323.675us 1148.27% 323.675us 323.675us 1 - hf_kernels_rotary 6.64% 156.971us 99.78% 2.359ms 2.359ms 0.000us 0.00% 29.948us 29.948us 1 - _rotary_dba7d1e::apply_rotary 1.69% 39.892us 3.46% 81.891us 13.648us 17.695us 62.77% 17.695us 2.949us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 17.695us 62.77% 17.695us 2.949us 6 - aten::clone 0.91% 21.443us 87.99% 2.080ms 346.632us 0.000us 0.00% 12.253us 2.042us 6 - aten::copy_ 1.41% 33.321us 85.79% 2.028ms 337.974us 10.493us 37.23% 12.253us 2.042us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.493us 37.23% 10.493us 1.749us 6 - Activity Buffer Request 74.45% 1.760ms 74.45% 1.760ms 1.760ms 1.760us 6.24% 1.760us 1.760us 1 - aten::empty_strided 1.29% 30.510us 1.29% 30.510us 5.085us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 9.92% 234.593us 9.92% 234.593us 39.099us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.33% 31.410us 1.69% 40.001us 3.333us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.36% 8.591us 0.36% 8.591us 0.716us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 1.78% 41.999us 1.78% 41.999us 7.000us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.22% 5.111us 0.22% 5.111us 5.111us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 335.419us 1187.11% 335.419us 335.419us 1 + hf_kernels_rotary 22.11% 159.484us 99.14% 715.038us 715.038us 0.000us 0.00% 30.047us 30.047us 1 + _rotary_dba7d1e::apply_rotary 6.01% 43.310us 12.22% 88.120us 14.687us 17.727us 62.74% 17.727us 2.954us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 17.727us 62.74% 17.727us 2.954us 6 + aten::clone 2.86% 20.618us 59.23% 427.160us 71.193us 0.000us 0.00% 12.320us 2.053us 6 + aten::copy_ 4.70% 33.871us 52.15% 376.129us 62.688us 10.528us 37.26% 12.320us 2.053us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.528us 37.26% 10.528us 1.755us 6 + Activity Buffer Request 17.96% 129.563us 17.96% 129.563us 129.563us 1.792us 6.34% 1.792us 1.792us 1 + aten::empty_strided 4.22% 30.413us 4.22% 30.413us 5.069us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 29.49% 212.695us 29.49% 212.695us 35.449us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.36% 31.443us 5.58% 40.274us 3.356us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 1.22% 8.831us 1.22% 8.831us 0.736us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 6.21% 44.810us 6.21% 44.810us 7.468us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.86% 6.181us 0.86% 6.181us 6.181us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.364ms -Self CUDA time total: 28.188us +Self CPU time total: 721.219us +Self CUDA time total: 28.255us @@ -4165,23 +4165,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 324.125us 795.09% 324.125us 324.125us 1 - hf_kernels_rotary 6.45% 153.782us 99.79% 2.380ms 2.380ms 0.000us 0.00% 43.614us 43.614us 1 - _rotary_dba7d1e::apply_rotary 1.81% 43.080us 3.50% 83.561us 13.927us 23.646us 58.00% 23.646us 3.941us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 23.646us 58.00% 23.646us 3.941us 6 - aten::clone 0.86% 20.470us 88.19% 2.103ms 350.526us 0.000us 0.00% 19.968us 3.328us 6 - aten::copy_ 1.45% 34.599us 85.96% 2.050ms 341.697us 17.120us 42.00% 19.968us 3.328us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.120us 42.00% 17.120us 2.853us 6 - Activity Buffer Request 74.86% 1.785ms 74.86% 1.785ms 1.785ms 2.848us 6.99% 2.848us 2.848us 1 - aten::empty_strided 1.36% 32.503us 1.36% 32.503us 5.417us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 9.65% 230.194us 9.65% 230.194us 38.366us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.31% 31.171us 1.65% 39.351us 3.279us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.34% 8.180us 0.34% 8.180us 0.682us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 1.70% 40.481us 1.70% 40.481us 6.747us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.21% 5.060us 0.21% 5.060us 5.060us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 339.104us 833.12% 339.104us 339.104us 1 + hf_kernels_rotary 7.67% 177.694us 99.76% 2.310ms 2.310ms 0.000us 0.00% 43.583us 43.583us 1 + _rotary_dba7d1e::apply_rotary 1.80% 41.651us 3.62% 83.803us 13.967us 23.520us 57.78% 23.520us 3.920us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 23.520us 57.78% 23.520us 3.920us 6 + aten::clone 1.20% 27.761us 86.70% 2.008ms 334.588us 0.000us 0.00% 20.063us 3.344us 6 + aten::copy_ 1.49% 34.550us 84.17% 1.949ms 324.808us 17.183us 42.22% 20.063us 3.344us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.183us 42.22% 17.183us 2.864us 6 + Activity Buffer Request 73.48% 1.701ms 73.48% 1.701ms 1.701ms 2.880us 7.08% 2.880us 2.880us 1 + aten::empty_strided 1.34% 30.920us 1.34% 30.920us 5.153us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 9.20% 212.976us 9.20% 212.976us 35.496us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.39% 32.120us 1.76% 40.721us 3.393us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.37% 8.601us 0.37% 8.601us 0.717us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 1.82% 42.152us 1.82% 42.152us 7.025us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.24% 5.660us 0.24% 5.660us 5.660us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.385ms -Self CUDA time total: 40.766us +Self CPU time total: 2.315ms +Self CUDA time total: 40.703us @@ -4191,23 +4191,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 326.719us 435.58% 326.719us 326.719us 1 - hf_kernels_rotary 6.86% 156.773us 99.78% 2.282ms 2.282ms 0.000us 0.00% 83.583us 83.583us 1 - aten::clone 0.90% 20.639us 87.67% 2.005ms 334.152us 0.000us 0.00% 44.415us 7.402us 6 - aten::copy_ 1.46% 33.361us 85.45% 1.954ms 325.685us 35.839us 47.78% 44.415us 7.402us 6 - _rotary_dba7d1e::apply_rotary 1.78% 40.770us 3.52% 80.530us 13.422us 39.168us 52.22% 39.168us 6.528us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 39.168us 52.22% 39.168us 6.528us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 35.839us 47.78% 35.839us 5.973us 6 - Activity Buffer Request 74.00% 1.692ms 74.00% 1.692ms 1.692ms 8.576us 11.43% 8.576us 8.576us 1 - aten::empty_strided 1.32% 30.160us 1.32% 30.160us 5.027us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 9.99% 228.424us 9.99% 228.424us 38.071us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.36% 31.101us 1.73% 39.632us 3.303us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.37% 8.531us 0.37% 8.531us 0.711us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 1.74% 39.760us 1.74% 39.760us 6.627us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.22% 5.030us 0.22% 5.030us 5.030us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 342.239us 459.79% 342.239us 342.239us 1 + hf_kernels_rotary 7.38% 176.732us 99.78% 2.390ms 2.390ms 0.000us 0.00% 82.913us 82.913us 1 + aten::clone 1.17% 28.130us 87.22% 2.089ms 348.177us 0.000us 0.00% 43.777us 7.296us 6 + aten::copy_ 1.46% 34.971us 84.72% 2.029ms 338.203us 35.297us 47.42% 43.777us 7.296us 6 + _rotary_dba7d1e::apply_rotary 1.71% 40.931us 3.50% 83.864us 13.977us 39.136us 52.58% 39.136us 6.523us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 39.136us 52.58% 39.136us 6.523us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 35.297us 47.42% 35.297us 5.883us 6 + Activity Buffer Request 74.20% 1.777ms 74.20% 1.777ms 1.777ms 8.480us 11.39% 8.480us 8.480us 1 + aten::empty_strided 1.32% 31.711us 1.32% 31.711us 5.285us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 9.06% 217.015us 9.06% 217.015us 36.169us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.30% 31.251us 1.69% 40.431us 3.369us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.38% 9.180us 0.38% 9.180us 0.765us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 1.79% 42.933us 1.79% 42.933us 7.155us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.22% 5.191us 0.22% 5.191us 5.191us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.287ms -Self CUDA time total: 75.007us +Self CPU time total: 2.395ms +Self CUDA time total: 74.433us @@ -4217,23 +4217,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 313.534us 775.13% 313.534us 313.534us 1 - hf_kernels_rotary 17.26% 147.172us 99.40% 847.604us 847.604us 0.000us 0.00% 43.297us 43.297us 1 - _rotary_dba7d1e::apply_rotary 4.54% 38.680us 9.23% 78.702us 13.117us 23.425us 57.91% 23.425us 3.904us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 23.425us 57.91% 23.425us 3.904us 6 - aten::clone 2.34% 19.949us 68.60% 584.900us 97.483us 0.000us 0.00% 19.872us 3.312us 6 - aten::copy_ 3.97% 33.871us 62.79% 535.360us 89.227us 17.024us 42.09% 19.872us 3.312us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.024us 42.09% 17.024us 2.837us 6 - Activity Buffer Request 32.38% 276.114us 32.38% 276.114us 276.114us 2.848us 7.04% 2.848us 2.848us 1 - aten::empty_strided 3.47% 29.591us 3.47% 29.591us 4.932us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 26.43% 225.375us 26.43% 225.375us 37.563us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 3.45% 29.390us 4.32% 36.830us 3.069us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.87% 7.440us 0.87% 7.440us 0.620us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 4.69% 40.022us 4.69% 40.022us 6.670us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.60% 5.080us 0.60% 5.080us 5.080us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 332.929us 819.86% 332.929us 332.929us 1 + hf_kernels_rotary 7.07% 166.848us 99.78% 2.353ms 2.353ms 0.000us 0.00% 43.488us 43.488us 1 + _rotary_dba7d1e::apply_rotary 1.68% 39.560us 3.51% 82.681us 13.780us 23.488us 57.84% 23.488us 3.915us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 23.488us 57.84% 23.488us 3.915us 6 + aten::clone 1.16% 27.340us 87.49% 2.064ms 343.923us 0.000us 0.00% 20.000us 3.333us 6 + aten::copy_ 1.51% 35.519us 85.00% 2.005ms 334.108us 17.120us 42.16% 20.000us 3.333us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.120us 42.16% 17.120us 2.853us 6 + Activity Buffer Request 74.50% 1.757ms 74.50% 1.757ms 1.757ms 2.880us 7.09% 2.880us 2.880us 1 + aten::empty_strided 1.34% 31.550us 1.34% 31.550us 5.258us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 8.99% 212.096us 8.99% 212.096us 35.349us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.35% 31.900us 1.71% 40.350us 3.362us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.36% 8.450us 0.36% 8.450us 0.704us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 1.83% 43.121us 1.83% 43.121us 7.187us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.22% 5.120us 0.22% 5.120us 5.120us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 852.684us -Self CUDA time total: 40.449us +Self CPU time total: 2.359ms +Self CUDA time total: 40.608us @@ -4243,23 +4243,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 317.820us 416.43% 317.820us 317.820us 1 - hf_kernels_rotary 17.56% 148.193us 99.41% 838.994us 838.994us 0.000us 0.00% 85.633us 85.633us 1 - aten::clone 2.33% 19.639us 67.72% 571.550us 95.258us 0.000us 0.00% 46.146us 7.691us 6 - aten::copy_ 4.11% 34.662us 61.88% 522.280us 87.047us 36.833us 48.26% 46.146us 7.691us 6 - _rotary_dba7d1e::apply_rotary 4.71% 39.770us 9.52% 80.371us 13.395us 39.487us 51.74% 39.487us 6.581us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 39.487us 51.74% 39.487us 6.581us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 36.833us 48.26% 36.833us 6.139us 6 - Activity Buffer Request 31.55% 266.304us 31.55% 266.304us 266.304us 9.313us 12.20% 9.313us 9.313us 1 - aten::empty_strided 3.51% 29.631us 3.51% 29.631us 4.938us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 26.22% 221.314us 26.22% 221.314us 36.886us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 3.67% 30.998us 4.61% 38.880us 3.240us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.93% 7.882us 0.93% 7.882us 0.657us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 4.81% 40.601us 4.81% 40.601us 6.767us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.59% 5.020us 0.59% 5.020us 5.020us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 338.142us 439.39% 338.142us 338.142us 1 + hf_kernels_rotary 7.05% 174.064us 99.80% 2.465ms 2.465ms 0.000us 0.00% 86.270us 86.270us 1 + aten::clone 1.20% 29.650us 87.84% 2.170ms 361.584us 0.000us 0.00% 47.071us 7.845us 6 + aten::copy_ 1.42% 34.959us 85.36% 2.108ms 351.395us 37.759us 49.06% 47.071us 7.845us 6 + _rotary_dba7d1e::apply_rotary 1.66% 41.022us 3.32% 82.043us 13.674us 39.199us 50.94% 39.199us 6.533us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 39.199us 50.94% 39.199us 6.533us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 37.759us 49.06% 37.759us 6.293us 6 + Activity Buffer Request 75.49% 1.864ms 75.49% 1.864ms 1.864ms 9.312us 12.10% 9.312us 9.312us 1 + aten::empty_strided 1.27% 31.482us 1.27% 31.482us 5.247us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 8.46% 208.956us 8.46% 208.956us 34.826us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.25% 30.771us 1.59% 39.290us 3.274us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.34% 8.519us 0.34% 8.519us 0.710us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 1.66% 41.021us 1.66% 41.021us 6.837us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.20% 5.010us 0.20% 5.010us 5.010us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 844.014us -Self CUDA time total: 76.320us +Self CPU time total: 2.470ms +Self CUDA time total: 76.958us @@ -4269,23 +4269,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 319.423us 229.00% 319.423us 319.423us 1 - hf_kernels_rotary 18.34% 146.203us 99.38% 792.173us 792.173us 0.000us 0.00% 163.169us 163.169us 1 - aten::clone 2.42% 19.310us 66.03% 526.340us 87.723us 0.000us 0.00% 102.753us 17.126us 6 - aten::copy_ 4.16% 33.151us 59.92% 477.629us 79.605us 79.073us 56.69% 102.753us 17.126us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 79.073us 56.69% 79.073us 13.179us 6 - _rotary_dba7d1e::apply_rotary 5.00% 39.859us 10.11% 80.600us 13.433us 60.416us 43.31% 60.416us 10.069us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 60.416us 43.31% 60.416us 10.069us 6 - Activity Buffer Request 28.37% 226.154us 28.37% 226.154us 226.154us 23.680us 16.98% 23.680us 23.680us 1 - aten::empty_strided 3.69% 29.401us 3.69% 29.401us 4.900us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 27.39% 218.324us 27.39% 218.324us 36.387us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 3.87% 30.870us 4.90% 39.030us 3.252us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.02% 8.160us 1.02% 8.160us 0.680us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 5.11% 40.741us 5.11% 40.741us 6.790us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.62% 4.920us 0.62% 4.920us 4.920us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 343.679us 247.41% 343.679us 343.679us 1 + hf_kernels_rotary 7.47% 171.684us 99.76% 2.294ms 2.294ms 0.000us 0.00% 162.559us 162.559us 1 + aten::clone 1.25% 28.681us 86.89% 1.998ms 333.015us 0.000us 0.00% 102.592us 17.099us 6 + aten::copy_ 1.49% 34.332us 84.20% 1.936ms 322.703us 78.944us 56.83% 102.592us 17.099us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 78.944us 56.83% 78.944us 13.157us 6 + _rotary_dba7d1e::apply_rotary 1.83% 42.151us 3.66% 84.183us 14.030us 59.967us 43.17% 59.967us 9.995us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 59.967us 43.17% 59.967us 9.995us 6 + Activity Buffer Request 73.81% 1.697ms 73.81% 1.697ms 1.697ms 23.648us 17.02% 23.648us 23.648us 1 + aten::empty_strided 1.44% 33.190us 1.44% 33.190us 5.532us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 8.89% 204.504us 8.89% 204.504us 34.084us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.37% 31.511us 1.74% 40.120us 3.343us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.37% 8.609us 0.37% 8.609us 0.717us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 1.83% 42.032us 1.83% 42.032us 7.005us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.24% 5.461us 0.24% 5.461us 5.461us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 797.093us -Self CUDA time total: 139.489us +Self CPU time total: 2.300ms +Self CUDA time total: 138.911us @@ -4295,23 +4295,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 12.16% 146.839us 70.39% 850.223us 850.223us 0.000us 0.00% 769.020us 769.020us 1 - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 710.970us 101.11% 710.970us 710.970us 1 - aten::clone 3.16% 38.139us 48.21% 582.349us 97.058us 0.000us 0.00% 567.581us 94.597us 6 - aten::copy_ 2.68% 32.339us 42.57% 514.168us 85.695us 501.725us 71.35% 567.581us 94.597us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 501.725us 71.35% 501.725us 83.621us 6 - _rotary_dba7d1e::apply_rotary 3.42% 41.301us 6.76% 81.602us 13.600us 201.439us 28.65% 201.439us 33.573us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 201.439us 28.65% 201.439us 33.573us 6 - Activity Buffer Request 21.65% 261.455us 21.65% 261.455us 261.455us 65.856us 9.37% 65.856us 65.856us 1 - aten::empty_strided 2.49% 30.042us 2.49% 30.042us 5.007us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 18.25% 220.374us 18.25% 220.374us 36.729us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.57% 30.999us 3.26% 39.433us 3.286us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.70% 8.434us 0.70% 8.434us 0.703us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 3.34% 40.301us 3.34% 40.301us 6.717us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 29.61% 357.615us 29.61% 357.615us 357.615us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 6.47% 175.111us 88.03% 2.384ms 2.384ms 0.000us 0.00% 770.847us 770.847us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 713.695us 101.13% 713.695us 713.695us 1 + aten::clone 1.03% 27.812us 76.48% 2.071ms 345.215us 0.000us 0.00% 568.671us 94.778us 6 + aten::copy_ 1.35% 36.632us 74.30% 2.012ms 335.367us 503.551us 71.35% 568.671us 94.778us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 503.551us 71.35% 503.551us 83.925us 6 + _rotary_dba7d1e::apply_rotary 1.88% 50.972us 3.57% 96.775us 16.129us 202.176us 28.65% 202.176us 33.696us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 202.176us 28.65% 202.176us 33.696us 6 + Activity Buffer Request 65.43% 1.772ms 65.43% 1.772ms 1.772ms 65.120us 9.23% 65.120us 65.120us 1 + aten::empty_strided 1.15% 31.280us 1.15% 31.280us 5.213us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 7.52% 203.605us 7.52% 203.605us 33.934us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.18% 32.050us 1.51% 41.000us 3.417us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.33% 8.950us 0.33% 8.950us 0.746us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 1.69% 45.803us 1.69% 45.803us 7.634us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 11.97% 324.077us 11.97% 324.077us 324.077us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.208ms -Self CUDA time total: 703.164us +Self CPU time total: 2.708ms +Self CUDA time total: 705.727us @@ -4321,23 +4321,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 317.658us 1193.13% 317.658us 317.658us 1 - hf_kernels_rotary 17.50% 146.563us 99.40% 832.443us 832.443us 0.000us 0.00% 27.968us 27.968us 1 - _rotary_dba7d1e::apply_rotary 4.88% 40.860us 9.91% 82.980us 13.830us 18.751us 70.43% 18.751us 3.125us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 18.751us 70.43% 18.751us 3.125us 6 - aten::clone 2.35% 19.662us 67.40% 564.410us 94.068us 0.000us 0.00% 9.217us 1.536us 6 - aten::copy_ 4.08% 34.181us 61.36% 513.878us 85.646us 7.873us 29.57% 9.217us 1.536us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.873us 29.57% 7.873us 1.312us 6 - Activity Buffer Request 31.10% 260.474us 31.10% 260.474us 260.474us 1.344us 5.05% 1.344us 1.344us 1 - aten::empty_strided 3.69% 30.870us 3.69% 30.870us 5.145us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 26.18% 219.223us 26.18% 219.223us 36.537us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 3.65% 30.540us 4.60% 38.490us 3.208us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.95% 7.950us 0.95% 7.950us 0.663us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 5.03% 42.120us 5.03% 42.120us 7.020us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.60% 5.020us 0.60% 5.020us 5.020us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 336.477us 1265.33% 336.477us 336.477us 1 + hf_kernels_rotary 7.39% 171.995us 99.77% 2.322ms 2.322ms 0.000us 0.00% 27.904us 27.904us 1 + _rotary_dba7d1e::apply_rotary 1.76% 41.061us 3.57% 83.061us 13.844us 18.816us 70.76% 18.816us 3.136us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 18.816us 70.76% 18.816us 3.136us 6 + aten::clone 1.24% 28.803us 87.11% 2.027ms 337.843us 0.000us 0.00% 9.088us 1.515us 6 + aten::copy_ 1.64% 38.082us 84.50% 1.966ms 327.735us 7.776us 29.24% 9.088us 1.515us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.776us 29.24% 7.776us 1.296us 6 + Activity Buffer Request 72.85% 1.695ms 72.85% 1.695ms 1.695ms 1.312us 4.93% 1.312us 1.312us 1 + aten::empty_strided 1.37% 31.849us 1.37% 31.849us 5.308us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 10.01% 232.925us 10.01% 232.925us 38.821us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.33% 30.872us 1.71% 39.700us 3.308us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.38% 8.828us 0.38% 8.828us 0.736us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 1.80% 42.000us 1.80% 42.000us 7.000us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.23% 5.320us 0.23% 5.320us 5.320us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 837.463us -Self CUDA time total: 26.624us +Self CPU time total: 2.327ms +Self CUDA time total: 26.592us @@ -4347,23 +4347,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 308.930us 1157.56% 308.930us 308.930us 1 - hf_kernels_rotary 17.42% 145.421us 99.38% 829.543us 829.543us 0.000us 0.00% 28.000us 28.000us 1 - _rotary_dba7d1e::apply_rotary 5.67% 47.303us 10.60% 88.453us 14.742us 18.944us 70.98% 18.944us 3.157us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 18.944us 70.98% 18.944us 3.157us 6 - aten::clone 2.28% 19.028us 66.78% 557.449us 92.908us 0.000us 0.00% 9.056us 1.509us 6 - aten::copy_ 3.82% 31.872us 61.07% 509.750us 84.958us 7.744us 29.02% 9.056us 1.509us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.744us 29.02% 7.744us 1.291us 6 - Activity Buffer Request 31.42% 262.245us 31.42% 262.245us 262.245us 1.312us 4.92% 1.312us 1.312us 1 - aten::empty_strided 3.43% 28.671us 3.43% 28.671us 4.778us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 25.83% 215.633us 25.83% 215.633us 35.939us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 3.59% 29.990us 4.58% 38.220us 3.185us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.99% 8.230us 0.99% 8.230us 0.686us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 4.93% 41.150us 4.93% 41.150us 6.858us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.62% 5.211us 0.62% 5.211us 5.211us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 361.593us 1356.47% 361.593us 361.593us 1 + hf_kernels_rotary 20.01% 156.574us 99.29% 776.889us 776.889us 0.000us 0.00% 27.969us 27.969us 1 + _rotary_dba7d1e::apply_rotary 6.08% 47.572us 11.90% 93.114us 15.519us 18.881us 70.83% 18.881us 3.147us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 18.881us 70.83% 18.881us 3.147us 6 + aten::clone 2.86% 22.401us 61.73% 483.012us 80.502us 0.000us 0.00% 9.088us 1.515us 6 + aten::copy_ 5.02% 39.248us 54.77% 428.540us 71.423us 7.776us 29.17% 9.088us 1.515us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.776us 29.17% 7.776us 1.296us 6 + Activity Buffer Request 23.43% 183.305us 23.43% 183.305us 183.305us 1.312us 4.92% 1.312us 1.312us 1 + aten::empty_strided 4.10% 32.071us 4.10% 32.071us 5.345us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 26.33% 205.987us 26.33% 205.987us 34.331us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.44% 34.709us 5.65% 44.189us 3.682us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 1.21% 9.480us 1.21% 9.480us 0.790us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 5.82% 45.542us 5.82% 45.542us 7.590us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.71% 5.560us 0.71% 5.560us 5.560us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 834.754us -Self CUDA time total: 26.688us +Self CPU time total: 782.449us +Self CUDA time total: 26.657us @@ -4373,23 +4373,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 326.626us 1069.89% 326.626us 326.626us 1 - hf_kernels_rotary 6.56% 153.551us 99.79% 2.337ms 2.337ms 0.000us 0.00% 32.289us 32.289us 1 - _rotary_dba7d1e::apply_rotary 1.75% 41.061us 3.50% 81.981us 13.664us 20.161us 66.04% 20.161us 3.360us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 20.161us 66.04% 20.161us 3.360us 6 - aten::clone 0.92% 21.442us 88.00% 2.061ms 343.521us 0.000us 0.00% 12.128us 2.021us 6 - aten::copy_ 1.46% 34.178us 85.80% 2.010ms 334.939us 10.368us 33.96% 12.128us 2.021us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.368us 33.96% 10.368us 1.728us 6 - Activity Buffer Request 75.10% 1.759ms 75.10% 1.759ms 1.759ms 1.760us 5.77% 1.760us 1.760us 1 - aten::empty_strided 1.28% 30.051us 1.28% 30.051us 5.009us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 9.25% 216.555us 9.25% 216.555us 36.092us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.37% 32.001us 1.73% 40.601us 3.383us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.37% 8.600us 0.37% 8.600us 0.717us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 1.75% 40.920us 1.75% 40.920us 6.820us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.21% 4.951us 0.21% 4.951us 4.951us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 329.530us 1073.84% 329.530us 329.530us 1 + hf_kernels_rotary 18.69% 150.673us 99.29% 800.610us 800.610us 0.000us 0.00% 32.447us 32.447us 1 + _rotary_dba7d1e::apply_rotary 5.09% 41.061us 10.37% 83.622us 13.937us 20.159us 65.69% 20.159us 3.360us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 20.159us 65.69% 20.159us 3.360us 6 + aten::clone 2.45% 19.762us 65.24% 526.013us 87.669us 0.000us 0.00% 12.288us 2.048us 6 + aten::copy_ 4.31% 34.749us 58.96% 475.401us 79.234us 10.528us 34.31% 12.288us 2.048us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.528us 34.31% 10.528us 1.755us 6 + Activity Buffer Request 29.87% 240.876us 29.87% 240.876us 240.876us 1.760us 5.74% 1.760us 1.760us 1 + aten::empty_strided 3.83% 30.850us 3.83% 30.850us 5.142us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 24.78% 199.776us 24.78% 199.776us 33.296us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.92% 31.582us 5.00% 40.302us 3.358us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 1.08% 8.720us 1.08% 8.720us 0.727us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 5.28% 42.561us 5.28% 42.561us 7.094us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.71% 5.701us 0.71% 5.701us 5.701us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.342ms -Self CUDA time total: 30.529us +Self CPU time total: 806.311us +Self CUDA time total: 30.687us @@ -4399,23 +4399,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 323.551us 758.49% 323.551us 323.551us 1 - hf_kernels_rotary 6.57% 149.229us 99.78% 2.268ms 2.268ms 0.000us 0.00% 45.505us 45.505us 1 - _rotary_dba7d1e::apply_rotary 1.78% 40.461us 3.63% 82.552us 13.759us 25.601us 60.02% 25.601us 4.267us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 25.601us 60.02% 25.601us 4.267us 6 - aten::clone 0.92% 20.920us 87.81% 1.996ms 332.671us 0.000us 0.00% 19.904us 3.317us 6 - aten::copy_ 1.45% 33.001us 85.55% 1.945ms 324.092us 17.056us 39.98% 19.904us 3.317us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.056us 39.98% 17.056us 2.843us 6 - Activity Buffer Request 74.47% 1.693ms 74.47% 1.693ms 1.693ms 2.848us 6.68% 2.848us 2.848us 1 - aten::empty_strided 1.34% 30.551us 1.34% 30.551us 5.092us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 9.62% 218.764us 9.62% 218.764us 36.461us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.40% 31.822us 1.77% 40.192us 3.349us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.37% 8.370us 0.37% 8.370us 0.697us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 1.85% 42.091us 1.85% 42.091us 7.015us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.22% 5.030us 0.22% 5.030us 5.030us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 329.435us 772.31% 329.435us 329.435us 1 + hf_kernels_rotary 20.10% 145.342us 99.21% 717.528us 717.528us 0.000us 0.00% 45.504us 45.504us 1 + _rotary_dba7d1e::apply_rotary 5.72% 41.400us 11.55% 83.552us 13.925us 25.568us 59.94% 25.568us 4.261us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 25.568us 59.94% 25.568us 4.261us 6 + aten::clone 2.77% 20.032us 61.99% 448.302us 74.717us 0.000us 0.00% 19.936us 3.323us 6 + aten::copy_ 4.93% 35.690us 55.04% 398.089us 66.348us 17.088us 40.06% 19.936us 3.323us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.088us 40.06% 17.088us 2.848us 6 + Activity Buffer Request 22.43% 162.214us 22.43% 162.214us 162.214us 2.848us 6.68% 2.848us 2.848us 1 + aten::empty_strided 4.17% 30.181us 4.17% 30.181us 5.030us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 27.68% 200.185us 27.68% 200.185us 33.364us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.37% 31.593us 5.58% 40.332us 3.361us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 1.21% 8.739us 1.21% 8.739us 0.728us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 5.83% 42.152us 5.83% 42.152us 7.025us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.79% 5.700us 0.79% 5.700us 5.700us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.273ms -Self CUDA time total: 42.657us +Self CPU time total: 723.228us +Self CUDA time total: 42.656us @@ -4425,23 +4425,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 313.528us 1024.90% 313.528us 313.528us 1 - hf_kernels_rotary 17.21% 144.859us 99.41% 836.674us 836.674us 0.000us 0.00% 32.319us 32.319us 1 - _rotary_dba7d1e::apply_rotary 4.77% 40.179us 9.63% 81.081us 13.513us 20.224us 66.11% 20.224us 3.371us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 20.224us 66.11% 20.224us 3.371us 6 - aten::clone 2.34% 19.722us 68.09% 573.072us 95.512us 0.000us 0.00% 12.095us 2.016us 6 - aten::copy_ 3.82% 32.139us 62.12% 522.779us 87.130us 10.367us 33.89% 12.095us 2.016us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.367us 33.89% 10.367us 1.728us 6 - Activity Buffer Request 32.59% 274.305us 32.59% 274.305us 274.305us 1.728us 5.65% 1.728us 1.728us 1 - aten::empty_strided 3.63% 30.571us 3.63% 30.571us 5.095us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 25.70% 216.335us 25.70% 216.335us 36.056us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 3.51% 29.563us 4.47% 37.662us 3.139us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.96% 8.099us 0.96% 8.099us 0.675us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 4.86% 40.902us 4.86% 40.902us 6.817us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.59% 4.950us 0.59% 4.950us 4.950us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 330.651us 1075.32% 330.651us 330.651us 1 + hf_kernels_rotary 18.72% 150.931us 99.33% 800.790us 800.790us 0.000us 0.00% 32.508us 32.508us 1 + _rotary_dba7d1e::apply_rotary 5.21% 41.984us 10.24% 82.545us 13.758us 20.318us 66.08% 20.318us 3.386us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 20.318us 66.08% 20.318us 3.386us 6 + aten::clone 2.51% 20.253us 65.40% 527.224us 87.871us 0.000us 0.00% 12.190us 2.032us 6 + aten::copy_ 4.39% 35.371us 59.06% 476.161us 79.360us 10.431us 33.92% 12.190us 2.032us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.431us 33.92% 10.431us 1.738us 6 + Activity Buffer Request 30.20% 243.496us 30.20% 243.496us 243.496us 1.759us 5.72% 1.759us 1.759us 1 + aten::empty_strided 3.82% 30.810us 3.82% 30.810us 5.135us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 24.47% 197.294us 24.47% 197.294us 32.882us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.87% 31.220us 4.97% 40.090us 3.341us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 1.10% 8.870us 1.10% 8.870us 0.739us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 5.03% 40.561us 5.03% 40.561us 6.760us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.67% 5.380us 0.67% 5.380us 5.380us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 841.624us -Self CUDA time total: 30.591us +Self CPU time total: 806.170us +Self CUDA time total: 30.749us @@ -4451,23 +4451,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 315.863us 736.07% 315.863us 315.863us 1 - hf_kernels_rotary 17.12% 141.480us 99.37% 821.243us 821.243us 0.000us 0.00% 45.760us 45.760us 1 - _rotary_dba7d1e::apply_rotary 4.88% 40.322us 9.94% 82.183us 13.697us 25.824us 60.18% 25.824us 4.304us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 25.824us 60.18% 25.824us 4.304us 6 - aten::clone 2.37% 19.588us 67.68% 559.279us 93.213us 0.000us 0.00% 19.936us 3.323us 6 - aten::copy_ 4.15% 34.293us 61.65% 509.500us 84.917us 17.088us 39.82% 19.936us 3.323us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.088us 39.82% 17.088us 2.848us 6 - Activity Buffer Request 31.27% 258.454us 31.27% 258.454us 258.454us 2.848us 6.64% 2.848us 2.848us 1 - aten::empty_strided 3.65% 30.191us 3.65% 30.191us 5.032us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 26.23% 216.753us 26.23% 216.753us 36.126us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 3.65% 30.130us 4.63% 38.301us 3.192us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.99% 8.171us 0.99% 8.171us 0.681us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 5.07% 41.861us 5.07% 41.861us 6.977us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.63% 5.170us 0.63% 5.170us 5.170us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 330.616us 770.43% 330.616us 330.616us 1 + hf_kernels_rotary 17.99% 148.834us 99.35% 822.020us 822.020us 0.000us 0.00% 45.761us 45.761us 1 + _rotary_dba7d1e::apply_rotary 5.00% 41.361us 10.40% 86.012us 14.335us 25.857us 60.25% 25.857us 4.310us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 25.857us 60.25% 25.857us 4.310us 6 + aten::clone 2.40% 19.879us 66.22% 547.932us 91.322us 0.000us 0.00% 19.904us 3.317us 6 + aten::copy_ 4.29% 35.520us 60.10% 497.262us 82.877us 17.056us 39.75% 19.904us 3.317us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.056us 39.75% 17.056us 2.843us 6 + Activity Buffer Request 32.00% 264.767us 32.00% 264.767us 264.767us 2.848us 6.64% 2.848us 2.848us 1 + aten::empty_strided 3.72% 30.791us 3.72% 30.791us 5.132us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 23.81% 196.975us 23.81% 196.975us 32.829us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.76% 31.140us 4.74% 39.242us 3.270us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.98% 8.102us 0.98% 8.102us 0.675us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 5.40% 44.651us 5.40% 44.651us 7.442us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.65% 5.369us 0.65% 5.369us 5.369us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 826.413us -Self CUDA time total: 42.912us +Self CPU time total: 827.389us +Self CUDA time total: 42.913us @@ -4477,23 +4477,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 311.420us 333.98% 311.420us 311.420us 1 - hf_kernels_rotary 17.30% 140.163us 99.43% 805.353us 805.353us 0.000us 0.00% 108.190us 108.190us 1 - aten::clone 2.32% 18.760us 67.64% 547.879us 91.313us 0.000us 0.00% 66.399us 11.066us 6 - aten::copy_ 4.25% 34.400us 61.74% 500.059us 83.343us 51.455us 55.18% 66.399us 11.066us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 51.455us 55.18% 51.455us 8.576us 6 - _rotary_dba7d1e::apply_rotary 4.76% 38.581us 9.84% 79.710us 13.285us 41.791us 44.82% 41.791us 6.965us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 41.791us 44.82% 41.791us 6.965us 6 - Activity Buffer Request 31.09% 251.814us 31.09% 251.814us 251.814us 14.944us 16.03% 14.944us 14.944us 1 - aten::empty_strided 3.59% 29.060us 3.59% 29.060us 4.843us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 26.40% 213.845us 26.40% 213.845us 35.641us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 3.70% 29.950us 4.64% 37.601us 3.133us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.94% 7.651us 0.94% 7.651us 0.638us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 5.08% 41.129us 5.08% 41.129us 6.855us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.57% 4.640us 0.57% 4.640us 4.640us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 334.653us 358.15% 334.653us 334.653us 1 + hf_kernels_rotary 18.09% 151.114us 99.35% 829.780us 829.780us 0.000us 0.00% 109.215us 109.215us 1 + aten::clone 3.33% 27.839us 66.37% 554.323us 92.387us 0.000us 0.00% 68.064us 11.344us 6 + aten::copy_ 4.18% 34.911us 59.27% 495.081us 82.513us 52.288us 55.96% 68.064us 11.344us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 52.288us 55.96% 52.288us 8.715us 6 + _rotary_dba7d1e::apply_rotary 4.95% 41.342us 9.97% 83.303us 13.884us 41.151us 44.04% 41.151us 6.858us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 41.151us 44.04% 41.151us 6.858us 6 + Activity Buffer Request 31.64% 264.256us 31.64% 264.256us 264.256us 15.776us 16.88% 15.776us 15.776us 1 + aten::empty_strided 3.76% 31.403us 3.76% 31.403us 5.234us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 23.46% 195.914us 23.46% 195.914us 32.652us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.87% 32.310us 4.91% 41.040us 3.420us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 1.05% 8.730us 1.05% 8.730us 0.728us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 5.02% 41.961us 5.02% 41.961us 6.994us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.65% 5.470us 0.65% 5.470us 5.470us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 809.993us -Self CUDA time total: 93.246us +Self CPU time total: 835.250us +Self CUDA time total: 93.439us @@ -4503,23 +4503,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 314.038us 216.45% 314.038us 314.038us 1 - hf_kernels_rotary 17.88% 139.471us 99.38% 775.432us 775.432us 0.000us 0.00% 168.795us 168.795us 1 - aten::clone 2.37% 18.470us 66.41% 518.157us 86.360us 0.000us 0.00% 104.989us 17.498us 6 - aten::copy_ 4.24% 33.100us 60.11% 469.037us 78.173us 81.277us 56.02% 104.989us 17.498us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 81.277us 56.02% 81.277us 13.546us 6 - _rotary_dba7d1e::apply_rotary 5.14% 40.111us 10.39% 81.071us 13.512us 63.806us 43.98% 63.806us 10.634us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 63.806us 43.98% 63.806us 10.634us 6 - Activity Buffer Request 28.68% 223.784us 28.68% 223.784us 223.784us 23.712us 16.34% 23.712us 23.712us 1 - aten::empty_strided 3.93% 30.650us 3.93% 30.650us 5.108us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 27.19% 212.153us 27.19% 212.153us 35.359us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 3.73% 29.133us 4.71% 36.733us 3.061us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.97% 7.600us 0.97% 7.600us 0.633us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 5.25% 40.960us 5.25% 40.960us 6.827us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.62% 4.811us 0.62% 4.811us 4.811us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 402.293us 278.14% 402.293us 402.293us 1 + hf_kernels_rotary 18.23% 162.247us 99.42% 884.932us 884.932us 0.000us 0.00% 168.347us 168.347us 1 + aten::clone 2.56% 22.809us 61.35% 546.082us 91.014us 0.000us 0.00% 105.212us 17.535us 6 + aten::copy_ 3.90% 34.711us 55.16% 490.942us 81.824us 81.501us 56.35% 105.212us 17.535us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 81.501us 56.35% 81.501us 13.584us 6 + _rotary_dba7d1e::apply_rotary 4.99% 44.421us 15.06% 134.003us 22.334us 63.135us 43.65% 63.135us 10.522us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 63.135us 43.65% 63.135us 10.522us 6 + Activity Buffer Request 29.54% 262.907us 29.54% 262.907us 262.907us 23.711us 16.39% 23.711us 23.711us 1 + aten::empty_strided 3.63% 32.331us 3.63% 32.331us 5.388us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 21.72% 193.324us 21.72% 193.324us 32.221us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.75% 33.378us 4.79% 42.600us 3.550us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 1.04% 9.222us 1.04% 9.222us 0.769us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 10.06% 89.582us 10.06% 89.582us 14.930us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.58% 5.120us 0.58% 5.120us 5.120us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 780.243us -Self CUDA time total: 145.083us +Self CPU time total: 890.052us +Self CUDA time total: 144.636us @@ -4529,23 +4529,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 319.458us 399.31% 319.458us 319.458us 1 - hf_kernels_rotary 17.47% 143.453us 99.38% 816.213us 816.213us 0.000us 0.00% 90.147us 90.147us 1 - aten::clone 2.34% 19.239us 67.36% 553.208us 92.201us 0.000us 0.00% 47.811us 7.969us 6 - aten::copy_ 3.98% 32.663us 61.19% 502.549us 83.758us 37.666us 47.08% 47.811us 7.969us 6 - _rotary_dba7d1e::apply_rotary 4.95% 40.641us 9.85% 80.901us 13.484us 42.336us 52.92% 42.336us 7.056us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 42.336us 52.92% 42.336us 7.056us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 37.666us 47.08% 37.666us 6.278us 6 - Activity Buffer Request 31.07% 255.204us 31.07% 255.204us 255.204us 10.145us 12.68% 10.145us 10.145us 1 - aten::empty_strided 3.83% 31.420us 3.83% 31.420us 5.237us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 26.14% 214.682us 26.14% 214.682us 35.780us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 3.75% 30.830us 4.71% 38.651us 3.221us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.95% 7.821us 0.95% 7.821us 0.652us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 4.90% 40.260us 4.90% 40.260us 6.710us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.62% 5.090us 0.62% 5.090us 5.090us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 335.030us 423.20% 335.030us 335.030us 1 + hf_kernels_rotary 17.60% 149.462us 99.35% 843.781us 843.781us 0.000us 0.00% 89.342us 89.342us 1 + aten::clone 2.76% 23.471us 67.14% 570.174us 95.029us 0.000us 0.00% 47.328us 7.888us 6 + aten::copy_ 4.78% 40.570us 60.71% 515.622us 85.937us 37.152us 46.93% 47.328us 7.888us 6 + _rotary_dba7d1e::apply_rotary 4.86% 41.293us 9.90% 84.043us 14.007us 42.014us 53.07% 42.014us 7.002us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 42.014us 53.07% 42.014us 7.002us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 37.152us 46.93% 37.152us 6.192us 6 + Activity Buffer Request 33.04% 280.637us 33.04% 280.637us 280.637us 10.176us 12.85% 10.176us 10.176us 1 + aten::empty_strided 3.66% 31.081us 3.66% 31.081us 5.180us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 22.89% 194.415us 22.89% 194.415us 32.403us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.65% 30.971us 4.72% 40.102us 3.342us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 1.08% 9.131us 1.08% 9.131us 0.761us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 5.03% 42.750us 5.03% 42.750us 7.125us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.65% 5.500us 0.65% 5.500us 5.500us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 821.303us -Self CUDA time total: 80.002us +Self CPU time total: 849.281us +Self CUDA time total: 79.166us @@ -4555,23 +4555,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 325.402us 221.65% 325.402us 325.402us 1 - hf_kernels_rotary 17.64% 145.390us 99.37% 819.093us 819.093us 0.000us 0.00% 170.492us 170.492us 1 - aten::clone 2.33% 19.171us 66.92% 551.610us 91.935us 0.000us 0.00% 105.757us 17.626us 6 - aten::copy_ 4.29% 35.392us 61.01% 502.899us 83.816us 82.077us 55.91% 105.757us 17.626us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 82.077us 55.91% 82.077us 13.679us 6 - _rotary_dba7d1e::apply_rotary 4.98% 41.061us 10.11% 83.363us 13.894us 64.735us 44.09% 64.735us 10.789us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 64.735us 44.09% 64.735us 10.789us 6 - Activity Buffer Request 31.05% 255.945us 31.05% 255.945us 255.945us 23.680us 16.13% 23.680us 23.680us 1 - aten::empty_strided 3.58% 29.540us 3.58% 29.540us 4.923us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 25.67% 211.562us 25.67% 211.562us 35.260us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 3.74% 30.830us 4.70% 38.730us 3.228us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.96% 7.900us 0.96% 7.900us 0.658us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 5.13% 42.302us 5.13% 42.302us 7.050us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.63% 5.170us 0.63% 5.170us 5.170us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 335.451us 231.31% 335.451us 335.451us 1 + hf_kernels_rotary 20.10% 149.473us 99.28% 738.378us 738.378us 0.000us 0.00% 168.637us 168.637us 1 + aten::clone 2.71% 20.190us 62.55% 465.160us 77.527us 0.000us 0.00% 104.892us 17.482us 6 + aten::copy_ 4.80% 35.671us 55.64% 413.780us 68.963us 81.277us 56.04% 104.892us 17.482us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 81.277us 56.04% 81.277us 13.546us 6 + _rotary_dba7d1e::apply_rotary 5.48% 40.762us 11.33% 84.273us 14.046us 63.745us 43.96% 63.745us 10.624us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 63.745us 43.96% 63.745us 10.624us 6 + Activity Buffer Request 25.22% 187.575us 25.22% 187.575us 187.575us 23.615us 16.28% 23.615us 23.615us 1 + aten::empty_strided 4.19% 31.190us 4.19% 31.190us 5.198us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 25.62% 190.534us 25.62% 190.534us 31.756us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.11% 30.580us 5.31% 39.472us 3.289us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 1.20% 8.892us 1.20% 8.892us 0.741us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 5.85% 43.511us 5.85% 43.511us 7.252us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.72% 5.340us 0.72% 5.340us 5.340us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 824.263us -Self CUDA time total: 146.812us +Self CPU time total: 743.718us +Self CUDA time total: 145.022us @@ -4581,23 +4581,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 12.27% 141.491us 70.82% 816.704us 816.704us 0.000us 0.00% 741.813us 741.813us 1 - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 683.383us 101.18% 683.383us 683.383us 1 - aten::clone 1.70% 19.590us 48.29% 556.839us 92.806us 0.000us 0.00% 558.329us 93.055us 6 - aten::copy_ 3.05% 35.121us 43.99% 507.258us 84.543us 491.962us 72.84% 558.329us 93.055us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 491.962us 72.84% 491.962us 81.994us 6 - _rotary_dba7d1e::apply_rotary 3.41% 39.351us 6.95% 80.182us 13.364us 183.484us 27.16% 183.484us 30.581us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 183.484us 27.16% 183.484us 30.581us 6 - Activity Buffer Request 22.79% 262.814us 22.79% 262.814us 262.814us 66.367us 9.83% 66.367us 66.367us 1 - aten::empty_strided 2.60% 29.991us 2.60% 29.991us 4.998us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 18.15% 209.323us 18.15% 209.323us 34.887us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.59% 29.922us 3.31% 38.192us 3.183us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.72% 8.270us 0.72% 8.270us 0.689us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 3.54% 40.831us 3.54% 40.831us 6.805us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 29.18% 336.495us 29.18% 336.495us 336.495us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 13.47% 148.469us 71.31% 785.819us 785.819us 0.000us 0.00% 741.458us 741.458us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 683.092us 101.20% 683.092us 683.092us 1 + aten::clone 1.91% 21.000us 46.53% 512.672us 85.445us 0.000us 0.00% 558.390us 93.065us 6 + aten::copy_ 3.10% 34.171us 41.83% 460.962us 76.827us 491.927us 72.88% 558.390us 93.065us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 491.927us 72.88% 491.927us 81.988us 6 + _rotary_dba7d1e::apply_rotary 3.81% 41.992us 7.74% 85.293us 14.215us 183.068us 27.12% 183.068us 30.511us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 183.068us 27.12% 183.068us 30.511us 6 + Activity Buffer Request 21.20% 233.636us 21.20% 233.636us 233.636us 66.463us 9.85% 66.463us 66.463us 1 + aten::empty_strided 2.79% 30.710us 2.79% 30.710us 5.118us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 17.53% 193.155us 17.53% 193.155us 32.192us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.79% 30.724us 3.57% 39.385us 3.282us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.79% 8.661us 0.79% 8.661us 0.722us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 3.93% 43.301us 3.93% 43.301us 7.217us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 28.69% 316.097us 28.69% 316.097us 316.097us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.153ms -Self CUDA time total: 675.446us +Self CPU time total: 1.102ms +Self CUDA time total: 674.995us @@ -4607,23 +4607,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 3.40% 150.944us 53.25% 2.364ms 2.364ms 0.000us 0.00% 2.619ms 2.619ms 1 - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 2.449ms 100.31% 2.449ms 2.449ms 1 - aten::clone 0.54% 23.821us 47.05% 2.089ms 348.124us 0.000us 0.00% 1.393ms 232.202us 6 - aten::copy_ 0.77% 34.330us 45.79% 2.033ms 338.776us 1.215ms 49.78% 1.393ms 232.202us 6 - _rotary_dba7d1e::apply_rotary 0.95% 42.082us 1.92% 85.331us 14.222us 1.226ms 50.22% 1.226ms 204.346us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 1.226ms 50.22% 1.226ms 204.346us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 1.215ms 49.78% 1.215ms 202.575us 6 - Activity Buffer Request 40.08% 1.779ms 40.08% 1.779ms 1.779ms 177.759us 7.28% 177.759us 177.759us 1 - aten::empty_strided 0.73% 32.270us 0.73% 32.270us 5.378us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 4.93% 218.903us 4.93% 218.903us 36.484us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 0.70% 31.129us 0.88% 39.000us 3.250us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.18% 7.871us 0.18% 7.871us 0.656us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 0.97% 43.249us 0.97% 43.249us 7.208us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 46.75% 2.075ms 46.75% 2.075ms 2.075ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 5.24% 147.368us 25.60% 720.668us 720.668us 0.000us 0.00% 2.633ms 2.633ms 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 2.458ms 100.33% 2.458ms 2.458ms 1 + aten::clone 0.73% 20.601us 15.98% 449.780us 74.963us 0.000us 0.00% 1.394ms 232.377us 6 + aten::copy_ 1.20% 33.732us 14.16% 398.690us 66.448us 1.211ms 49.43% 1.394ms 232.377us 6 + _rotary_dba7d1e::apply_rotary 1.46% 41.000us 3.02% 85.091us 14.182us 1.239ms 50.57% 1.239ms 206.500us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 1.239ms 50.57% 1.239ms 206.500us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 1.211ms 49.43% 1.211ms 201.812us 6 + Activity Buffer Request 6.06% 170.604us 6.06% 170.604us 170.604us 183.391us 7.49% 183.391us 183.391us 1 + aten::empty_strided 1.08% 30.489us 1.08% 30.489us 5.081us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 6.90% 194.354us 6.90% 194.354us 32.392us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.07% 30.231us 1.37% 38.429us 3.202us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.29% 8.198us 0.29% 8.198us 0.683us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 1.57% 44.091us 1.57% 44.091us 7.349us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 74.40% 2.094ms 74.40% 2.094ms 2.094ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 4.439ms -Self CUDA time total: 2.442ms +Self CPU time total: 2.815ms +Self CUDA time total: 2.450ms impl wl p50(ms) ok @@ -4655,62 +4655,13 @@ hf_kernels_rotary cuda_B2_S512_H8_D64_R32 0.09 True-▶ UV Install LogsFetching 5 files: 0%| | 0/5 [00:00<?, ?it/s] -Fetching 5 files: 40%|████ | 2/5 [00:00<00:00, 19.18it/s]Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads. +Fetching 5 files: 0%| | 0/5 [00:00<?, ?it/s]Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads. -Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 9.18it/s] -Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 9.78it/s]+Fetching 5 files: 40%|████ | 2/5 [00:01<00:02, 1.08it/s] +Fetching 5 files: 100%|██████████| 5/5 [00:01<00:00, 2.71it/s]Artifacts:
rotary.jsonl